aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--CHANGELOG14
-rw-r--r--MANIFEST.in1
-rw-r--r--Makefile11
-rw-r--r--README.md128
-rw-r--r--README.txt146
-rwxr-xr-xdevscripts/make_readme.py2
-rw-r--r--devscripts/prepare_manpage.py20
-rwxr-xr-xdevscripts/release.sh20
-rw-r--r--docs/.gitignore1
-rw-r--r--docs/Makefile177
-rw-r--r--docs/conf.py71
-rw-r--r--docs/index.rst23
-rw-r--r--docs/module_guide.rst67
-rw-r--r--test/helper.py75
-rw-r--r--test/test_InfoExtractor.py44
-rw-r--r--test/test_YoutubeDL.py94
-rw-r--r--test/test_age_restriction.py2
-rw-r--r--test/test_all_urls.py68
-rw-r--r--test/test_download.py36
-rw-r--r--test/test_playlists.py155
-rw-r--r--test/test_subtitles.py2
-rw-r--r--test/test_utils.py34
-rw-r--r--test/test_youtube_lists.py16
-rwxr-xr-xyoutube-dlbin380523 -> 482247 bytes
-rw-r--r--youtube-dl.1155
-rw-r--r--youtube-dl.bash-completion2
-rwxr-xr-xyoutube_dl/InfoExtractors.py4
-rwxr-xr-x[-rw-r--r--]youtube_dl/YoutubeDL.py252
-rw-r--r--youtube_dl/__init__.py90
-rw-r--r--youtube_dl/downloader/common.py5
-rw-r--r--youtube_dl/downloader/f4m.py12
-rw-r--r--youtube_dl/downloader/hls.py6
-rw-r--r--youtube_dl/downloader/http.py29
-rw-r--r--youtube_dl/downloader/rtmp.py58
-rw-r--r--youtube_dl/extractor/__init__.py124
-rw-r--r--youtube_dl/extractor/academicearth.py8
-rw-r--r--youtube_dl/extractor/addanime.py31
-rw-r--r--youtube_dl/extractor/aftonbladet.py66
-rw-r--r--youtube_dl/extractor/aol.py65
-rw-r--r--youtube_dl/extractor/appletrailers.py29
-rw-r--r--youtube_dl/extractor/ard.py33
-rw-r--r--youtube_dl/extractor/arte.py192
-rw-r--r--youtube_dl/extractor/auengine.py21
-rw-r--r--youtube_dl/extractor/bandcamp.py46
-rw-r--r--youtube_dl/extractor/bbccouk.py18
-rw-r--r--youtube_dl/extractor/bilibili.py106
-rw-r--r--youtube_dl/extractor/blinkx.py30
-rw-r--r--youtube_dl/extractor/bliptv.py186
-rw-r--r--youtube_dl/extractor/bloomberg.py36
-rw-r--r--youtube_dl/extractor/br.py139
-rw-r--r--youtube_dl/extractor/breakcom.py11
-rw-r--r--youtube_dl/extractor/brightcove.py8
-rw-r--r--youtube_dl/extractor/byutv.py48
-rw-r--r--youtube_dl/extractor/c56.py29
-rw-r--r--youtube_dl/extractor/canal13cl.py48
-rw-r--r--youtube_dl/extractor/canalc2.py28
-rw-r--r--youtube_dl/extractor/canalplus.py83
-rw-r--r--youtube_dl/extractor/cbsnews.py87
-rw-r--r--youtube_dl/extractor/ceskatelevize.py126
-rw-r--r--youtube_dl/extractor/cinemassacre.py128
-rw-r--r--youtube_dl/extractor/clipfish.py37
-rw-r--r--youtube_dl/extractor/clipsyndicate.py23
-rw-r--r--youtube_dl/extractor/clubic.py58
-rw-r--r--youtube_dl/extractor/cmt.py24
-rw-r--r--youtube_dl/extractor/cnet.py75
-rw-r--r--youtube_dl/extractor/cnn.py10
-rw-r--r--youtube_dl/extractor/collegehumor.py21
-rw-r--r--youtube_dl/extractor/comedycentral.py143
-rw-r--r--youtube_dl/extractor/common.py75
-rw-r--r--youtube_dl/extractor/condenast.py30
-rw-r--r--youtube_dl/extractor/crunchyroll.py32
-rw-r--r--youtube_dl/extractor/cspan.py49
-rw-r--r--youtube_dl/extractor/dailymotion.py34
-rw-r--r--youtube_dl/extractor/daum.py28
-rw-r--r--youtube_dl/extractor/depositfiles.py60
-rw-r--r--youtube_dl/extractor/discovery.py5
-rw-r--r--youtube_dl/extractor/divxstage.py27
-rw-r--r--youtube_dl/extractor/ehow.py43
-rw-r--r--youtube_dl/extractor/empflix.py54
-rw-r--r--youtube_dl/extractor/engadget.py43
-rw-r--r--youtube_dl/extractor/extremetube.py40
-rw-r--r--youtube_dl/extractor/facebook.py79
-rw-r--r--youtube_dl/extractor/fc2.py60
-rw-r--r--youtube_dl/extractor/firstpost.py29
-rw-r--r--youtube_dl/extractor/fivemin.py88
-rw-r--r--youtube_dl/extractor/fourtube.py2
-rw-r--r--youtube_dl/extractor/franceculture.py77
-rw-r--r--youtube_dl/extractor/francetv.py22
-rw-r--r--youtube_dl/extractor/funnyordie.py63
-rw-r--r--youtube_dl/extractor/gamekings.py13
-rw-r--r--youtube_dl/extractor/gamespot.py11
-rw-r--r--youtube_dl/extractor/gdcvault.py134
-rw-r--r--youtube_dl/extractor/generic.py426
-rw-r--r--youtube_dl/extractor/googlesearch.py2
-rw-r--r--youtube_dl/extractor/hentaistigma.py42
-rw-r--r--youtube_dl/extractor/huffpost.py3
-rw-r--r--youtube_dl/extractor/iconosquare.py (renamed from youtube_dl/extractor/statigram.py)8
-rw-r--r--youtube_dl/extractor/ign.py109
-rw-r--r--youtube_dl/extractor/infoq.py54
-rw-r--r--youtube_dl/extractor/instagram.py68
-rw-r--r--youtube_dl/extractor/iprima.py52
-rw-r--r--youtube_dl/extractor/ivi.py12
-rw-r--r--youtube_dl/extractor/jukebox.py75
-rw-r--r--youtube_dl/extractor/justintv.py83
-rw-r--r--youtube_dl/extractor/keezmovies.py26
-rw-r--r--youtube_dl/extractor/kickstarter.py46
-rw-r--r--youtube_dl/extractor/kontrtube.py24
-rw-r--r--youtube_dl/extractor/ku6.py35
-rw-r--r--youtube_dl/extractor/lifenews.py41
-rw-r--r--youtube_dl/extractor/liveleak.py55
-rw-r--r--youtube_dl/extractor/lynda.py47
-rw-r--r--youtube_dl/extractor/mailru.py86
-rw-r--r--youtube_dl/extractor/mdr.py22
-rw-r--r--youtube_dl/extractor/metacafe.py164
-rw-r--r--youtube_dl/extractor/metacritic.py3
-rw-r--r--youtube_dl/extractor/mit.py135
-rw-r--r--youtube_dl/extractor/mixcloud.py60
-rw-r--r--youtube_dl/extractor/mooshare.py2
-rw-r--r--youtube_dl/extractor/morningstar.py47
-rw-r--r--youtube_dl/extractor/motorsport.py63
-rw-r--r--youtube_dl/extractor/moviezine.py45
-rw-r--r--youtube_dl/extractor/movshare.py27
-rw-r--r--youtube_dl/extractor/mpora.py6
-rw-r--r--youtube_dl/extractor/mtv.py40
-rw-r--r--youtube_dl/extractor/musicplayon.py75
-rw-r--r--youtube_dl/extractor/myvideo.py101
-rw-r--r--youtube_dl/extractor/naver.py34
-rw-r--r--youtube_dl/extractor/nba.py8
-rw-r--r--youtube_dl/extractor/nbc.py91
-rw-r--r--youtube_dl/extractor/ndr.py31
-rw-r--r--youtube_dl/extractor/newstube.py87
-rw-r--r--youtube_dl/extractor/nfb.py18
-rw-r--r--youtube_dl/extractor/niconico.py81
-rw-r--r--youtube_dl/extractor/ninegag.py61
-rw-r--r--youtube_dl/extractor/noco.py106
-rw-r--r--youtube_dl/extractor/normalboots.py68
-rw-r--r--youtube_dl/extractor/novamov.py48
-rw-r--r--youtube_dl/extractor/nowness.py9
-rw-r--r--youtube_dl/extractor/nowvideo.py60
-rw-r--r--youtube_dl/extractor/nrk.py145
-rw-r--r--youtube_dl/extractor/ntv.py149
-rw-r--r--youtube_dl/extractor/nuvid.py48
-rw-r--r--youtube_dl/extractor/nytimes.py77
-rw-r--r--youtube_dl/extractor/oe1.py40
-rw-r--r--youtube_dl/extractor/ooyala.py49
-rw-r--r--youtube_dl/extractor/orf.py11
-rw-r--r--youtube_dl/extractor/parliamentliveuk.py53
-rw-r--r--youtube_dl/extractor/pbs.py11
-rw-r--r--youtube_dl/extractor/photobucket.py81
-rw-r--r--youtube_dl/extractor/playvid.py80
-rw-r--r--youtube_dl/extractor/podomatic.py47
-rw-r--r--youtube_dl/extractor/pornhd.py77
-rw-r--r--youtube_dl/extractor/pornhub.py23
-rw-r--r--youtube_dl/extractor/prosiebensat1.py286
-rw-r--r--youtube_dl/extractor/pyvideo.py62
-rw-r--r--youtube_dl/extractor/radiofrance.py34
-rw-r--r--youtube_dl/extractor/ro220.py2
-rw-r--r--youtube_dl/extractor/roxwel.py52
-rw-r--r--youtube_dl/extractor/rtbf.py49
-rw-r--r--youtube_dl/extractor/rtlnow.py251
-rw-r--r--youtube_dl/extractor/rts.py154
-rw-r--r--youtube_dl/extractor/rtve.py84
-rw-r--r--youtube_dl/extractor/rutube.py33
-rw-r--r--youtube_dl/extractor/rutv.py194
-rw-r--r--youtube_dl/extractor/savefrom.py37
-rw-r--r--youtube_dl/extractor/scivee.py56
-rw-r--r--youtube_dl/extractor/slashdot.py24
-rw-r--r--youtube_dl/extractor/slideshare.py3
-rw-r--r--youtube_dl/extractor/slutload.py47
-rw-r--r--youtube_dl/extractor/smotri.py51
-rw-r--r--youtube_dl/extractor/soundcloud.py153
-rw-r--r--youtube_dl/extractor/space.py18
-rw-r--r--youtube_dl/extractor/spankwire.py65
-rw-r--r--youtube_dl/extractor/spiegeltv.py81
-rw-r--r--youtube_dl/extractor/spike.py15
-rw-r--r--youtube_dl/extractor/steam.py158
-rw-r--r--youtube_dl/extractor/streamcz.py22
-rw-r--r--youtube_dl/extractor/swrmediathek.py104
-rw-r--r--youtube_dl/extractor/syfy.py28
-rw-r--r--youtube_dl/extractor/tagesschau.py79
-rw-r--r--youtube_dl/extractor/teachertube.py93
-rw-r--r--youtube_dl/extractor/teachingchannel.py33
-rw-r--r--youtube_dl/extractor/teamcoco.py39
-rw-r--r--youtube_dl/extractor/ted.py224
-rw-r--r--youtube_dl/extractor/testurl.py68
-rw-r--r--youtube_dl/extractor/tf1.py40
-rw-r--r--youtube_dl/extractor/theplatform.py41
-rw-r--r--youtube_dl/extractor/tinypic.py2
-rw-r--r--youtube_dl/extractor/tlc.py60
-rw-r--r--youtube_dl/extractor/toypics.py75
-rw-r--r--youtube_dl/extractor/trutube.py44
-rw-r--r--youtube_dl/extractor/tube8.py92
-rw-r--r--youtube_dl/extractor/tvigle.py84
-rw-r--r--youtube_dl/extractor/udemy.py164
-rw-r--r--youtube_dl/extractor/urort.py61
-rw-r--r--youtube_dl/extractor/ustream.py22
-rw-r--r--youtube_dl/extractor/veoh.py124
-rw-r--r--youtube_dl/extractor/vesti.py131
-rw-r--r--youtube_dl/extractor/vevo.py77
-rw-r--r--youtube_dl/extractor/vh1.py124
-rw-r--r--youtube_dl/extractor/vice.py38
-rw-r--r--youtube_dl/extractor/videobam.py81
-rw-r--r--youtube_dl/extractor/videodetective.py22
-rw-r--r--youtube_dl/extractor/videolecturesnet.py70
-rw-r--r--youtube_dl/extractor/videott.py58
-rw-r--r--youtube_dl/extractor/videoweed.py26
-rw-r--r--youtube_dl/extractor/viki.py45
-rw-r--r--youtube_dl/extractor/vimeo.py150
-rw-r--r--youtube_dl/extractor/vine.py74
-rw-r--r--youtube_dl/extractor/vk.py25
-rw-r--r--youtube_dl/extractor/vube.py88
-rw-r--r--youtube_dl/extractor/vuclip.py66
-rw-r--r--youtube_dl/extractor/washingtonpost.py103
-rw-r--r--youtube_dl/extractor/wat.py52
-rw-r--r--youtube_dl/extractor/wdr.py224
-rw-r--r--youtube_dl/extractor/weibo.py30
-rw-r--r--youtube_dl/extractor/wimp.py38
-rw-r--r--youtube_dl/extractor/worldstarhiphop.py45
-rw-r--r--youtube_dl/extractor/xbef.py50
-rw-r--r--youtube_dl/extractor/xhamster.py136
-rw-r--r--youtube_dl/extractor/xnxx.py46
-rw-r--r--youtube_dl/extractor/xtube.py102
-rw-r--r--youtube_dl/extractor/xvideos.py17
-rw-r--r--youtube_dl/extractor/yahoo.py90
-rw-r--r--youtube_dl/extractor/youporn.py32
-rw-r--r--youtube_dl/extractor/youtube.py399
-rw-r--r--youtube_dl/extractor/zdf.py70
-rw-r--r--youtube_dl/jsinterp.py116
-rw-r--r--youtube_dl/postprocessor/__init__.py4
-rw-r--r--youtube_dl/postprocessor/atomicparsley.py56
-rw-r--r--youtube_dl/postprocessor/ffmpeg.py25
-rw-r--r--youtube_dl/postprocessor/xattrpp.py3
-rw-r--r--youtube_dl/utils.py266
-rw-r--r--youtube_dl/version.py2
234 files changed, 11716 insertions, 3385 deletions
diff --git a/CHANGELOG b/CHANGELOG
deleted file mode 100644
index 3fa1167..0000000
--- a/CHANGELOG
+++ /dev/null
@@ -1,14 +0,0 @@
-2013.01.02 Codename: GIULIA
-
- * Add support for ComedyCentral clips <nto>
- * Corrected Vimeo description fetching <Nick Daniels>
- * Added the --no-post-overwrites argument <Barbu Paul - Gheorghe>
- * --verbose offers more environment info
- * New info_dict field: uploader_id
- * New updates system, with signature checking
- * New IEs: NBA, JustinTV, FunnyOrDie, TweetReel, Steam, Ustream
- * Fixed IEs: BlipTv
- * Fixed for Python 3 IEs: Xvideo, Youku, XNXX, Dailymotion, Vimeo, InfoQ
- * Simplified IEs and test code
- * Various (Python 3 and other) fixes
- * Revamped and expanded tests
diff --git a/MANIFEST.in b/MANIFEST.in
index 8f8af7a..d43cc1f 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -3,3 +3,4 @@ include test/*.py
include test/*.json
include youtube-dl.bash-completion
include youtube-dl.1
+recursive-include docs Makefile conf.py *.rst
diff --git a/Makefile b/Makefile
index c6d0993..c079761 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
all: youtube-dl README.md README.txt youtube-dl.1 youtube-dl.bash-completion
clean:
- rm -rf youtube-dl.1 youtube-dl.bash-completion README.txt MANIFEST build/ dist/ .coverage cover/ youtube-dl.tar.gz
+ rm -rf youtube-dl.1.temp.md youtube-dl.1 youtube-dl.bash-completion README.txt MANIFEST build/ dist/ .coverage cover/ youtube-dl.tar.gz
cleanall: clean
rm -f youtube-dl youtube-dl.exe
@@ -55,7 +55,9 @@ README.txt: README.md
pandoc -f markdown -t plain README.md -o README.txt
youtube-dl.1: README.md
- pandoc -s -f markdown -t man README.md -o youtube-dl.1
+ python devscripts/prepare_manpage.py >youtube-dl.1.temp.md
+ pandoc -s -f markdown -t man youtube-dl.1.temp.md -o youtube-dl.1
+ rm -f youtube-dl.1.temp.md
youtube-dl.bash-completion: youtube_dl/*.py youtube_dl/*/*.py devscripts/bash-completion.in
python devscripts/bash-completion.py
@@ -72,8 +74,9 @@ youtube-dl.tar.gz: youtube-dl README.md README.txt youtube-dl.1 youtube-dl.bash-
--exclude '__pycache' \
--exclude '.git' \
--exclude 'testdata' \
+ --exclude 'docs/_build' \
-- \
- bin devscripts test youtube_dl \
- CHANGELOG LICENSE README.md README.txt \
+ bin devscripts test youtube_dl docs \
+ LICENSE README.md README.txt \
Makefile MANIFEST.in youtube-dl.1 youtube-dl.bash-completion setup.py \
youtube-dl
diff --git a/README.md b/README.md
index 35876d9..2bea609 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,24 @@
-% YOUTUBE-DL(1)
-
-# NAME
youtube-dl - download videos from youtube.com or other video platforms
# SYNOPSIS
**youtube-dl** [OPTIONS] URL [URL...]
+# INSTALLATION
+
+To install it right away for all UNIX users (Linux, OS X, etc.), type:
+
+ sudo curl https://yt-dl.org/latest/youtube-dl -o /usr/local/bin/youtube-dl
+ sudo chmod a+x /usr/local/bin/youtube-dl
+
+If you do not have curl, you can alternatively use a recent wget:
+
+ sudo wget https://yt-dl.org/downloads/2014.05.13/youtube-dl -O /usr/local/bin/youtube-dl
+ sudo chmod a+x /usr/local/bin/youtube-dl
+
+Windows users can [download a .exe file](https://yt-dl.org/latest/youtube-dl.exe) and place it in their home directory or any other location on their [PATH](http://en.wikipedia.org/wiki/PATH_%28variable%29).
+
+Alternatively, refer to the developer instructions below for how to check out and work with the git repository. For further options, including PGP signatures, see https://rg3.github.io/youtube-dl/download.html .
+
# DESCRIPTION
**youtube-dl** is a small command-line program to download videos from
YouTube.com and a few more sites. It requires the Python interpreter, version
@@ -20,7 +33,7 @@ which means you can modify it, redistribute it or use it however you like.
sure that you have sufficient permissions
(run with sudo if needed)
-i, --ignore-errors continue on download errors, for example to
- to skip unavailable videos in a playlist
+ skip unavailable videos in a playlist
--abort-on-error Abort downloading of further videos (in the
playlist or the command line) if an error
occurs
@@ -28,6 +41,9 @@ which means you can modify it, redistribute it or use it however you like.
--user-agent UA specify a custom user agent
--referer REF specify a custom referer, use if the video
access is restricted to one domain
+ --add-header FIELD:VALUE specify a custom HTTP header and its value,
+ separated by a colon ':'. You can use this
+ option multiple times
--list-extractors List all supported extractors and the URLs
they would handle
--extractor-descriptions Output descriptions of all supported
@@ -36,6 +52,9 @@ which means you can modify it, redistribute it or use it however you like.
an empty string (--proxy "") for direct
connection
--no-check-certificate Suppress HTTPS certificate validation.
+ --prefer-insecure Use an unencrypted connection to retrieve
+ information about the video. (Currently
+ supported only for YouTube)
--cache-dir DIR Location in the filesystem where youtube-dl
can store some downloaded information
permanently. By default $XDG_CACHE_HOME
@@ -59,6 +78,7 @@ which means you can modify it, redistribute it or use it however you like.
configuration in ~/.config/youtube-dl.conf
(%APPDATA%/youtube-dl/config.txt on
Windows)
+ --encoding ENCODING Force the specified encoding (experimental)
## Video Selection:
--playlist-start NUMBER playlist video to start at (default is 1)
@@ -124,8 +144,12 @@ which means you can modify it, redistribute it or use it however you like.
video id, %(playlist)s for the playlist the
video is in, %(playlist_index)s for the
position in the playlist and %% for a
- literal percent. Use - to output to stdout.
- Can also be used to download to a different
+ literal percent. %(height)s and %(width)s
+ for the width and height of the video
+ format. %(resolution)s for a textual
+ description of the resolution of the video
+ format. Use - to output to stdout. Can also
+ be used to download to a different
directory, for example with -o '/my/downloa
ds/%(uploader)s/%(title)s-%(id)s.%(ext)s' .
--autonumber-size NUMBER Specifies the number of digits in
@@ -159,6 +183,7 @@ which means you can modify it, redistribute it or use it however you like.
## Verbosity / Simulation Options:
-q, --quiet activates quiet mode
+ --no-warnings Ignore warnings
-s, --simulate do not download the video and do not write
anything to disk
--skip-download do not download the video
@@ -170,7 +195,9 @@ which means you can modify it, redistribute it or use it however you like.
--get-duration simulate, quiet but print video length
--get-filename simulate, quiet but print output filename
--get-format simulate, quiet but print output format
- -j, --dump-json simulate, quiet but print JSON information
+ -j, --dump-json simulate, quiet but print JSON information.
+ See --output for a description of available
+ keys.
--newline output progress bar as new lines
--no-progress do not print progress bar
--console-title display progress in console titlebar
@@ -187,9 +214,9 @@ which means you can modify it, redistribute it or use it however you like.
preference using slashes: "-f 22/17/18".
"-f mp4" and "-f flv" are also supported.
You can also use the special names "best",
- "bestaudio", "worst", and "worstaudio". By
- default, youtube-dl will pick the best
- quality.
+ "bestvideo", "bestaudio", "worst",
+ "worstvideo" and "worstaudio". By default,
+ youtube-dl will pick the best quality.
--all-formats download all available video formats
--prefer-free-formats prefer free video formats unless a specific
one is requested
@@ -236,6 +263,7 @@ which means you can modify it, redistribute it or use it however you like.
default
--embed-subs embed subtitles in the video (only for mp4
videos)
+ --embed-thumbnail embed thumbnail in the audio as cover art
--add-metadata write metadata to the video file
--xattrs write metadata to the video file's xattrs
(using dublin core and xdg standards)
@@ -246,7 +274,7 @@ which means you can modify it, redistribute it or use it however you like.
# CONFIGURATION
-You can configure youtube-dl by placing default arguments (such as `--extract-audio --no-mtime` to always extract the audio and not copy the mtime) into `/etc/youtube-dl.conf` and/or `~/.config/youtube-dl.conf`. On Windows, the configuration file locations are `%APPDATA%\youtube-dl\config.txt` and `C:\Users\<Yourname>\youtube-dl.conf`.
+You can configure youtube-dl by placing default arguments (such as `--extract-audio --no-mtime` to always extract the audio and not copy the mtime) into `/etc/youtube-dl.conf` and/or `~/.config/youtube-dl/config`. On Windows, the configuration file locations are `%APPDATA%\youtube-dl\config.txt` and `C:\Users\<Yourname>\youtube-dl.conf`.
# OUTPUT TEMPLATE
@@ -281,12 +309,14 @@ Videos can be filtered by their upload date using the options `--date`, `--dateb
Examples:
- $ # Download only the videos uploaded in the last 6 months
- $ youtube-dl --dateafter now-6months
- $ # Download only the videos uploaded on January 1, 1970
- $ youtube-dl --date 19700101
- $ # will only download the videos uploaded in the 200x decade
- $ youtube-dl --dateafter 20000101 --datebefore 20091231
+ # Download only the videos uploaded in the last 6 months
+ $ youtube-dl --dateafter now-6months
+
+ # Download only the videos uploaded on January 1, 1970
+ $ youtube-dl --date 19700101
+
+ $ # will only download the videos uploaded in the 200x decade
+ $ youtube-dl --dateafter 20000101 --datebefore 20091231
# FAQ
@@ -355,7 +385,67 @@ If you want to create a build of youtube-dl yourself, you'll need
### Adding support for a new site
-If you want to add support for a new site, copy *any* [recently modified](https://github.com/rg3/youtube-dl/commits/master/youtube_dl/extractor) file in `youtube_dl/extractor`, add an import in [`youtube_dl/extractor/__init__.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/__init__.py). Have a look at [`youtube_dl/common/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L38). Don't forget to run the tests with `python test/test_download.py Test_Download.test_YourExtractor`! For a detailed tutorial, refer to [this blog post](http://filippo.io/add-support-for-a-new-video-site-to-youtube-dl/).
+If you want to add support for a new site, you can follow this quick list (assuming your service is called `yourextractor`):
+
+1. [Fork this repository](https://github.com/rg3/youtube-dl/fork)
+2. Check out the source code with `git clone git@github.com:YOUR_GITHUB_USERNAME/youtube-dl.git`
+3. Start a new git branch with `cd youtube-dl; git checkout -b yourextractor`
+4. Start with this simple template and save it to `youtube_dl/extractor/yourextractor.py`:
+
+ # coding: utf-8
+ from __future__ import unicode_literals
+
+ import re
+
+ from .common import InfoExtractor
+
+
+ class YourExtractorIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?yourextractor\.com/watch/(?P<id>[0-9]+)'
+ _TEST = {
+ 'url': 'http://yourextractor.com/watch/42',
+ 'md5': 'TODO: md5 sum of the first 10KiB of the video file',
+ 'info_dict': {
+ 'id': '42',
+ 'ext': 'mp4',
+ 'title': 'Video title goes here',
+ # TODO more properties, either as:
+ # * A value
+ # * MD5 checksum; start the string with md5:
+ # * A regular expression; start the string with re:
+ # * Any Python type (for example int or float)
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+
+ # TODO more code goes here, for example ...
+ webpage = self._download_webpage(url, video_id)
+ title = self._html_search_regex(r'<h1>(.*?)</h1>', webpage, 'title')
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ # TODO more properties (see youtube_dl/extractor/common.py)
+ }
+
+
+5. Add an import in [`youtube_dl/extractor/__init__.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/__init__.py).
+6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done.
+7. Have a look at [`youtube_dl/common/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L38). Add tests and code for as many as you want.
+8. If you can, check the code with [pyflakes](https://pypi.python.org/pypi/pyflakes) (a good idea) and [pep8](https://pypi.python.org/pypi/pep8) (optional, ignore E501).
+9. When the tests pass, [add](https://www.kernel.org/pub/software/scm/git/docs/git-add.html) the new files and [commit](https://www.kernel.org/pub/software/scm/git/docs/git-commit.html) them and [push](https://www.kernel.org/pub/software/scm/git/docs/git-push.html) the result, like this:
+
+ $ git add youtube_dl/extractor/__init__.py
+ $ git add youtube_dl/extractor/yourextractor.py
+ $ git commit -m '[yourextractor] Add new extractor'
+ $ git push origin yourextractor
+
+10. Finally, [create a pull request](https://help.github.com/articles/creating-a-pull-request). We'll then review and merge it.
+
+In any case, thank you very much for your contributions!
# BUGS
@@ -381,7 +471,7 @@ If your report is shorter than two lines, it is almost certainly missing some of
For bug reports, this means that your report should contain the *complete* output of youtube-dl when called with the -v flag. The error message you get for (most) bugs even says so, but you would not believe how many of our bug reports do not contain this information.
-Site support requests must contain an example URL. An example URL is a URL you might want to download, like http://www.youtube.com/watch?v=BaW_jenozKc . There should be an obvious video present. Except under very special circumstances, the main page of a video service (e.g. http://www.youtube.com/ ) is *not* an example URL.
+Site support requests **must contain an example URL**. An example URL is a URL you might want to download, like http://www.youtube.com/watch?v=BaW_jenozKc . There should be an obvious video present. Except under very special circumstances, the main page of a video service (e.g. http://www.youtube.com/ ) is *not* an example URL.
### Are you using the latest version?
diff --git a/README.txt b/README.txt
index 0015a74..4757a33 100644
--- a/README.txt
+++ b/README.txt
@@ -1,6 +1,3 @@
-NAME
-====
-
youtube-dl - download videos from youtube.com or other video platforms
SYNOPSIS
@@ -8,6 +5,27 @@ SYNOPSIS
youtube-dl OPTIONS URL [URL...]
+INSTALLATION
+============
+
+To install it right away for all UNIX users (Linux, OS X, etc.), type:
+
+ sudo curl https://yt-dl.org/latest/youtube-dl -o /usr/local/bin/youtube-dl
+ sudo chmod a+x /usr/local/bin/youtube-dl
+
+If you do not have curl, you can alternatively use a recent wget:
+
+ sudo wget https://yt-dl.org/downloads/2014.05.13/youtube-dl -O /usr/local/bin/youtube-dl
+ sudo chmod a+x /usr/local/bin/youtube-dl
+
+Windows users can download a .exe file and place it in their home
+directory or any other location on their PATH.
+
+Alternatively, refer to the developer instructions below for how to
+check out and work with the git repository. For further options,
+including PGP signatures, see
+https://rg3.github.io/youtube-dl/download.html .
+
DESCRIPTION
===========
@@ -27,7 +45,7 @@ OPTIONS
sure that you have sufficient permissions
(run with sudo if needed)
-i, --ignore-errors continue on download errors, for example to
- to skip unavailable videos in a playlist
+ skip unavailable videos in a playlist
--abort-on-error Abort downloading of further videos (in the
playlist or the command line) if an error
occurs
@@ -35,6 +53,9 @@ OPTIONS
--user-agent UA specify a custom user agent
--referer REF specify a custom referer, use if the video
access is restricted to one domain
+ --add-header FIELD:VALUE specify a custom HTTP header and its value,
+ separated by a colon ':'. You can use this
+ option multiple times
--list-extractors List all supported extractors and the URLs
they would handle
--extractor-descriptions Output descriptions of all supported
@@ -43,6 +64,9 @@ OPTIONS
an empty string (--proxy "") for direct
connection
--no-check-certificate Suppress HTTPS certificate validation.
+ --prefer-insecure Use an unencrypted connection to retrieve
+ information about the video. (Currently
+ supported only for YouTube)
--cache-dir DIR Location in the filesystem where youtube-dl
can store some downloaded information
permanently. By default $XDG_CACHE_HOME
@@ -66,6 +90,7 @@ OPTIONS
configuration in ~/.config/youtube-dl.conf
(%APPDATA%/youtube-dl/config.txt on
Windows)
+ --encoding ENCODING Force the specified encoding (experimental)
Video Selection:
----------------
@@ -137,8 +162,12 @@ Filesystem Options:
video id, %(playlist)s for the playlist the
video is in, %(playlist_index)s for the
position in the playlist and %% for a
- literal percent. Use - to output to stdout.
- Can also be used to download to a different
+ literal percent. %(height)s and %(width)s
+ for the width and height of the video
+ format. %(resolution)s for a textual
+ description of the resolution of the video
+ format. Use - to output to stdout. Can also
+ be used to download to a different
directory, for example with -o '/my/downloa
ds/%(uploader)s/%(title)s-%(id)s.%(ext)s' .
--autonumber-size NUMBER Specifies the number of digits in
@@ -174,6 +203,7 @@ Verbosity / Simulation Options:
-------------------------------
-q, --quiet activates quiet mode
+ --no-warnings Ignore warnings
-s, --simulate do not download the video and do not write
anything to disk
--skip-download do not download the video
@@ -185,7 +215,9 @@ Verbosity / Simulation Options:
--get-duration simulate, quiet but print video length
--get-filename simulate, quiet but print output filename
--get-format simulate, quiet but print output format
- -j, --dump-json simulate, quiet but print JSON information
+ -j, --dump-json simulate, quiet but print JSON information.
+ See --output for a description of available
+ keys.
--newline output progress bar as new lines
--no-progress do not print progress bar
--console-title display progress in console titlebar
@@ -204,9 +236,9 @@ Video Format Options:
preference using slashes: "-f 22/17/18".
"-f mp4" and "-f flv" are also supported.
You can also use the special names "best",
- "bestaudio", "worst", and "worstaudio". By
- default, youtube-dl will pick the best
- quality.
+ "bestvideo", "bestaudio", "worst",
+ "worstvideo" and "worstaudio". By default,
+ youtube-dl will pick the best quality.
--all-formats download all available video formats
--prefer-free-formats prefer free video formats unless a specific
one is requested
@@ -259,6 +291,7 @@ Post-processing Options:
default
--embed-subs embed subtitles in the video (only for mp4
videos)
+ --embed-thumbnail embed thumbnail in the audio as cover art
--add-metadata write metadata to the video file
--xattrs write metadata to the video file's xattrs
(using dublin core and xdg standards)
@@ -272,7 +305,7 @@ CONFIGURATION
You can configure youtube-dl by placing default arguments (such as
--extract-audio --no-mtime to always extract the audio and not copy the
-mtime) into /etc/youtube-dl.conf and/or ~/.config/youtube-dl.conf. On
+mtime) into /etc/youtube-dl.conf and/or ~/.config/youtube-dl/config. On
Windows, the configuration file locations are
%APPDATA%\youtube-dl\config.txt and C:\Users\<Yourname>\youtube-dl.conf.
@@ -330,11 +363,14 @@ Videos can be filtered by their upload date using the options --date,
Examples:
-$ # Download only the videos uploaded in the last 6 months $ youtube-dl
---dateafter now-6months $ # Download only the videos uploaded on January
-1, 1970 $ youtube-dl --date 19700101 $ # will only download the videos
-uploaded in the 200x decade $ youtube-dl --dateafter 20000101
---datebefore 20091231
+ # Download only the videos uploaded in the last 6 months
+ $ youtube-dl --dateafter now-6months
+
+ # Download only the videos uploaded on January 1, 1970
+ $ youtube-dl --date 19700101
+
+ $ # will only download the videos uploaded in the 200x decade
+ $ youtube-dl --dateafter 20000101 --datebefore 20091231
FAQ
===
@@ -433,14 +469,76 @@ If you want to create a build of youtube-dl yourself, you'll need
Adding support for a new site
-If you want to add support for a new site, copy any recently modified
-file in youtube_dl/extractor, add an import in
-youtube_dl/extractor/__init__.py. Have a look at
-youtube_dl/common/extractor/common.py for possible helper methods and a
-detailed description of what your extractor should return. Don't forget
-to run the tests with
-python test/test_download.py Test_Download.test_YourExtractor! For a
-detailed tutorial, refer to this blog post.
+If you want to add support for a new site, you can follow this quick
+list (assuming your service is called yourextractor):
+
+1. Fork this repository
+2. Check out the source code with
+ git clone git@github.com:YOUR_GITHUB_USERNAME/youtube-dl.git
+3. Start a new git branch with
+ cd youtube-dl; git checkout -b yourextractor
+4. Start with this simple template and save it to
+ youtube_dl/extractor/yourextractor.py:
+
+ # coding: utf-8
+ from __future__ import unicode_literals
+
+ import re
+
+ from .common import InfoExtractor
+
+
+ class YourExtractorIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?yourextractor\.com/watch/(?P<id>[0-9]+)'
+ _TEST = {
+ 'url': 'http://yourextractor.com/watch/42',
+ 'md5': 'TODO: md5 sum of the first 10KiB of the video file',
+ 'info_dict': {
+ 'id': '42',
+ 'ext': 'mp4',
+ 'title': 'Video title goes here',
+ # TODO more properties, either as:
+ # * A value
+ # * MD5 checksum; start the string with md5:
+ # * A regular expression; start the string with re:
+ # * Any Python type (for example int or float)
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+
+ # TODO more code goes here, for example ...
+ webpage = self._download_webpage(url, video_id)
+ title = self._html_search_regex(r'<h1>(.*?)</h1>', webpage, 'title')
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ # TODO more properties (see youtube_dl/extractor/common.py)
+ }
+
+5. Add an import in youtube_dl/extractor/__init__.py.
+6. Run python test/test_download.py TestDownload.test_YourExtractor.
+ This should fail at first, but you can continually re-run it until
+ you're done.
+7. Have a look at youtube_dl/common/extractor/common.py for possible
+ helper methods and a detailed description of what your extractor
+ should return. Add tests and code for as many as you want.
+8. If you can, check the code with pyflakes (a good idea) and pep8
+ (optional, ignore E501).
+9. When the tests pass, add the new files and commit them and push the
+ result, like this:
+
+ $ git add youtube_dl/extractor/__init__.py
+ $ git add youtube_dl/extractor/yourextractor.py
+ $ git commit -m '[yourextractor] Add new extractor'
+ $ git push origin yourextractor
+
+10. Finally, create a pull request. We'll then review and merge it.
+
+In any case, thank you very much for your contributions!
BUGS
====
diff --git a/devscripts/make_readme.py b/devscripts/make_readme.py
index cae1fa4..70fa942 100755
--- a/devscripts/make_readme.py
+++ b/devscripts/make_readme.py
@@ -15,7 +15,7 @@ header = oldreadme[:oldreadme.index('# OPTIONS')]
footer = oldreadme[oldreadme.index('# CONFIGURATION'):]
options = helptext[helptext.index(' General Options:') + 19:]
-options = re.sub(r'^ (\w.+)$', r'## \1', options, flags=re.M)
+options = re.sub(r'(?m)^ (\w.+)$', r'## \1', options)
options = '# OPTIONS\n' + options + '\n'
with io.open(README_FILE, 'w', encoding='utf-8') as f:
diff --git a/devscripts/prepare_manpage.py b/devscripts/prepare_manpage.py
new file mode 100644
index 0000000..d9c8570
--- /dev/null
+++ b/devscripts/prepare_manpage.py
@@ -0,0 +1,20 @@
+
+import io
+import os.path
+import sys
+import re
+
+ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+README_FILE = os.path.join(ROOT_DIR, 'README.md')
+
+with io.open(README_FILE, encoding='utf-8') as f:
+ readme = f.read()
+
+PREFIX = '%YOUTUBE-DL(1)\n\n# NAME\n'
+readme = re.sub(r'(?s)# INSTALLATION.*?(?=# DESCRIPTION)', '', readme)
+readme = PREFIX + readme
+
+if sys.version_info < (3, 0):
+ print(readme.encode('utf-8'))
+else:
+ print(readme)
diff --git a/devscripts/release.sh b/devscripts/release.sh
index 323acf8..453087e 100755
--- a/devscripts/release.sh
+++ b/devscripts/release.sh
@@ -14,14 +14,20 @@
set -e
-skip_tests=false
-if [ "$1" = '--skip-test' ]; then
- skip_tests=true
+skip_tests=true
+if [ "$1" = '--run-tests' ]; then
+ skip_tests=false
shift
fi
if [ -z "$1" ]; then echo "ERROR: specify version number like this: $0 1994.09.06"; exit 1; fi
version="$1"
+major_version=$(echo "$version" | sed -n 's#^\([0-9]*\.[0-9]*\.[0-9]*\).*#\1#p')
+if test "$major_version" '!=' "$(date '+%Y.%m.%d')"; then
+ echo "$version does not start with today's date!"
+ exit 1
+fi
+
if [ ! -z "`git tag | grep "$version"`" ]; then echo 'ERROR: version already present'; exit 1; fi
if [ ! -z "`git status --porcelain | grep -v CHANGELOG`" ]; then echo 'ERROR: the working directory is not clean; commit or stash changes'; exit 1; fi
useless_files=$(find youtube_dl -type f -not -name '*.py')
@@ -39,9 +45,9 @@ fi
/bin/echo -e "\n### Changing version in version.py..."
sed -i "s/__version__ = '.*'/__version__ = '$version'/" youtube_dl/version.py
-/bin/echo -e "\n### Committing CHANGELOG README.md and youtube_dl/version.py..."
+/bin/echo -e "\n### Committing README.md and youtube_dl/version.py..."
make README.md
-git add CHANGELOG README.md youtube_dl/version.py
+git add README.md youtube_dl/version.py
git commit -m "release $version"
/bin/echo -e "\n### Now tagging, signing and pushing..."
@@ -70,7 +76,7 @@ RELEASE_FILES="youtube-dl youtube-dl.exe youtube-dl-$version.tar.gz"
git checkout HEAD -- youtube-dl youtube-dl.exe
/bin/echo -e "\n### Signing and uploading the new binaries to yt-dl.org ..."
-for f in $RELEASE_FILES; do gpg --detach-sig "build/$version/$f"; done
+for f in $RELEASE_FILES; do gpg --passphrase-repeat 5 --detach-sig "build/$version/$f"; done
scp -r "build/$version" ytdl@yt-dl.org:html/tmp/
ssh ytdl@yt-dl.org "mv html/tmp/$version html/downloads/"
ssh ytdl@yt-dl.org "sh html/update_latest.sh $version"
@@ -97,7 +103,7 @@ rm -rf build
make pypi-files
echo "Uploading to PyPi ..."
-python setup.py sdist upload
+python setup.py sdist bdist_wheel upload
make clean
/bin/echo -e "\n### DONE!"
diff --git a/docs/.gitignore b/docs/.gitignore
new file mode 100644
index 0000000..69fa449
--- /dev/null
+++ b/docs/.gitignore
@@ -0,0 +1 @@
+_build/
diff --git a/docs/Makefile b/docs/Makefile
new file mode 100644
index 0000000..7122180
--- /dev/null
+++ b/docs/Makefile
@@ -0,0 +1,177 @@
+# Makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line.
+SPHINXOPTS =
+SPHINXBUILD = sphinx-build
+PAPER =
+BUILDDIR = _build
+
+# User-friendly check for sphinx-build
+ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)
+$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/)
+endif
+
+# Internal variables.
+PAPEROPT_a4 = -D latex_paper_size=a4
+PAPEROPT_letter = -D latex_paper_size=letter
+ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
+# the i18n builder cannot share the environment and doctrees with the others
+I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
+
+.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext
+
+help:
+ @echo "Please use \`make <target>' where <target> is one of"
+ @echo " html to make standalone HTML files"
+ @echo " dirhtml to make HTML files named index.html in directories"
+ @echo " singlehtml to make a single large HTML file"
+ @echo " pickle to make pickle files"
+ @echo " json to make JSON files"
+ @echo " htmlhelp to make HTML files and a HTML help project"
+ @echo " qthelp to make HTML files and a qthelp project"
+ @echo " devhelp to make HTML files and a Devhelp project"
+ @echo " epub to make an epub"
+ @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
+ @echo " latexpdf to make LaTeX files and run them through pdflatex"
+ @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
+ @echo " text to make text files"
+ @echo " man to make manual pages"
+ @echo " texinfo to make Texinfo files"
+ @echo " info to make Texinfo files and run them through makeinfo"
+ @echo " gettext to make PO message catalogs"
+ @echo " changes to make an overview of all changed/added/deprecated items"
+ @echo " xml to make Docutils-native XML files"
+ @echo " pseudoxml to make pseudoxml-XML files for display purposes"
+ @echo " linkcheck to check all external links for integrity"
+ @echo " doctest to run all doctests embedded in the documentation (if enabled)"
+
+clean:
+ rm -rf $(BUILDDIR)/*
+
+html:
+ $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
+ @echo
+ @echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
+
+dirhtml:
+ $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
+ @echo
+ @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
+
+singlehtml:
+ $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
+ @echo
+ @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
+
+pickle:
+ $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
+ @echo
+ @echo "Build finished; now you can process the pickle files."
+
+json:
+ $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
+ @echo
+ @echo "Build finished; now you can process the JSON files."
+
+htmlhelp:
+ $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
+ @echo
+ @echo "Build finished; now you can run HTML Help Workshop with the" \
+ ".hhp project file in $(BUILDDIR)/htmlhelp."
+
+qthelp:
+ $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
+ @echo
+ @echo "Build finished; now you can run "qcollectiongenerator" with the" \
+ ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
+ @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/youtube-dl.qhcp"
+ @echo "To view the help file:"
+ @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/youtube-dl.qhc"
+
+devhelp:
+ $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
+ @echo
+ @echo "Build finished."
+ @echo "To view the help file:"
+ @echo "# mkdir -p $$HOME/.local/share/devhelp/youtube-dl"
+ @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/youtube-dl"
+ @echo "# devhelp"
+
+epub:
+ $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
+ @echo
+ @echo "Build finished. The epub file is in $(BUILDDIR)/epub."
+
+latex:
+ $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+ @echo
+ @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
+ @echo "Run \`make' in that directory to run these through (pdf)latex" \
+ "(use \`make latexpdf' here to do that automatically)."
+
+latexpdf:
+ $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+ @echo "Running LaTeX files through pdflatex..."
+ $(MAKE) -C $(BUILDDIR)/latex all-pdf
+ @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
+
+latexpdfja:
+ $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+ @echo "Running LaTeX files through platex and dvipdfmx..."
+ $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
+ @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
+
+text:
+ $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
+ @echo
+ @echo "Build finished. The text files are in $(BUILDDIR)/text."
+
+man:
+ $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
+ @echo
+ @echo "Build finished. The manual pages are in $(BUILDDIR)/man."
+
+texinfo:
+ $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
+ @echo
+ @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
+ @echo "Run \`make' in that directory to run these through makeinfo" \
+ "(use \`make info' here to do that automatically)."
+
+info:
+ $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
+ @echo "Running Texinfo files through makeinfo..."
+ make -C $(BUILDDIR)/texinfo info
+ @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
+
+gettext:
+ $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
+ @echo
+ @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
+
+changes:
+ $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
+ @echo
+ @echo "The overview file is in $(BUILDDIR)/changes."
+
+linkcheck:
+ $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
+ @echo
+ @echo "Link check complete; look for any errors in the above output " \
+ "or in $(BUILDDIR)/linkcheck/output.txt."
+
+doctest:
+ $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
+ @echo "Testing of doctests in the sources finished, look at the " \
+ "results in $(BUILDDIR)/doctest/output.txt."
+
+xml:
+ $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml
+ @echo
+ @echo "Build finished. The XML files are in $(BUILDDIR)/xml."
+
+pseudoxml:
+ $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
+ @echo
+ @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."
diff --git a/docs/conf.py b/docs/conf.py
new file mode 100644
index 0000000..4a04ad7
--- /dev/null
+++ b/docs/conf.py
@@ -0,0 +1,71 @@
+# -*- coding: utf-8 -*-
+#
+# youtube-dl documentation build configuration file, created by
+# sphinx-quickstart on Fri Mar 14 21:05:43 2014.
+#
+# This file is execfile()d with the current directory set to its
+# containing dir.
+#
+# Note that not all possible configuration values are present in this
+# autogenerated file.
+#
+# All configuration values have a default; values that are commented out
+# serve to show the default.
+
+import sys
+import os
+# Allows to import youtube_dl
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+# -- General configuration ------------------------------------------------
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = [
+ 'sphinx.ext.autodoc',
+]
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# The suffix of source filenames.
+source_suffix = '.rst'
+
+# The master toctree document.
+master_doc = 'index'
+
+# General information about the project.
+project = u'youtube-dl'
+copyright = u'2014, Ricardo Garcia Gonzalez'
+
+# The version info for the project you're documenting, acts as replacement for
+# |version| and |release|, also used in various other places throughout the
+# built documents.
+#
+# The short X.Y version.
+import youtube_dl
+version = youtube_dl.__version__
+# The full version, including alpha/beta/rc tags.
+release = version
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+exclude_patterns = ['_build']
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = 'sphinx'
+
+# -- Options for HTML output ----------------------------------------------
+
+# The theme to use for HTML and HTML Help pages. See the documentation for
+# a list of builtin themes.
+html_theme = 'default'
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = 'youtube-dldoc'
diff --git a/docs/index.rst b/docs/index.rst
new file mode 100644
index 0000000..b746ff9
--- /dev/null
+++ b/docs/index.rst
@@ -0,0 +1,23 @@
+Welcome to youtube-dl's documentation!
+======================================
+
+*youtube-dl* is a command-line program to download videos from YouTube.com and more sites.
+It can also be used in Python code.
+
+Developer guide
+---------------
+
+This section contains information for using *youtube-dl* from Python programs.
+
+.. toctree::
+ :maxdepth: 2
+
+ module_guide
+
+Indices and tables
+==================
+
+* :ref:`genindex`
+* :ref:`modindex`
+* :ref:`search`
+
diff --git a/docs/module_guide.rst b/docs/module_guide.rst
new file mode 100644
index 0000000..03d7288
--- /dev/null
+++ b/docs/module_guide.rst
@@ -0,0 +1,67 @@
+Using the ``youtube_dl`` module
+===============================
+
+When using the ``youtube_dl`` module, you start by creating an instance of :class:`YoutubeDL` and adding all the available extractors:
+
+.. code-block:: python
+
+ >>> from youtube_dl import YoutubeDL
+ >>> ydl = YoutubeDL()
+ >>> ydl.add_default_info_extractors()
+
+Extracting video information
+----------------------------
+
+You use the :meth:`YoutubeDL.extract_info` method for getting the video information, which returns a dictionary:
+
+.. code-block:: python
+
+ >>> info = ydl.extract_info('http://www.youtube.com/watch?v=BaW_jenozKc', download=False)
+ [youtube] Setting language
+ [youtube] BaW_jenozKc: Downloading webpage
+ [youtube] BaW_jenozKc: Downloading video info webpage
+ [youtube] BaW_jenozKc: Extracting video information
+ >>> info['title']
+ 'youtube-dl test video "\'/\\ä↭𝕐'
+ >>> info['height'], info['width']
+ (720, 1280)
+
+If you want to download or play the video you can get its url:
+
+.. code-block:: python
+
+ >>> info['url']
+ 'https://...'
+
+Extracting playlist information
+-------------------------------
+
+The playlist information is extracted in a similar way, but the dictionary is a bit different:
+
+.. code-block:: python
+
+ >>> playlist = ydl.extract_info('http://www.ted.com/playlists/13/open_source_open_world', download=False)
+ [TED] open_source_open_world: Downloading playlist webpage
+ ...
+ >>> playlist['title']
+ 'Open-source, open world'
+
+
+
+You can access the videos in the playlist with the ``entries`` field:
+
+.. code-block:: python
+
+ >>> for video in playlist['entries']:
+ ... print('Video #%d: %s' % (video['playlist_index'], video['title']))
+
+ Video #1: How Arduino is open-sourcing imagination
+ Video #2: The year open data went worldwide
+ Video #3: Massive-scale online collaboration
+ Video #4: The art of asking
+ Video #5: How cognitive surplus will change the world
+ Video #6: The birth of Wikipedia
+ Video #7: Coding a better government
+ Video #8: The era of open innovation
+ Video #9: The currency of the new economy is trust
+
diff --git a/test/helper.py b/test/helper.py
index b1f421a..230d2bd 100644
--- a/test/helper.py
+++ b/test/helper.py
@@ -9,7 +9,10 @@ import sys
import youtube_dl.extractor
from youtube_dl import YoutubeDL
-from youtube_dl.utils import preferredencoding
+from youtube_dl.utils import (
+ compat_str,
+ preferredencoding,
+)
def get_params(override=None):
@@ -71,15 +74,77 @@ class FakeYDL(YoutubeDL):
old_report_warning(message)
self.report_warning = types.MethodType(report_warning, self)
-def get_testcases():
+
+def gettestcases(include_onlymatching=False):
for ie in youtube_dl.extractor.gen_extractors():
t = getattr(ie, '_TEST', None)
if t:
- t['name'] = type(ie).__name__[:-len('IE')]
- yield t
- for t in getattr(ie, '_TESTS', []):
+ assert not hasattr(ie, '_TESTS'), \
+ '%s has _TEST and _TESTS' % type(ie).__name__
+ tests = [t]
+ else:
+ tests = getattr(ie, '_TESTS', [])
+ for t in tests:
+ if not include_onlymatching and t.get('only_matching', False):
+ continue
t['name'] = type(ie).__name__[:-len('IE')]
yield t
md5 = lambda s: hashlib.md5(s.encode('utf-8')).hexdigest()
+
+
+def expect_info_dict(self, expected_dict, got_dict):
+ for info_field, expected in expected_dict.items():
+ if isinstance(expected, compat_str) and expected.startswith('re:'):
+ got = got_dict.get(info_field)
+ match_str = expected[len('re:'):]
+ match_rex = re.compile(match_str)
+
+ self.assertTrue(
+ isinstance(got, compat_str) and match_rex.match(got),
+ u'field %s (value: %r) should match %r' % (info_field, got, match_str))
+ elif isinstance(expected, type):
+ got = got_dict.get(info_field)
+ self.assertTrue(isinstance(got, expected),
+ u'Expected type %r for field %s, but got value %r of type %r' % (expected, info_field, got, type(got)))
+ else:
+ if isinstance(expected, compat_str) and expected.startswith('md5:'):
+ got = 'md5:' + md5(got_dict.get(info_field))
+ else:
+ got = got_dict.get(info_field)
+ self.assertEqual(expected, got,
+ u'invalid value for field %s, expected %r, got %r' % (info_field, expected, got))
+
+ # Check for the presence of mandatory fields
+ for key in ('id', 'url', 'title', 'ext'):
+ self.assertTrue(got_dict.get(key), 'Missing mandatory field %s' % key)
+ # Check for mandatory fields that are automatically set by YoutubeDL
+ for key in ['webpage_url', 'extractor', 'extractor_key']:
+ self.assertTrue(got_dict.get(key), u'Missing field: %s' % key)
+
+ # Are checkable fields missing from the test case definition?
+ test_info_dict = dict((key, value if not isinstance(value, compat_str) or len(value) < 250 else 'md5:' + md5(value))
+ for key, value in got_dict.items()
+ if value and key in ('title', 'description', 'uploader', 'upload_date', 'timestamp', 'uploader_id', 'location'))
+ missing_keys = set(test_info_dict.keys()) - set(expected_dict.keys())
+ if missing_keys:
+ sys.stderr.write(u'\n"info_dict": ' + json.dumps(test_info_dict, ensure_ascii=False, indent=4) + u'\n')
+ self.assertFalse(
+ missing_keys,
+ 'Missing keys in test definition: %s' % (
+ ', '.join(sorted(missing_keys))))
+
+
+def assertRegexpMatches(self, text, regexp, msg=None):
+ if hasattr(self, 'assertRegexpMatches'):
+ return self.assertRegexpMatches(text, regexp, msg)
+ else:
+ m = re.match(regexp, text)
+ if not m:
+ note = 'Regexp didn\'t match: %r not found in %r' % (regexp, text)
+ if msg is None:
+ msg = note
+ else:
+ msg = note + ', ' + msg
+ self.assertTrue(m, msg)
diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py
new file mode 100644
index 0000000..13c18ed
--- /dev/null
+++ b/test/test_InfoExtractor.py
@@ -0,0 +1,44 @@
+#!/usr/bin/env python
+
+from __future__ import unicode_literals
+
+# Allow direct execution
+import os
+import sys
+import unittest
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from test.helper import FakeYDL
+from youtube_dl.extractor.common import InfoExtractor
+from youtube_dl.extractor import YoutubeIE, get_info_extractor
+
+
+class TestIE(InfoExtractor):
+ pass
+
+
+class TestInfoExtractor(unittest.TestCase):
+ def setUp(self):
+ self.ie = TestIE(FakeYDL())
+
+ def test_ie_key(self):
+ self.assertEqual(get_info_extractor(YoutubeIE.ie_key()), YoutubeIE)
+
+ def test_html_search_regex(self):
+ html = '<p id="foo">Watch this <a href="http://www.youtube.com/watch?v=BaW_jenozKc">video</a></p>'
+ search = lambda re, *args: self.ie._html_search_regex(re, html, *args)
+ self.assertEqual(search(r'<p id="foo">(.+?)</p>', 'foo'), 'Watch this video')
+
+ def test_opengraph(self):
+ ie = self.ie
+ html = '''
+ <meta name="og:title" content='Foo'/>
+ <meta content="Some video's description " name="og:description"/>
+ <meta property='og:image' content='http://domain.com/pic.jpg?key1=val1&amp;key2=val2'/>
+ '''
+ self.assertEqual(ie._og_search_title(html), 'Foo')
+ self.assertEqual(ie._og_search_description(html), 'Some video\'s description ')
+ self.assertEqual(ie._og_search_thumbnail(html), 'http://domain.com/pic.jpg?key1=val1&key2=val2')
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py
index 37e7b9b..e794cc9 100644
--- a/test/test_YoutubeDL.py
+++ b/test/test_YoutubeDL.py
@@ -8,7 +8,7 @@ import sys
import unittest
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-from test.helper import FakeYDL
+from test.helper import FakeYDL, assertRegexpMatches
from youtube_dl import YoutubeDL
from youtube_dl.extractor import YoutubeIE
@@ -26,16 +26,27 @@ class YDL(FakeYDL):
self.msgs.append(msg)
+def _make_result(formats, **kwargs):
+ res = {
+ 'formats': formats,
+ 'id': 'testid',
+ 'title': 'testttitle',
+ 'extractor': 'testex',
+ }
+ res.update(**kwargs)
+ return res
+
+
class TestFormatSelection(unittest.TestCase):
def test_prefer_free_formats(self):
# Same resolution => download webm
ydl = YDL()
ydl.params['prefer_free_formats'] = True
formats = [
- {'ext': 'webm', 'height': 460},
- {'ext': 'mp4', 'height': 460},
+ {'ext': 'webm', 'height': 460, 'url': 'x'},
+ {'ext': 'mp4', 'height': 460, 'url': 'y'},
]
- info_dict = {'formats': formats, 'extractor': 'test'}
+ info_dict = _make_result(formats)
yie = YoutubeIE(ydl)
yie._sort_formats(info_dict['formats'])
ydl.process_ie_result(info_dict)
@@ -46,8 +57,8 @@ class TestFormatSelection(unittest.TestCase):
ydl = YDL()
ydl.params['prefer_free_formats'] = True
formats = [
- {'ext': 'webm', 'height': 720},
- {'ext': 'mp4', 'height': 1080},
+ {'ext': 'webm', 'height': 720, 'url': 'a'},
+ {'ext': 'mp4', 'height': 1080, 'url': 'b'},
]
info_dict['formats'] = formats
yie = YoutubeIE(ydl)
@@ -56,13 +67,13 @@ class TestFormatSelection(unittest.TestCase):
downloaded = ydl.downloaded_info_dicts[0]
self.assertEqual(downloaded['ext'], 'mp4')
- # No prefer_free_formats => prefer mp4 and flv for greater compatibilty
+ # No prefer_free_formats => prefer mp4 and flv for greater compatibility
ydl = YDL()
ydl.params['prefer_free_formats'] = False
formats = [
- {'ext': 'webm', 'height': 720},
- {'ext': 'mp4', 'height': 720},
- {'ext': 'flv', 'height': 720},
+ {'ext': 'webm', 'height': 720, 'url': '_'},
+ {'ext': 'mp4', 'height': 720, 'url': '_'},
+ {'ext': 'flv', 'height': 720, 'url': '_'},
]
info_dict['formats'] = formats
yie = YoutubeIE(ydl)
@@ -74,8 +85,8 @@ class TestFormatSelection(unittest.TestCase):
ydl = YDL()
ydl.params['prefer_free_formats'] = False
formats = [
- {'ext': 'flv', 'height': 720},
- {'ext': 'webm', 'height': 720},
+ {'ext': 'flv', 'height': 720, 'url': '_'},
+ {'ext': 'webm', 'height': 720, 'url': '_'},
]
info_dict['formats'] = formats
yie = YoutubeIE(ydl)
@@ -91,8 +102,7 @@ class TestFormatSelection(unittest.TestCase):
{'format_id': 'great', 'url': 'http://example.com/great', 'preference': 3},
{'format_id': 'excellent', 'url': 'http://example.com/exc', 'preference': 4},
]
- info_dict = {
- 'formats': formats, 'extractor': 'test', 'id': 'testvid'}
+ info_dict = _make_result(formats)
ydl = YDL()
ydl.process_ie_result(info_dict)
@@ -120,12 +130,12 @@ class TestFormatSelection(unittest.TestCase):
def test_format_selection(self):
formats = [
- {'format_id': '35', 'ext': 'mp4', 'preference': 1},
- {'format_id': '45', 'ext': 'webm', 'preference': 2},
- {'format_id': '47', 'ext': 'webm', 'preference': 3},
- {'format_id': '2', 'ext': 'flv', 'preference': 4},
+ {'format_id': '35', 'ext': 'mp4', 'preference': 1, 'url': '_'},
+ {'format_id': '45', 'ext': 'webm', 'preference': 2, 'url': '_'},
+ {'format_id': '47', 'ext': 'webm', 'preference': 3, 'url': '_'},
+ {'format_id': '2', 'ext': 'flv', 'preference': 4, 'url': '_'},
]
- info_dict = {'formats': formats, 'extractor': 'test'}
+ info_dict = _make_result(formats)
ydl = YDL({'format': '20/47'})
ydl.process_ie_result(info_dict.copy())
@@ -154,12 +164,12 @@ class TestFormatSelection(unittest.TestCase):
def test_format_selection_audio(self):
formats = [
- {'format_id': 'audio-low', 'ext': 'webm', 'preference': 1, 'vcodec': 'none'},
- {'format_id': 'audio-mid', 'ext': 'webm', 'preference': 2, 'vcodec': 'none'},
- {'format_id': 'audio-high', 'ext': 'flv', 'preference': 3, 'vcodec': 'none'},
- {'format_id': 'vid', 'ext': 'mp4', 'preference': 4},
+ {'format_id': 'audio-low', 'ext': 'webm', 'preference': 1, 'vcodec': 'none', 'url': '_'},
+ {'format_id': 'audio-mid', 'ext': 'webm', 'preference': 2, 'vcodec': 'none', 'url': '_'},
+ {'format_id': 'audio-high', 'ext': 'flv', 'preference': 3, 'vcodec': 'none', 'url': '_'},
+ {'format_id': 'vid', 'ext': 'mp4', 'preference': 4, 'url': '_'},
]
- info_dict = {'formats': formats, 'extractor': 'test'}
+ info_dict = _make_result(formats)
ydl = YDL({'format': 'bestaudio'})
ydl.process_ie_result(info_dict.copy())
@@ -172,16 +182,34 @@ class TestFormatSelection(unittest.TestCase):
self.assertEqual(downloaded['format_id'], 'audio-low')
formats = [
- {'format_id': 'vid-low', 'ext': 'mp4', 'preference': 1},
- {'format_id': 'vid-high', 'ext': 'mp4', 'preference': 2},
+ {'format_id': 'vid-low', 'ext': 'mp4', 'preference': 1, 'url': '_'},
+ {'format_id': 'vid-high', 'ext': 'mp4', 'preference': 2, 'url': '_'},
]
- info_dict = {'formats': formats, 'extractor': 'test'}
+ info_dict = _make_result(formats)
ydl = YDL({'format': 'bestaudio/worstaudio/best'})
ydl.process_ie_result(info_dict.copy())
downloaded = ydl.downloaded_info_dicts[0]
self.assertEqual(downloaded['format_id'], 'vid-high')
+ def test_format_selection_video(self):
+ formats = [
+ {'format_id': 'dash-video-low', 'ext': 'mp4', 'preference': 1, 'acodec': 'none', 'url': '_'},
+ {'format_id': 'dash-video-high', 'ext': 'mp4', 'preference': 2, 'acodec': 'none', 'url': '_'},
+ {'format_id': 'vid', 'ext': 'mp4', 'preference': 3, 'url': '_'},
+ ]
+ info_dict = _make_result(formats)
+
+ ydl = YDL({'format': 'bestvideo'})
+ ydl.process_ie_result(info_dict.copy())
+ downloaded = ydl.downloaded_info_dicts[0]
+ self.assertEqual(downloaded['format_id'], 'dash-video-high')
+
+ ydl = YDL({'format': 'worstvideo'})
+ ydl.process_ie_result(info_dict.copy())
+ downloaded = ydl.downloaded_info_dicts[0]
+ self.assertEqual(downloaded['format_id'], 'dash-video-low')
+
def test_youtube_format_selection(self):
order = [
'38', '37', '46', '22', '45', '35', '44', '18', '34', '43', '6', '5', '36', '17', '13',
@@ -199,10 +227,12 @@ class TestFormatSelection(unittest.TestCase):
for f1id, f2id in zip(order, order[1:]):
f1 = YoutubeIE._formats[f1id].copy()
f1['format_id'] = f1id
+ f1['url'] = 'url:' + f1id
f2 = YoutubeIE._formats[f2id].copy()
f2['format_id'] = f2id
+ f2['url'] = 'url:' + f2id
- info_dict = {'formats': [f1, f2], 'extractor': 'youtube'}
+ info_dict = _make_result([f1, f2], extractor='youtube')
ydl = YDL()
yie = YoutubeIE(ydl)
yie._sort_formats(info_dict['formats'])
@@ -210,7 +240,7 @@ class TestFormatSelection(unittest.TestCase):
downloaded = ydl.downloaded_info_dicts[0]
self.assertEqual(downloaded['format_id'], f1id)
- info_dict = {'formats': [f2, f1], 'extractor': 'youtube'}
+ info_dict = _make_result([f2, f1], extractor='youtube')
ydl = YDL()
yie = YoutubeIE(ydl)
yie._sort_formats(info_dict['formats'])
@@ -244,6 +274,12 @@ class TestFormatSelection(unittest.TestCase):
# Replace missing fields with 'NA'
self.assertEqual(fname('%(uploader_date)s-%(id)s.%(ext)s'), 'NA-1234.mp4')
+ def test_format_note(self):
+ ydl = YoutubeDL()
+ self.assertEqual(ydl._format_note({}), '')
+ assertRegexpMatches(self, ydl._format_note({
+ 'vbr': 10,
+ }), '^\s*10k$')
if __name__ == '__main__':
unittest.main()
diff --git a/test/test_age_restriction.py b/test/test_age_restriction.py
index c9cdb96..71e80b0 100644
--- a/test/test_age_restriction.py
+++ b/test/test_age_restriction.py
@@ -13,7 +13,7 @@ from youtube_dl import YoutubeDL
def _download_restricted(url, filename, age):
- """ Returns true iff the file has been downloaded """
+ """ Returns true if the file has been downloaded """
params = {
'age_limit': age,
diff --git a/test/test_all_urls.py b/test/test_all_urls.py
index aa8e4e4..4b56137 100644
--- a/test/test_all_urls.py
+++ b/test/test_all_urls.py
@@ -9,7 +9,7 @@ import unittest
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-from test.helper import get_testcases
+from test.helper import gettestcases
from youtube_dl.extractor import (
FacebookIE,
@@ -49,6 +49,7 @@ class TestAllURLsMatching(unittest.TestCase):
self.assertMatch('http://youtu.be/BaW_jenozKc', ['youtube'])
self.assertMatch('http://www.youtube.com/v/BaW_jenozKc', ['youtube'])
self.assertMatch('https://youtube.googleapis.com/v/BaW_jenozKc', ['youtube'])
+ self.assertMatch('http://www.cleanvideosearch.com/media/action/yt/watch?videoId=8v_4O44sfjM', ['youtube'])
def test_youtube_channel_matching(self):
assertChannel = lambda url: self.assertMatch(url, ['youtube:channel'])
@@ -68,21 +69,28 @@ class TestAllURLsMatching(unittest.TestCase):
def test_youtube_show_matching(self):
self.assertMatch('http://www.youtube.com/show/airdisasters', ['youtube:show'])
+ def test_youtube_truncated(self):
+ self.assertMatch('http://www.youtube.com/watch?', ['youtube:truncated_url'])
+
+ def test_youtube_search_matching(self):
+ self.assertMatch('http://www.youtube.com/results?search_query=making+mustard', ['youtube:search_url'])
+ self.assertMatch('https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', ['youtube:search_url'])
+
def test_justin_tv_channelid_matching(self):
- self.assertTrue(JustinTVIE.suitable(u"justin.tv/vanillatv"))
- self.assertTrue(JustinTVIE.suitable(u"twitch.tv/vanillatv"))
- self.assertTrue(JustinTVIE.suitable(u"www.justin.tv/vanillatv"))
- self.assertTrue(JustinTVIE.suitable(u"www.twitch.tv/vanillatv"))
- self.assertTrue(JustinTVIE.suitable(u"http://www.justin.tv/vanillatv"))
- self.assertTrue(JustinTVIE.suitable(u"http://www.twitch.tv/vanillatv"))
- self.assertTrue(JustinTVIE.suitable(u"http://www.justin.tv/vanillatv/"))
- self.assertTrue(JustinTVIE.suitable(u"http://www.twitch.tv/vanillatv/"))
+ self.assertTrue(JustinTVIE.suitable('justin.tv/vanillatv'))
+ self.assertTrue(JustinTVIE.suitable('twitch.tv/vanillatv'))
+ self.assertTrue(JustinTVIE.suitable('www.justin.tv/vanillatv'))
+ self.assertTrue(JustinTVIE.suitable('www.twitch.tv/vanillatv'))
+ self.assertTrue(JustinTVIE.suitable('http://www.justin.tv/vanillatv'))
+ self.assertTrue(JustinTVIE.suitable('http://www.twitch.tv/vanillatv'))
+ self.assertTrue(JustinTVIE.suitable('http://www.justin.tv/vanillatv/'))
+ self.assertTrue(JustinTVIE.suitable('http://www.twitch.tv/vanillatv/'))
def test_justintv_videoid_matching(self):
- self.assertTrue(JustinTVIE.suitable(u"http://www.twitch.tv/vanillatv/b/328087483"))
+ self.assertTrue(JustinTVIE.suitable('http://www.twitch.tv/vanillatv/b/328087483'))
def test_justin_tv_chapterid_matching(self):
- self.assertTrue(JustinTVIE.suitable(u"http://www.twitch.tv/tsm_theoddone/c/2349361"))
+ self.assertTrue(JustinTVIE.suitable('http://www.twitch.tv/tsm_theoddone/c/2349361'))
def test_youtube_extract(self):
assertExtractId = lambda url, id: self.assertEqual(YoutubeIE.extract_id(url), id)
@@ -98,7 +106,7 @@ class TestAllURLsMatching(unittest.TestCase):
def test_no_duplicates(self):
ies = gen_extractors()
- for tc in get_testcases():
+ for tc in gettestcases(include_onlymatching=True):
url = tc['url']
for ie in ies:
if type(ie).__name__ in ('GenericIE', tc['name'] + 'IE'):
@@ -117,6 +125,8 @@ class TestAllURLsMatching(unittest.TestCase):
def test_vimeo_matching(self):
self.assertMatch('http://vimeo.com/channels/tributes', ['vimeo:channel'])
+ self.assertMatch('http://vimeo.com/channels/31259', ['vimeo:channel'])
+ self.assertMatch('http://vimeo.com/channels/31259/53576664', ['vimeo'])
self.assertMatch('http://vimeo.com/user7108434', ['vimeo:user'])
self.assertMatch('http://vimeo.com/user7108434/videos', ['vimeo:user'])
self.assertMatch('https://vimeo.com/user21297594/review/75524534/3c257a1b5d', ['vimeo:review'])
@@ -132,6 +142,40 @@ class TestAllURLsMatching(unittest.TestCase):
def test_pbs(self):
# https://github.com/rg3/youtube-dl/issues/2350
self.assertMatch('http://video.pbs.org/viralplayer/2365173446/', ['PBS'])
+ self.assertMatch('http://video.pbs.org/widget/partnerplayer/980042464/', ['PBS'])
+
+ def test_ComedyCentralShows(self):
+ self.assertMatch(
+ 'http://thedailyshow.cc.com/extended-interviews/xm3fnq/andrew-napolitano-extended-interview',
+ ['ComedyCentralShows'])
+ self.assertMatch(
+ 'http://thecolbertreport.cc.com/videos/29w6fx/-realhumanpraise-for-fox-news',
+ ['ComedyCentralShows'])
+ self.assertMatch(
+ 'http://thecolbertreport.cc.com/videos/gh6urb/neil-degrasse-tyson-pt--1?xrs=eml_col_031114',
+ ['ComedyCentralShows'])
+ self.assertMatch(
+ 'http://thedailyshow.cc.com/guests/michael-lewis/3efna8/exclusive---michael-lewis-extended-interview-pt--3',
+ ['ComedyCentralShows'])
+ self.assertMatch(
+ 'http://thedailyshow.cc.com/episodes/sy7yv0/april-8--2014---denis-leary',
+ ['ComedyCentralShows'])
+ self.assertMatch(
+ 'http://thecolbertreport.cc.com/episodes/8ase07/april-8--2014---jane-goodall',
+ ['ComedyCentralShows'])
+ self.assertMatch(
+ 'http://thedailyshow.cc.com/video-playlists/npde3s/the-daily-show-19088-highlights',
+ ['ComedyCentralShows'])
+ self.assertMatch(
+ 'http://thedailyshow.cc.com/special-editions/2l8fdb/special-edition---a-look-back-at-food',
+ ['ComedyCentralShows'])
+
+ def test_yahoo_https(self):
+ # https://github.com/rg3/youtube-dl/issues/2701
+ self.assertMatch(
+ 'https://screen.yahoo.com/smartwatches-latest-wearable-gadgets-163745379-cbs.html',
+ ['Yahoo'])
+
if __name__ == '__main__':
unittest.main()
diff --git a/test/test_download.py b/test/test_download.py
index 7587a18..f171c10 100644
--- a/test/test_download.py
+++ b/test/test_download.py
@@ -8,10 +8,11 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from test.helper import (
get_params,
- get_testcases,
- try_rm,
+ gettestcases,
+ expect_info_dict,
md5,
- report_warning
+ try_rm,
+ report_warning,
)
@@ -50,7 +51,7 @@ def _file_md5(fn):
with open(fn, 'rb') as f:
return hashlib.md5(f.read()).hexdigest()
-defs = get_testcases()
+defs = gettestcases()
class TestDownload(unittest.TestCase):
@@ -72,9 +73,7 @@ def generator(test_case):
if 'playlist' not in test_case:
info_dict = test_case.get('info_dict', {})
if not test_case.get('file') and not (info_dict.get('id') and info_dict.get('ext')):
- print_skipping('The output file cannot be know, the "file" '
- 'key is missing or the info_dict is incomplete')
- return
+ raise Exception('Test definition incorrect. The output file cannot be known. Are both \'id\' and \'ext\' keys present?')
if 'skip' in test_case:
print_skipping(test_case['skip'])
return
@@ -136,27 +135,8 @@ def generator(test_case):
self.assertEqual(md5_for_file, tc['md5'])
with io.open(info_json_fn, encoding='utf-8') as infof:
info_dict = json.load(infof)
- for (info_field, expected) in tc.get('info_dict', {}).items():
- if isinstance(expected, compat_str) and expected.startswith('md5:'):
- got = 'md5:' + md5(info_dict.get(info_field))
- else:
- got = info_dict.get(info_field)
- self.assertEqual(expected, got,
- u'invalid value for field %s, expected %r, got %r' % (info_field, expected, got))
-
- # If checkable fields are missing from the test case, print the info_dict
- test_info_dict = dict((key, value if not isinstance(value, compat_str) or len(value) < 250 else 'md5:' + md5(value))
- for key, value in info_dict.items()
- if value and key in ('title', 'description', 'uploader', 'upload_date', 'uploader_id', 'location'))
- if not all(key in tc.get('info_dict', {}).keys() for key in test_info_dict.keys()):
- sys.stderr.write(u'\n"info_dict": ' + json.dumps(test_info_dict, ensure_ascii=False, indent=4) + u'\n')
-
- # Check for the presence of mandatory fields
- for key in ('id', 'url', 'title', 'ext'):
- self.assertTrue(key in info_dict.keys() and info_dict[key])
- # Check for mandatory fields that are automatically set by YoutubeDL
- for key in ['webpage_url', 'extractor', 'extractor_key']:
- self.assertTrue(info_dict.get(key), u'Missing field: %s' % key)
+
+ expect_info_dict(self, tc.get('info_dict', {}), info_dict)
finally:
try_rm_tcs_files()
diff --git a/test/test_playlists.py b/test/test_playlists.py
index 1de9e8e..465b07b 100644
--- a/test/test_playlists.py
+++ b/test/test_playlists.py
@@ -9,8 +9,11 @@ import sys
import unittest
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-from test.helper import FakeYDL
-
+from test.helper import (
+ assertRegexpMatches,
+ expect_info_dict,
+ FakeYDL,
+)
from youtube_dl.extractor import (
AcademicEarthCourseIE,
@@ -20,9 +23,12 @@ from youtube_dl.extractor import (
VimeoUserIE,
VimeoAlbumIE,
VimeoGroupsIE,
+ VineUserIE,
UstreamChannelIE,
SoundcloudSetIE,
SoundcloudUserIE,
+ SoundcloudPlaylistIE,
+ TeacherTubeClassroomIE,
LivestreamIE,
NHLVideocenterIE,
BambuserChannelIE,
@@ -36,6 +42,12 @@ from youtube_dl.extractor import (
RutubeChannelIE,
GoogleSearchIE,
GenericIE,
+ TEDIE,
+ ToypicsUserIE,
+ XTubeUserIE,
+ InstagramUserIE,
+ CSpanIE,
+ AolIE,
)
@@ -92,13 +104,20 @@ class TestPlaylists(unittest.TestCase):
self.assertEqual(result['title'], 'Rolex Awards for Enterprise')
self.assertTrue(len(result['entries']) > 72)
+ def test_vine_user(self):
+ dl = FakeYDL()
+ ie = VineUserIE(dl)
+ result = ie.extract('https://vine.co/Visa')
+ self.assertIsPlaylist(result)
+ self.assertTrue(len(result['entries']) >= 50)
+
def test_ustream_channel(self):
dl = FakeYDL()
ie = UstreamChannelIE(dl)
result = ie.extract('http://www.ustream.tv/channel/young-americans-for-liberty')
self.assertIsPlaylist(result)
self.assertEqual(result['id'], '5124905')
- self.assertTrue(len(result['entries']) >= 11)
+ self.assertTrue(len(result['entries']) >= 6)
def test_soundcloud_set(self):
dl = FakeYDL()
@@ -116,6 +135,17 @@ class TestPlaylists(unittest.TestCase):
self.assertEqual(result['id'], '9615865')
self.assertTrue(len(result['entries']) >= 12)
+ def test_soundcloud_playlist(self):
+ dl = FakeYDL()
+ ie = SoundcloudPlaylistIE(dl)
+ result = ie.extract('http://api.soundcloud.com/playlists/4110309')
+ self.assertIsPlaylist(result)
+ self.assertEqual(result['id'], '4110309')
+ self.assertEqual(result['title'], 'TILT Brass - Bowery Poetry Club, August \'03 [Non-Site SCR 02]')
+ assertRegexpMatches(
+ self, result['description'], r'TILT Brass - Bowery Poetry Club')
+ self.assertEqual(len(result['entries']), 6)
+
def test_livestream_event(self):
dl = FakeYDL()
ie = LivestreamIE(dl)
@@ -170,30 +200,30 @@ class TestPlaylists(unittest.TestCase):
def test_AcademicEarthCourse(self):
dl = FakeYDL()
ie = AcademicEarthCourseIE(dl)
- result = ie.extract('http://academicearth.org/courses/building-dynamic-websites/')
+ result = ie.extract('http://academicearth.org/playlists/laws-of-nature/')
self.assertIsPlaylist(result)
- self.assertEqual(result['id'], 'building-dynamic-websites')
- self.assertEqual(result['title'], 'Building Dynamic Websites')
- self.assertEqual(result['description'], u"Today's websites are increasingly dynamic. Pages are no longer static HTML files but instead generated by scripts and database calls. User interfaces are more seamless, with technologies like Ajax replacing traditional page reloads. This course teaches students how to build dynamic websites with Ajax and with Linux, Apache, MySQL, and PHP (LAMP), one of today's most popular frameworks. Students learn how to set up domain names with DNS, how to structure pages with XHTML and CSS, how to program in JavaScript and PHP, how to configure Apache and MySQL, how to design and query databases with SQL, how to use Ajax with both XML and JSON, and how to build mashups. The course explores issues of security, scalability, and cross-browser support and also discusses enterprise-level deployments of websites, including third-party hosting, virtualization, colocation in data centers, firewalling, and load-balancing.")
- self.assertEqual(len(result['entries']), 10)
+ self.assertEqual(result['id'], 'laws-of-nature')
+ self.assertEqual(result['title'], 'Laws of Nature')
+ self.assertEqual(result['description'],u'Introduce yourself to the laws of nature with these free online college lectures from Yale, Harvard, and MIT.')# u"Today's websites are increasingly dynamic. Pages are no longer static HTML files but instead generated by scripts and database calls. User interfaces are more seamless, with technologies like Ajax replacing traditional page reloads. This course teaches students how to build dynamic websites with Ajax and with Linux, Apache, MySQL, and PHP (LAMP), one of today's most popular frameworks. Students learn how to set up domain names with DNS, how to structure pages with XHTML and CSS, how to program in JavaScript and PHP, how to configure Apache and MySQL, how to design and query databases with SQL, how to use Ajax with both XML and JSON, and how to build mashups. The course explores issues of security, scalability, and cross-browser support and also discusses enterprise-level deployments of websites, including third-party hosting, virtualization, colocation in data centers, firewalling, and load-balancing.")
+ self.assertEqual(len(result['entries']), 4)
def test_ivi_compilation(self):
dl = FakeYDL()
ie = IviCompilationIE(dl)
- result = ie.extract('http://www.ivi.ru/watch/dezhurnyi_angel')
+ result = ie.extract('http://www.ivi.ru/watch/dvoe_iz_lartsa')
self.assertIsPlaylist(result)
- self.assertEqual(result['id'], 'dezhurnyi_angel')
- self.assertEqual(result['title'], 'Дежурный ангел (2010 - 2012)')
- self.assertTrue(len(result['entries']) >= 36)
-
+ self.assertEqual(result['id'], 'dvoe_iz_lartsa')
+ self.assertEqual(result['title'], 'Двое из ларца (2006 - 2008)')
+ self.assertTrue(len(result['entries']) >= 24)
+
def test_ivi_compilation_season(self):
dl = FakeYDL()
ie = IviCompilationIE(dl)
- result = ie.extract('http://www.ivi.ru/watch/dezhurnyi_angel/season2')
+ result = ie.extract('http://www.ivi.ru/watch/dvoe_iz_lartsa/season1')
self.assertIsPlaylist(result)
- self.assertEqual(result['id'], 'dezhurnyi_angel/season2')
- self.assertEqual(result['title'], 'Дежурный ангел (2010 - 2012) 2 сезон')
- self.assertTrue(len(result['entries']) >= 20)
+ self.assertEqual(result['id'], 'dvoe_iz_lartsa/season1')
+ self.assertEqual(result['title'], 'Двое из ларца (2006 - 2008) 1 сезон')
+ self.assertTrue(len(result['entries']) >= 12)
def test_imdb_list(self):
dl = FakeYDL()
@@ -248,7 +278,96 @@ class TestPlaylists(unittest.TestCase):
self.assertIsPlaylist(result)
self.assertEqual(result['id'], 'python language')
self.assertEqual(result['title'], 'python language')
- self.assertTrue(len(result['entries']) == 15)
+ self.assertEqual(len(result['entries']), 15)
+
+ def test_generic_rss_feed(self):
+ dl = FakeYDL()
+ ie = GenericIE(dl)
+ result = ie.extract('http://phihag.de/2014/youtube-dl/rss.xml')
+ self.assertIsPlaylist(result)
+ self.assertEqual(result['id'], 'http://phihag.de/2014/youtube-dl/rss.xml')
+ self.assertEqual(result['title'], 'Zero Punctuation')
+ self.assertTrue(len(result['entries']) > 10)
+
+ def test_ted_playlist(self):
+ dl = FakeYDL()
+ ie = TEDIE(dl)
+ result = ie.extract('http://www.ted.com/playlists/who_are_the_hackers')
+ self.assertIsPlaylist(result)
+ self.assertEqual(result['id'], '10')
+ self.assertEqual(result['title'], 'Who are the hackers?')
+ self.assertTrue(len(result['entries']) >= 6)
+
+ def test_toypics_user(self):
+ dl = FakeYDL()
+ ie = ToypicsUserIE(dl)
+ result = ie.extract('http://videos.toypics.net/Mikey')
+ self.assertIsPlaylist(result)
+ self.assertEqual(result['id'], 'Mikey')
+ self.assertTrue(len(result['entries']) >= 17)
+
+ def test_xtube_user(self):
+ dl = FakeYDL()
+ ie = XTubeUserIE(dl)
+ result = ie.extract('http://www.xtube.com/community/profile.php?user=greenshowers')
+ self.assertIsPlaylist(result)
+ self.assertEqual(result['id'], 'greenshowers')
+ self.assertTrue(len(result['entries']) >= 155)
+
+ def test_InstagramUser(self):
+ dl = FakeYDL()
+ ie = InstagramUserIE(dl)
+ result = ie.extract('http://instagram.com/porsche')
+ self.assertIsPlaylist(result)
+ self.assertEqual(result['id'], 'porsche')
+ self.assertTrue(len(result['entries']) >= 2)
+ test_video = next(
+ e for e in result['entries']
+ if e['id'] == '614605558512799803_462752227')
+ dl.add_default_extra_info(test_video, ie, '(irrelevant URL)')
+ dl.process_video_result(test_video, download=False)
+ EXPECTED = {
+ 'id': '614605558512799803_462752227',
+ 'ext': 'mp4',
+ 'title': '#Porsche Intelligent Performance.',
+ 'thumbnail': 're:^https?://.*\.jpg',
+ 'uploader': 'Porsche',
+ 'uploader_id': 'porsche',
+ 'timestamp': 1387486713,
+ 'upload_date': '20131219',
+ }
+ expect_info_dict(self, EXPECTED, test_video)
+
+ def test_CSpan_playlist(self):
+ dl = FakeYDL()
+ ie = CSpanIE(dl)
+ result = ie.extract(
+ 'http://www.c-span.org/video/?318608-1/gm-ignition-switch-recall')
+ self.assertIsPlaylist(result)
+ self.assertEqual(result['id'], '342759')
+ self.assertEqual(
+ result['title'], 'General Motors Ignition Switch Recall')
+ whole_duration = sum(e['duration'] for e in result['entries'])
+ self.assertEqual(whole_duration, 14855)
+
+ def test_aol_playlist(self):
+ dl = FakeYDL()
+ ie = AolIE(dl)
+ result = ie.extract(
+ 'http://on.aol.com/playlist/brace-yourself---todays-weirdest-news-152147?icid=OnHomepageC4_Omg_Img#_videoid=518184316')
+ self.assertIsPlaylist(result)
+ self.assertEqual(result['id'], '152147')
+ self.assertEqual(
+ result['title'], 'Brace Yourself - Today\'s Weirdest News')
+ self.assertTrue(len(result['entries']) >= 10)
+
+ def test_TeacherTubeClassroom(self):
+ dl = FakeYDL()
+ ie = TeacherTubeClassroomIE(dl)
+ result = ie.extract('http://www.teachertube.com/view_classroom.php?user=rbhagwati2')
+ self.assertIsPlaylist(result)
+ self.assertEqual(result['id'], 'rbhagwati2')
+ self.assertTrue(len(result['entries']) >= 20)
if __name__ == '__main__':
unittest.main()
diff --git a/test/test_subtitles.py b/test/test_subtitles.py
index 79991e6..5736fe5 100644
--- a/test/test_subtitles.py
+++ b/test/test_subtitles.py
@@ -181,7 +181,7 @@ class TestTedSubtitles(BaseTestSubtitles):
self.DL.params['writesubtitles'] = True
self.DL.params['allsubtitles'] = True
subtitles = self.getSubtitles()
- self.assertEqual(len(subtitles.keys()), 28)
+ self.assertTrue(len(subtitles.keys()) >= 28)
def test_list_subtitles(self):
self.DL.expect_warning(u'Automatic Captions not supported by this server')
diff --git a/test/test_utils.py b/test/test_utils.py
index 84553b9..51eb0b6 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -9,6 +9,8 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
# Various small unit tests
+import io
+import json
import xml.etree.ElementTree
#from youtube_dl.utils import htmlentity_transform
@@ -21,6 +23,7 @@ from youtube_dl.utils import (
orderedSet,
PagedList,
parse_duration,
+ read_batch_urls,
sanitize_filename,
shell_quote,
smuggle_url,
@@ -31,7 +34,11 @@ from youtube_dl.utils import (
unified_strdate,
unsmuggle_url,
url_basename,
+ urlencode_postdata,
xpath_with_ns,
+ parse_iso8601,
+ strip_jsonp,
+ uppercase_escape,
)
if sys.version_info < (3, 0):
@@ -250,5 +257,32 @@ class TestUtil(unittest.TestCase):
def test_struct_unpack(self):
self.assertEqual(struct_unpack(u'!B', b'\x00'), (0,))
+ def test_read_batch_urls(self):
+ f = io.StringIO(u'''\xef\xbb\xbf foo
+ bar\r
+ baz
+ # More after this line\r
+ ; or after this
+ bam''')
+ self.assertEqual(read_batch_urls(f), [u'foo', u'bar', u'baz', u'bam'])
+
+ def test_urlencode_postdata(self):
+ data = urlencode_postdata({'username': 'foo@bar.com', 'password': '1234'})
+ self.assertTrue(isinstance(data, bytes))
+
+ def test_parse_iso8601(self):
+ self.assertEqual(parse_iso8601('2014-03-23T23:04:26+0100'), 1395612266)
+ self.assertEqual(parse_iso8601('2014-03-23T22:04:26+0000'), 1395612266)
+ self.assertEqual(parse_iso8601('2014-03-23T22:04:26Z'), 1395612266)
+
+ def test_strip_jsonp(self):
+ stripped = strip_jsonp('cb ([ {"id":"532cb",\n\n\n"x":\n3}\n]\n);')
+ d = json.loads(stripped)
+ self.assertEqual(d, [{"id": "532cb", "x": 3}])
+
+ def test_uppercase_escpae(self):
+ self.assertEqual(uppercase_escape(u'aä'), u'aä')
+ self.assertEqual(uppercase_escape(u'\\U0001d550'), u'𝕐')
+
if __name__ == '__main__':
unittest.main()
diff --git a/test/test_youtube_lists.py b/test/test_youtube_lists.py
index 38ac989..3aadedd 100644
--- a/test/test_youtube_lists.py
+++ b/test/test_youtube_lists.py
@@ -16,6 +16,7 @@ from youtube_dl.extractor import (
YoutubeChannelIE,
YoutubeShowIE,
YoutubeTopListIE,
+ YoutubeSearchURLIE,
)
@@ -111,13 +112,15 @@ class TestYoutubeLists(unittest.TestCase):
def test_youtube_mix(self):
dl = FakeYDL()
ie = YoutubePlaylistIE(dl)
- result = ie.extract('http://www.youtube.com/watch?v=lLJf9qJHR3E&list=RDrjFaenf1T-Y')
+ result = ie.extract('https://www.youtube.com/watch?v=W01L70IGBgE&index=2&list=RDOQpdSVF_k_w')
entries = result['entries']
self.assertTrue(len(entries) >= 20)
original_video = entries[0]
- self.assertEqual(original_video['id'], 'rjFaenf1T-Y')
+ self.assertEqual(original_video['id'], 'OQpdSVF_k_w')
def test_youtube_toptracks(self):
+ print('Skipping: The playlist page gives error 500')
+ return
dl = FakeYDL()
ie = YoutubePlaylistIE(dl)
result = ie.extract('https://www.youtube.com/playlist?list=MCUS')
@@ -131,5 +134,14 @@ class TestYoutubeLists(unittest.TestCase):
entries = result['entries']
self.assertTrue(len(entries) >= 5)
+ def test_youtube_search_url(self):
+ dl = FakeYDL()
+ ie = YoutubeSearchURLIE(dl)
+ result = ie.extract('https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video')
+ entries = result['entries']
+ self.assertIsPlaylist(result)
+ self.assertEqual(result['title'], 'youtube-dl test video')
+ self.assertTrue(len(entries) >= 5)
+
if __name__ == '__main__':
unittest.main()
diff --git a/youtube-dl b/youtube-dl
index 063e40d..b98d36a 100755
--- a/youtube-dl
+++ b/youtube-dl
Binary files differ
diff --git a/youtube-dl.1 b/youtube-dl.1
index 7abbe59..f17addd 100644
--- a/youtube-dl.1
+++ b/youtube-dl.1
@@ -24,7 +24,7 @@ redistribute it or use it however you like.
\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ sure\ that\ you\ have\ sufficient\ permissions
\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ (run\ with\ sudo\ if\ needed)
\-i,\ \-\-ignore\-errors\ \ \ \ \ \ \ \ \ \ \ \ \ \ continue\ on\ download\ errors,\ for\ example\ to
-\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ to\ skip\ unavailable\ videos\ in\ a\ playlist
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ skip\ unavailable\ videos\ in\ a\ playlist
\-\-abort\-on\-error\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ Abort\ downloading\ of\ further\ videos\ (in\ the
\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ playlist\ or\ the\ command\ line)\ if\ an\ error
\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ occurs
@@ -32,6 +32,9 @@ redistribute it or use it however you like.
\-\-user\-agent\ UA\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ specify\ a\ custom\ user\ agent
\-\-referer\ REF\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ specify\ a\ custom\ referer,\ use\ if\ the\ video
\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ access\ is\ restricted\ to\ one\ domain
+\-\-add\-header\ FIELD:VALUE\ \ \ \ \ \ \ \ \ specify\ a\ custom\ HTTP\ header\ and\ its\ value,
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ separated\ by\ a\ colon\ \[aq]:\[aq].\ You\ can\ use\ this
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ option\ multiple\ times
\-\-list\-extractors\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ List\ all\ supported\ extractors\ and\ the\ URLs
\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ they\ would\ handle
\-\-extractor\-descriptions\ \ \ \ \ \ \ \ \ Output\ descriptions\ of\ all\ supported
@@ -40,6 +43,9 @@ redistribute it or use it however you like.
\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ an\ empty\ string\ (\-\-proxy\ "")\ for\ direct
\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ connection
\-\-no\-check\-certificate\ \ \ \ \ \ \ \ \ \ \ Suppress\ HTTPS\ certificate\ validation.
+\-\-prefer\-insecure\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ Use\ an\ unencrypted\ connection\ to\ retrieve
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ information\ about\ the\ video.\ (Currently
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ supported\ only\ for\ YouTube)
\-\-cache\-dir\ DIR\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ Location\ in\ the\ filesystem\ where\ youtube\-dl
\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ can\ store\ some\ downloaded\ information
\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ permanently.\ By\ default\ $XDG_CACHE_HOME
@@ -63,6 +69,7 @@ redistribute it or use it however you like.
\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ configuration\ in\ ~/.config/youtube\-dl.conf
\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ (%APPDATA%/youtube\-dl/config.txt\ on
\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ Windows)
+\-\-encoding\ ENCODING\ \ \ \ \ \ \ \ \ \ \ \ \ \ Force\ the\ specified\ encoding\ (experimental)
\f[]
.fi
.SS Video Selection:
@@ -140,8 +147,12 @@ redistribute it or use it however you like.
\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ video\ id,\ %(playlist)s\ for\ the\ playlist\ the
\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ video\ is\ in,\ %(playlist_index)s\ for\ the
\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ position\ in\ the\ playlist\ and\ %%\ for\ a
-\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ literal\ percent.\ Use\ \-\ to\ output\ to\ stdout.
-\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ Can\ also\ be\ used\ to\ download\ to\ a\ different
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ literal\ percent.\ %(height)s\ and\ %(width)s
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ for\ the\ width\ and\ height\ of\ the\ video
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ format.\ %(resolution)s\ for\ a\ textual
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ description\ of\ the\ resolution\ of\ the\ video
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ format.\ Use\ \-\ to\ output\ to\ stdout.\ Can\ also
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ be\ used\ to\ download\ to\ a\ different
\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ directory,\ for\ example\ with\ \-o\ \[aq]/my/downloa
\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ ds/%(uploader)s/%(title)s\-%(id)s.%(ext)s\[aq]\ .
\-\-autonumber\-size\ NUMBER\ \ \ \ \ \ \ \ \ Specifies\ the\ number\ of\ digits\ in
@@ -179,6 +190,7 @@ redistribute it or use it however you like.
.nf
\f[C]
\-q,\ \-\-quiet\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ activates\ quiet\ mode
+\-\-no\-warnings\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ Ignore\ warnings
\-s,\ \-\-simulate\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ do\ not\ download\ the\ video\ and\ do\ not\ write
\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ anything\ to\ disk
\-\-skip\-download\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ do\ not\ download\ the\ video
@@ -190,7 +202,9 @@ redistribute it or use it however you like.
\-\-get\-duration\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ simulate,\ quiet\ but\ print\ video\ length
\-\-get\-filename\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ simulate,\ quiet\ but\ print\ output\ filename
\-\-get\-format\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ simulate,\ quiet\ but\ print\ output\ format
-\-j,\ \-\-dump\-json\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ simulate,\ quiet\ but\ print\ JSON\ information
+\-j,\ \-\-dump\-json\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ simulate,\ quiet\ but\ print\ JSON\ information.
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ See\ \-\-output\ for\ a\ description\ of\ available
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ keys.
\-\-newline\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ output\ progress\ bar\ as\ new\ lines
\-\-no\-progress\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ do\ not\ print\ progress\ bar
\-\-console\-title\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ display\ progress\ in\ console\ titlebar
@@ -211,9 +225,9 @@ redistribute it or use it however you like.
\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ preference\ using\ slashes:\ "\-f\ 22/17/18".
\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ "\-f\ mp4"\ and\ "\-f\ flv"\ are\ also\ supported.
\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ You\ can\ also\ use\ the\ special\ names\ "best",
-\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ "bestaudio",\ "worst",\ and\ "worstaudio".\ By
-\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ default,\ youtube\-dl\ will\ pick\ the\ best
-\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ quality.
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ "bestvideo",\ "bestaudio",\ "worst",
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ "worstvideo"\ and\ "worstaudio".\ By\ default,
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ youtube\-dl\ will\ pick\ the\ best\ quality.
\-\-all\-formats\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ download\ all\ available\ video\ formats
\-\-prefer\-free\-formats\ \ \ \ \ \ \ \ \ \ \ \ prefer\ free\ video\ formats\ unless\ a\ specific
\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ one\ is\ requested
@@ -272,6 +286,7 @@ redistribute it or use it however you like.
\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ default
\-\-embed\-subs\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ embed\ subtitles\ in\ the\ video\ (only\ for\ mp4
\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ videos)
+\-\-embed\-thumbnail\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ embed\ thumbnail\ in\ the\ audio\ as\ cover\ art
\-\-add\-metadata\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ write\ metadata\ to\ the\ video\ file
\-\-xattrs\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ write\ metadata\ to\ the\ video\ file\[aq]s\ xattrs
\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ (using\ dublin\ core\ and\ xdg\ standards)
@@ -286,7 +301,7 @@ redistribute it or use it however you like.
You can configure youtube\-dl by placing default arguments (such as
\f[C]\-\-extract\-audio\ \-\-no\-mtime\f[] to always extract the audio
and not copy the mtime) into \f[C]/etc/youtube\-dl.conf\f[] and/or
-\f[C]~/.config/youtube\-dl.conf\f[].
+\f[C]~/.config/youtube\-dl/config\f[].
On Windows, the configuration file locations are
\f[C]%APPDATA%\\youtube\-dl\\config.txt\f[] and
\f[C]C:\\Users\\<Yourname>\\youtube\-dl.conf\f[].
@@ -359,12 +374,19 @@ Relative dates: Dates in the format
\f[C](now|today)[+\-][0\-9](day|week|month|year)(s)?\f[]
.PP
Examples:
-.PP
-$ # Download only the videos uploaded in the last 6 months $ youtube\-dl
-\-\-dateafter now\-6months $ # Download only the videos uploaded on
-January 1, 1970 $ youtube\-dl \-\-date 19700101 $ # will only download
-the videos uploaded in the 200x decade $ youtube\-dl \-\-dateafter
-20000101 \-\-datebefore 20091231
+.IP
+.nf
+\f[C]
+#\ Download\ only\ the\ videos\ uploaded\ in\ the\ last\ 6\ months
+$\ youtube\-dl\ \-\-dateafter\ now\-6months
+
+#\ Download\ only\ the\ videos\ uploaded\ on\ January\ 1,\ 1970
+$\ youtube\-dl\ \-\-date\ 19700101
+
+$\ #\ will\ only\ download\ the\ videos\ uploaded\ in\ the\ 200x\ decade
+$\ youtube\-dl\ \-\-dateafter\ 20000101\ \-\-datebefore\ 20091231
+\f[]
+.fi
.SH FAQ
.SS Can you please put the \-b option back?
.PP
@@ -473,19 +495,108 @@ zip
nosetests
.SS Adding support for a new site
.PP
-If you want to add support for a new site, copy \f[I]any\f[] recently
-modified (https://github.com/rg3/youtube-dl/commits/master/youtube_dl/extractor)
-file in \f[C]youtube_dl/extractor\f[], add an import in
+If you want to add support for a new site, you can follow this quick
+list (assuming your service is called \f[C]yourextractor\f[]):
+.IP " 1." 4
+Fork this repository (https://github.com/rg3/youtube-dl/fork)
+.IP " 2." 4
+Check out the source code with
+\f[C]git\ clone\ git\@github.com:YOUR_GITHUB_USERNAME/youtube\-dl.git\f[]
+.IP " 3." 4
+Start a new git branch with
+\f[C]cd\ youtube\-dl;\ git\ checkout\ \-b\ yourextractor\f[]
+.IP " 4." 4
+Start with this simple template and save it to
+\f[C]youtube_dl/extractor/yourextractor.py\f[]:
+.RS 4
+.IP
+.nf
+\f[C]
+#\ coding:\ utf\-8
+from\ __future__\ import\ unicode_literals
+
+import\ re
+
+from\ .common\ import\ InfoExtractor
+
+
+class\ YourExtractorIE(InfoExtractor):
+\ \ \ \ _VALID_URL\ =\ r\[aq]https?://(?:www\\.)?yourextractor\\.com/watch/(?P<id>[0\-9]+)\[aq]
+\ \ \ \ _TEST\ =\ {
+\ \ \ \ \ \ \ \ \[aq]url\[aq]:\ \[aq]http://yourextractor.com/watch/42\[aq],
+\ \ \ \ \ \ \ \ \[aq]md5\[aq]:\ \[aq]TODO:\ md5\ sum\ of\ the\ first\ 10KiB\ of\ the\ video\ file\[aq],
+\ \ \ \ \ \ \ \ \[aq]info_dict\[aq]:\ {
+\ \ \ \ \ \ \ \ \ \ \ \ \[aq]id\[aq]:\ \[aq]42\[aq],
+\ \ \ \ \ \ \ \ \ \ \ \ \[aq]ext\[aq]:\ \[aq]mp4\[aq],
+\ \ \ \ \ \ \ \ \ \ \ \ \[aq]title\[aq]:\ \[aq]Video\ title\ goes\ here\[aq],
+\ \ \ \ \ \ \ \ \ \ \ \ #\ TODO\ more\ properties,\ either\ as:
+\ \ \ \ \ \ \ \ \ \ \ \ #\ *\ A\ value
+\ \ \ \ \ \ \ \ \ \ \ \ #\ *\ MD5\ checksum;\ start\ the\ string\ with\ md5:
+\ \ \ \ \ \ \ \ \ \ \ \ #\ *\ A\ regular\ expression;\ start\ the\ string\ with\ re:
+\ \ \ \ \ \ \ \ \ \ \ \ #\ *\ Any\ Python\ type\ (for\ example\ int\ or\ float)
+\ \ \ \ \ \ \ \ }
+\ \ \ \ }
+
+\ \ \ \ def\ _real_extract(self,\ url):
+\ \ \ \ \ \ \ \ mobj\ =\ re.match(self._VALID_URL,\ url)
+\ \ \ \ \ \ \ \ video_id\ =\ mobj.group(\[aq]id\[aq])
+
+\ \ \ \ \ \ \ \ #\ TODO\ more\ code\ goes\ here,\ for\ example\ ...
+\ \ \ \ \ \ \ \ webpage\ =\ self._download_webpage(url,\ video_id)
+\ \ \ \ \ \ \ \ title\ =\ self._html_search_regex(r\[aq]<h1>(.*?)</h1>\[aq],\ webpage,\ \[aq]title\[aq])
+
+\ \ \ \ \ \ \ \ return\ {
+\ \ \ \ \ \ \ \ \ \ \ \ \[aq]id\[aq]:\ video_id,
+\ \ \ \ \ \ \ \ \ \ \ \ \[aq]title\[aq]:\ title,
+\ \ \ \ \ \ \ \ \ \ \ \ #\ TODO\ more\ properties\ (see\ youtube_dl/extractor/common.py)
+\ \ \ \ \ \ \ \ }
+\f[]
+.fi
+.RE
+.IP " 5." 4
+Add an import in
\f[C]youtube_dl/extractor/__init__.py\f[] (https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/__init__.py).
+.IP " 6." 4
+Run
+\f[C]python\ test/test_download.py\ TestDownload.test_YourExtractor\f[].
+This \f[I]should fail\f[] at first, but you can continually re\-run it
+until you\[aq]re done.
+.IP " 7." 4
Have a look at
\f[C]youtube_dl/common/extractor/common.py\f[] (https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py)
for possible helper methods and a detailed description of what your
extractor should
return (https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L38).
-Don\[aq]t forget to run the tests with
-\f[C]python\ test/test_download.py\ Test_Download.test_YourExtractor\f[]!
-For a detailed tutorial, refer to this blog
-post (http://filippo.io/add-support-for-a-new-video-site-to-youtube-dl/).
+Add tests and code for as many as you want.
+.IP " 8." 4
+If you can, check the code with
+pyflakes (https://pypi.python.org/pypi/pyflakes) (a good idea) and
+pep8 (https://pypi.python.org/pypi/pep8) (optional, ignore E501).
+.IP " 9." 4
+When the tests pass,
+add (https://www.kernel.org/pub/software/scm/git/docs/git-add.html) the
+new files and
+commit (https://www.kernel.org/pub/software/scm/git/docs/git-commit.html)
+them and
+push (https://www.kernel.org/pub/software/scm/git/docs/git-push.html)
+the result, like this:
+.RS 4
+.IP
+.nf
+\f[C]
+$\ git\ add\ youtube_dl/extractor/__init__.py
+$\ git\ add\ youtube_dl/extractor/yourextractor.py
+$\ git\ commit\ \-m\ \[aq][yourextractor]\ Add\ new\ extractor\[aq]
+$\ git\ push\ origin\ yourextractor
+\f[]
+.fi
+.RE
+.IP "10." 4
+Finally, create a pull
+request (https://help.github.com/articles/creating-a-pull-request).
+We\[aq]ll then review and merge it.
+.PP
+In any case, thank you very much for your contributions!
.SH BUGS
.PP
Bugs and suggestions should be reported at:
@@ -537,7 +648,7 @@ For bug reports, this means that your report should contain the
The error message you get for (most) bugs even says so, but you would
not believe how many of our bug reports do not contain this information.
.PP
-Site support requests must contain an example URL.
+Site support requests \f[B]must contain an example URL\f[].
An example URL is a URL you might want to download, like
http://www.youtube.com/watch?v=BaW_jenozKc .
There should be an obvious video present.
diff --git a/youtube-dl.bash-completion b/youtube-dl.bash-completion
index a5398bb..498e841 100644
--- a/youtube-dl.bash-completion
+++ b/youtube-dl.bash-completion
@@ -4,7 +4,7 @@ __youtube_dl()
COMPREPLY=()
cur="${COMP_WORDS[COMP_CWORD]}"
prev="${COMP_WORDS[COMP_CWORD-1]}"
- opts="--help --version --update --ignore-errors --abort-on-error --dump-user-agent --user-agent --referer --list-extractors --extractor-descriptions --proxy --no-check-certificate --cache-dir --no-cache-dir --socket-timeout --bidi-workaround --default-search --ignore-config --playlist-start --playlist-end --match-title --reject-title --max-downloads --min-filesize --max-filesize --date --datebefore --dateafter --min-views --max-views --no-playlist --age-limit --download-archive --include-ads --youtube-include-dash-manifest --rate-limit --retries --buffer-size --no-resize-buffer --test --title --id --literal --auto-number --output --autonumber-size --restrict-filenames --batch-file --load-info --no-overwrites --continue --no-continue --cookies --no-part --no-mtime --write-description --write-info-json --write-annotations --write-thumbnail --quiet --simulate --skip-download --get-url --get-title --get-id --get-thumbnail --get-description --get-duration --get-filename --get-format --dump-json --newline --no-progress --console-title --verbose --dump-intermediate-pages --write-pages --youtube-print-sig-code --print-traffic --format --all-formats --prefer-free-formats --max-quality --list-formats --write-sub --write-auto-sub --all-subs --list-subs --sub-format --sub-lang --username --password --netrc --video-password --extract-audio --audio-format --audio-quality --recode-video --keep-video --no-post-overwrites --embed-subs --add-metadata --xattrs --prefer-avconv --prefer-ffmpeg"
+ opts="--help --version --update --ignore-errors --abort-on-error --dump-user-agent --user-agent --referer --add-header --list-extractors --extractor-descriptions --proxy --no-check-certificate --prefer-insecure --cache-dir --no-cache-dir --socket-timeout --bidi-workaround --default-search --ignore-config --encoding --playlist-start --playlist-end --match-title --reject-title --max-downloads --min-filesize --max-filesize --date --datebefore --dateafter --min-views --max-views --no-playlist --age-limit --download-archive --include-ads --youtube-include-dash-manifest --rate-limit --retries --buffer-size --no-resize-buffer --test --title --id --literal --auto-number --output --autonumber-size --restrict-filenames --batch-file --load-info --no-overwrites --continue --no-continue --cookies --no-part --no-mtime --write-description --write-info-json --write-annotations --write-thumbnail --quiet --no-warnings --simulate --skip-download --get-url --get-title --get-id --get-thumbnail --get-description --get-duration --get-filename --get-format --dump-json --newline --no-progress --console-title --verbose --dump-intermediate-pages --write-pages --youtube-print-sig-code --print-traffic --format --all-formats --prefer-free-formats --max-quality --list-formats --write-sub --write-auto-sub --all-subs --list-subs --sub-format --sub-lang --username --password --netrc --video-password --extract-audio --audio-format --audio-quality --recode-video --keep-video --no-post-overwrites --embed-subs --embed-thumbnail --add-metadata --xattrs --prefer-avconv --prefer-ffmpeg"
keywords=":ytfavorites :ytrecommended :ytsubscriptions :ytwatchlater :ythistory"
fileopts="-a|--batch-file|--download-archive|--cookies|--load-info"
diropts="--cache-dir"
diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py
deleted file mode 100755
index 672ef9e..0000000
--- a/youtube_dl/InfoExtractors.py
+++ /dev/null
@@ -1,4 +0,0 @@
-# Legacy file for backwards compatibility, use youtube_dl.extractor instead!
-
-from .extractor.common import InfoExtractor, SearchInfoExtractor
-from .extractor import gen_extractors, get_info_extractor
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py
index 42cbcf6..dc0ba98 100644..100755
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@@ -4,9 +4,11 @@
from __future__ import absolute_import, unicode_literals
import collections
+import datetime
import errno
import io
import json
+import locale
import os
import platform
import re
@@ -29,6 +31,7 @@ from .utils import (
ContentTooShortError,
date_from_str,
DateRange,
+ DEFAULT_OUTTMPL,
determine_ext,
DownloadError,
encodeFilename,
@@ -93,6 +96,7 @@ class YoutubeDL(object):
usenetrc: Use netrc for authentication instead.
verbose: Print additional info to stdout.
quiet: Do not print messages to stdout.
+ no_warnings: Do not print out anything for warnings.
forceurl: Force printing final URL.
forcetitle: Force printing title.
forceid: Force printing ID.
@@ -147,6 +151,8 @@ class YoutubeDL(object):
again.
cookiefile: File name where cookies should be read from and dumped to.
nocheckcertificate:Do not verify SSL certificates
+ prefer_insecure: Use HTTP instead of HTTPS to retrieve information.
+ At the moment, this is only supported by YouTube.
proxy: URL of the proxy server to use
socket_timeout: Time to wait for unresponsive hosts, in seconds
bidi_workaround: Work around buggy terminals without bidirectional text
@@ -155,6 +161,7 @@ class YoutubeDL(object):
include_ads: Download ads as well
default_search: Prepend this string if an input url is not valid.
'auto' for elaborate guessing
+ encoding: Use this encoding instead of the system-specified.
The following parameters are not used by YoutubeDL itself, they are used by
the FileDownloader:
@@ -280,6 +287,9 @@ class YoutubeDL(object):
"""Print message to stdout if not in quiet mode."""
return self.to_stdout(message, skip_eol, check_quiet=True)
+ def _write_string(self, s, out=None):
+ write_string(s, out=out, encoding=self.params.get('encoding'))
+
def to_stdout(self, message, skip_eol=False, check_quiet=False):
"""Print message to stdout if not in quiet mode."""
if self.params.get('logger'):
@@ -289,7 +299,7 @@ class YoutubeDL(object):
terminator = ['\n', ''][skip_eol]
output = message + terminator
- write_string(output, self._screen_file)
+ self._write_string(output, self._screen_file)
def to_stderr(self, message):
"""Print message to stderr."""
@@ -299,7 +309,7 @@ class YoutubeDL(object):
else:
message = self._bidi_workaround(message)
output = message + '\n'
- write_string(output, self._err_file)
+ self._write_string(output, self._err_file)
def to_console_title(self, message):
if not self.params.get('consoletitle', False):
@@ -309,21 +319,21 @@ class YoutubeDL(object):
# already of type unicode()
ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
elif 'TERM' in os.environ:
- write_string('\033]0;%s\007' % message, self._screen_file)
+ self._write_string('\033]0;%s\007' % message, self._screen_file)
def save_console_title(self):
if not self.params.get('consoletitle', False):
return
if 'TERM' in os.environ:
# Save the title on stack
- write_string('\033[22;0t', self._screen_file)
+ self._write_string('\033[22;0t', self._screen_file)
def restore_console_title(self):
if not self.params.get('consoletitle', False):
return
if 'TERM' in os.environ:
# Restore the title from stack
- write_string('\033[23;0t', self._screen_file)
+ self._write_string('\033[23;0t', self._screen_file)
def __enter__(self):
self.save_console_title()
@@ -370,12 +380,17 @@ class YoutubeDL(object):
Print the message to stderr, it will be prefixed with 'WARNING:'
If stderr is a tty file the 'WARNING:' will be colored
'''
- if self._err_file.isatty() and os.name != 'nt':
- _msg_header = '\033[0;33mWARNING:\033[0m'
+ if self.params.get('logger') is not None:
+ self.params['logger'].warning(message)
else:
- _msg_header = 'WARNING:'
- warning_message = '%s %s' % (_msg_header, message)
- self.to_stderr(warning_message)
+ if self.params.get('no_warnings'):
+ return
+ if self._err_file.isatty() and os.name != 'nt':
+ _msg_header = '\033[0;33mWARNING:\033[0m'
+ else:
+ _msg_header = 'WARNING:'
+ warning_message = '%s %s' % (_msg_header, message)
+ self.to_stderr(warning_message)
def report_error(self, message, tb=None):
'''
@@ -409,6 +424,13 @@ class YoutubeDL(object):
template_dict['autonumber'] = autonumber_templ % self._num_downloads
if template_dict.get('playlist_index') is not None:
template_dict['playlist_index'] = '%05d' % template_dict['playlist_index']
+ if template_dict.get('resolution') is None:
+ if template_dict.get('width') and template_dict.get('height'):
+ template_dict['resolution'] = '%dx%d' % (template_dict['width'], template_dict['height'])
+ elif template_dict.get('height'):
+ template_dict['resolution'] = '%sp' % template_dict['height']
+ elif template_dict.get('width'):
+ template_dict['resolution'] = '?x%d' % template_dict['width']
sanitize = lambda k, v: sanitize_filename(
compat_str(v),
@@ -419,7 +441,8 @@ class YoutubeDL(object):
if v is not None)
template_dict = collections.defaultdict(lambda: 'NA', template_dict)
- tmpl = os.path.expanduser(self.params['outtmpl'])
+ outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL)
+ tmpl = os.path.expanduser(outtmpl)
filename = tmpl % template_dict
return filename
except ValueError as err:
@@ -499,13 +522,7 @@ class YoutubeDL(object):
'_type': 'compat_list',
'entries': ie_result,
}
- self.add_extra_info(ie_result,
- {
- 'extractor': ie.IE_NAME,
- 'webpage_url': url,
- 'webpage_url_basename': url_basename(url),
- 'extractor_key': ie.ie_key(),
- })
+ self.add_default_extra_info(ie_result, ie, url)
if process:
return self.process_ie_result(ie_result, download, extra_info)
else:
@@ -522,7 +539,15 @@ class YoutubeDL(object):
else:
raise
else:
- self.report_error('no suitable InfoExtractor: %s' % url)
+ self.report_error('no suitable InfoExtractor for URL %s' % url)
+
+ def add_default_extra_info(self, ie_result, ie, url):
+ self.add_extra_info(ie_result, {
+ 'extractor': ie.IE_NAME,
+ 'webpage_url': url,
+ 'webpage_url_basename': url_basename(url),
+ 'extractor_key': ie.ie_key(),
+ })
def process_ie_result(self, ie_result, download=True, extra_info={}):
"""
@@ -656,6 +681,18 @@ class YoutubeDL(object):
if f.get('vcodec') == 'none']
if audio_formats:
return audio_formats[0]
+ elif format_spec == 'bestvideo':
+ video_formats = [
+ f for f in available_formats
+ if f.get('acodec') == 'none']
+ if video_formats:
+ return video_formats[-1]
+ elif format_spec == 'worstvideo':
+ video_formats = [
+ f for f in available_formats
+ if f.get('acodec') == 'none']
+ if video_formats:
+ return video_formats[0]
else:
extensions = ['mp4', 'flv', 'webm', '3gp']
if format_spec in extensions:
@@ -670,11 +707,35 @@ class YoutubeDL(object):
def process_video_result(self, info_dict, download=True):
assert info_dict.get('_type', 'video') == 'video'
+ if 'id' not in info_dict:
+ raise ExtractorError('Missing "id" field in extractor result')
+ if 'title' not in info_dict:
+ raise ExtractorError('Missing "title" field in extractor result')
+
if 'playlist' not in info_dict:
# It isn't part of a playlist
info_dict['playlist'] = None
info_dict['playlist_index'] = None
+ thumbnails = info_dict.get('thumbnails')
+ if thumbnails:
+ thumbnails.sort(key=lambda t: (
+ t.get('width'), t.get('height'), t.get('url')))
+ for t in thumbnails:
+ if 'width' in t and 'height' in t:
+ t['resolution'] = '%dx%d' % (t['width'], t['height'])
+
+ if thumbnails and 'thumbnail' not in info_dict:
+ info_dict['thumbnail'] = thumbnails[-1]['url']
+
+ if 'display_id' not in info_dict and 'id' in info_dict:
+ info_dict['display_id'] = info_dict['id']
+
+ if info_dict.get('upload_date') is None and info_dict.get('timestamp') is not None:
+ upload_date = datetime.datetime.utcfromtimestamp(
+ info_dict['timestamp'])
+ info_dict['upload_date'] = upload_date.strftime('%Y%m%d')
+
# This extractors handle format selection themselves
if info_dict['extractor'] in ['Youku']:
if download:
@@ -688,8 +749,14 @@ class YoutubeDL(object):
else:
formats = info_dict['formats']
+ if not formats:
+ raise ExtractorError('No video formats found!')
+
# We check that all the formats have the format and format_id fields
- for (i, format) in enumerate(formats):
+ for i, format in enumerate(formats):
+ if 'url' not in format:
+ raise ExtractorError('Missing "url" key in result (index %d)' % i)
+
if format.get('format_id') is None:
format['format_id'] = compat_str(i)
if format.get('format') is None:
@@ -700,7 +767,7 @@ class YoutubeDL(object):
)
# Automatically determine file extension if missing
if 'ext' not in format:
- format['ext'] = determine_ext(format['url'])
+ format['ext'] = determine_ext(format['url']).lower()
format_limit = self.params.get('format_limit', None)
if format_limit:
@@ -825,7 +892,7 @@ class YoutubeDL(object):
try:
dn = os.path.dirname(encodeFilename(filename))
- if dn != '' and not os.path.exists(dn):
+ if dn and not os.path.exists(dn):
os.makedirs(dn)
except (OSError, IOError) as err:
self.report_error('unable to create directory ' + compat_str(err))
@@ -882,7 +949,7 @@ class YoutubeDL(object):
with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile:
subfile.write(sub)
except (OSError, IOError):
- self.report_error('Cannot write subtitles file ' + descfn)
+ self.report_error('Cannot write subtitles file ' + sub_filename)
return
if self.params.get('writeinfojson', False):
@@ -908,7 +975,7 @@ class YoutubeDL(object):
self.to_screen('[%s] %s: Downloading thumbnail ...' %
(info_dict['extractor'], info_dict['id']))
try:
- uf = compat_urllib_request.urlopen(info_dict['thumbnail'])
+ uf = self.urlopen(info_dict['thumbnail'])
with open(thumb_filename, 'wb') as thumbf:
shutil.copyfileobj(uf, thumbf)
self.to_screen('[%s] %s: Writing thumbnail to: %s' %
@@ -971,10 +1038,11 @@ class YoutubeDL(object):
def download(self, url_list):
"""Download a given list of URLs."""
+ outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL)
if (len(url_list) > 1 and
- '%' not in self.params['outtmpl']
+ '%' not in outtmpl
and self.params.get('max_downloads') != 1):
- raise SameFileError(self.params['outtmpl'])
+ raise SameFileError(outtmpl)
for url in url_list:
try:
@@ -1085,57 +1153,57 @@ class YoutubeDL(object):
res = default
return res
- def list_formats(self, info_dict):
- def format_note(fdict):
- res = ''
- if fdict.get('ext') in ['f4f', 'f4m']:
- res += '(unsupported) '
- if fdict.get('format_note') is not None:
- res += fdict['format_note'] + ' '
- if fdict.get('tbr') is not None:
- res += '%4dk ' % fdict['tbr']
- if fdict.get('container') is not None:
- if res:
- res += ', '
- res += '%s container' % fdict['container']
- if (fdict.get('vcodec') is not None and
- fdict.get('vcodec') != 'none'):
- if res:
- res += ', '
- res += fdict['vcodec']
- if fdict.get('vbr') is not None:
- res += '@'
- elif fdict.get('vbr') is not None and fdict.get('abr') is not None:
- res += 'video@'
+ def _format_note(self, fdict):
+ res = ''
+ if fdict.get('ext') in ['f4f', 'f4m']:
+ res += '(unsupported) '
+ if fdict.get('format_note') is not None:
+ res += fdict['format_note'] + ' '
+ if fdict.get('tbr') is not None:
+ res += '%4dk ' % fdict['tbr']
+ if fdict.get('container') is not None:
+ if res:
+ res += ', '
+ res += '%s container' % fdict['container']
+ if (fdict.get('vcodec') is not None and
+ fdict.get('vcodec') != 'none'):
+ if res:
+ res += ', '
+ res += fdict['vcodec']
if fdict.get('vbr') is not None:
- res += '%4dk' % fdict['vbr']
- if fdict.get('acodec') is not None:
- if res:
- res += ', '
- if fdict['acodec'] == 'none':
- res += 'video only'
- else:
- res += '%-5s' % fdict['acodec']
- elif fdict.get('abr') is not None:
- if res:
- res += ', '
- res += 'audio'
- if fdict.get('abr') is not None:
- res += '@%3dk' % fdict['abr']
- if fdict.get('asr') is not None:
- res += ' (%5dHz)' % fdict['asr']
- if fdict.get('filesize') is not None:
- if res:
- res += ', '
- res += format_bytes(fdict['filesize'])
- return res
+ res += '@'
+ elif fdict.get('vbr') is not None and fdict.get('abr') is not None:
+ res += 'video@'
+ if fdict.get('vbr') is not None:
+ res += '%4dk' % fdict['vbr']
+ if fdict.get('acodec') is not None:
+ if res:
+ res += ', '
+ if fdict['acodec'] == 'none':
+ res += 'video only'
+ else:
+ res += '%-5s' % fdict['acodec']
+ elif fdict.get('abr') is not None:
+ if res:
+ res += ', '
+ res += 'audio'
+ if fdict.get('abr') is not None:
+ res += '@%3dk' % fdict['abr']
+ if fdict.get('asr') is not None:
+ res += ' (%5dHz)' % fdict['asr']
+ if fdict.get('filesize') is not None:
+ if res:
+ res += ', '
+ res += format_bytes(fdict['filesize'])
+ return res
+ def list_formats(self, info_dict):
def line(format, idlen=20):
return (('%-' + compat_str(idlen + 1) + 's%-10s%-12s%s') % (
format['format_id'],
format['ext'],
self.format_resolution(format),
- format_note(format),
+ self._format_note(format),
))
formats = info_dict.get('formats', [info_dict])
@@ -1143,8 +1211,8 @@ class YoutubeDL(object):
max(len(f['format_id']) for f in formats))
formats_s = [line(f, idlen) for f in formats]
if len(formats) > 1:
- formats_s[0] += (' ' if format_note(formats[0]) else '') + '(worst)'
- formats_s[-1] += (' ' if format_note(formats[-1]) else '') + '(best)'
+ formats_s[0] += (' ' if self._format_note(formats[0]) else '') + '(worst)'
+ formats_s[-1] += (' ' if self._format_note(formats[-1]) else '') + '(best)'
header_line = line({
'format_id': 'format code', 'ext': 'extension',
@@ -1154,12 +1222,22 @@ class YoutubeDL(object):
def urlopen(self, req):
""" Start an HTTP download """
- return self._opener.open(req)
+ return self._opener.open(req, timeout=self._socket_timeout)
def print_debug_header(self):
if not self.params.get('verbose'):
return
- write_string('[debug] youtube-dl version ' + __version__ + '\n')
+
+ write_string(
+ '[debug] Encodings: locale %s, fs %s, out %s, pref %s\n' % (
+ locale.getpreferredencoding(),
+ sys.getfilesystemencoding(),
+ sys.stdout.encoding,
+ self.get_encoding()),
+ encoding=None
+ )
+
+ self._write_string('[debug] youtube-dl version ' + __version__ + '\n')
try:
sp = subprocess.Popen(
['git', 'rev-parse', '--short', 'HEAD'],
@@ -1168,24 +1246,24 @@ class YoutubeDL(object):
out, err = sp.communicate()
out = out.decode().strip()
if re.match('[0-9a-f]+', out):
- write_string('[debug] Git HEAD: ' + out + '\n')
+ self._write_string('[debug] Git HEAD: ' + out + '\n')
except:
try:
sys.exc_clear()
except:
pass
- write_string('[debug] Python version %s - %s' %
+ self._write_string('[debug] Python version %s - %s' %
(platform.python_version(), platform_name()) + '\n')
proxy_map = {}
for handler in self._opener.handlers:
if hasattr(handler, 'proxies'):
proxy_map.update(handler.proxies)
- write_string('[debug] Proxy map: ' + compat_str(proxy_map) + '\n')
+ self._write_string('[debug] Proxy map: ' + compat_str(proxy_map) + '\n')
def _setup_opener(self):
timeout_val = self.params.get('socket_timeout')
- timeout = 600 if timeout_val is None else float(timeout_val)
+ self._socket_timeout = 600 if timeout_val is None else float(timeout_val)
opts_cookiefile = self.params.get('cookiefile')
opts_proxy = self.params.get('proxy')
@@ -1224,6 +1302,18 @@ class YoutubeDL(object):
opener.addheaders = []
self._opener = opener
- # TODO remove this global modification
- compat_urllib_request.install_opener(opener)
- socket.setdefaulttimeout(timeout)
+ def encode(self, s):
+ if isinstance(s, bytes):
+ return s # Already encoded
+
+ try:
+ return s.encode(self.get_encoding())
+ except UnicodeEncodeError as err:
+ err.reason = err.reason + '. Check your system encoding configuration or use the --encoding option.'
+ raise
+
+ def get_encoding(self):
+ encoding = self.params.get('encoding')
+ if encoding is None:
+ encoding = preferredencoding()
+ return encoding
diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py
index f843036..1e01432 100644
--- a/youtube_dl/__init__.py
+++ b/youtube_dl/__init__.py
@@ -46,12 +46,25 @@ __authors__ = (
'Andreas Schmitz',
'Michael Kaiser',
'Niklas Laxström',
+ 'David Triendl',
+ 'Anthony Weems',
+ 'David Wagner',
+ 'Juan C. Olivares',
+ 'Mattias Harrysson',
+ 'phaer',
+ 'Sainyam Kapoor',
+ 'Nicolas Évrard',
+ 'Jason Normore',
+ 'Hoje Lee',
+ 'Adam Thalhammer',
+ 'Georg Jähnig',
+ 'Ralf Haring',
)
__license__ = 'Public Domain'
import codecs
-import getpass
+import io
import locale
import optparse
import os
@@ -62,14 +75,17 @@ import sys
from .utils import (
+ compat_getpass,
compat_print,
DateRange,
+ DEFAULT_OUTTMPL,
decodeOption,
get_term_width,
DownloadError,
get_cachedir,
MaxDownloadsReached,
preferredencoding,
+ read_batch_urls,
SameFileError,
setproctitle,
std_headers,
@@ -83,6 +99,8 @@ from .extractor import gen_extractors
from .version import __version__
from .YoutubeDL import YoutubeDL
from .postprocessor import (
+ AtomicParsleyPP,
+ FFmpegAudioFixPP,
FFmpegMetadataPP,
FFmpegVideoConvertor,
FFmpegExtractAudioPP,
@@ -208,7 +226,7 @@ def parseOpts(overrideArguments=None):
general.add_option('-U', '--update',
action='store_true', dest='update_self', help='update this program to latest version. Make sure that you have sufficient permissions (run with sudo if needed)')
general.add_option('-i', '--ignore-errors',
- action='store_true', dest='ignoreerrors', help='continue on download errors, for example to to skip unavailable videos in a playlist', default=False)
+ action='store_true', dest='ignoreerrors', help='continue on download errors, for example to skip unavailable videos in a playlist', default=False)
general.add_option('--abort-on-error',
action='store_false', dest='ignoreerrors',
help='Abort downloading of further videos (in the playlist or the command line) if an error occurs')
@@ -220,6 +238,9 @@ def parseOpts(overrideArguments=None):
general.add_option('--referer',
dest='referer', help='specify a custom referer, use if the video access is restricted to one domain',
metavar='REF', default=None)
+ general.add_option('--add-header',
+ dest='headers', help='specify a custom HTTP header and its value, separated by a colon \':\'. You can use this option multiple times', action="append",
+ metavar='FIELD:VALUE')
general.add_option('--list-extractors',
action='store_true', dest='list_extractors',
help='List all supported extractors and the URLs they would handle', default=False)
@@ -231,6 +252,9 @@ def parseOpts(overrideArguments=None):
help='Use the specified HTTP/HTTPS proxy. Pass in an empty string (--proxy "") for direct connection')
general.add_option('--no-check-certificate', action='store_true', dest='no_check_certificate', default=False, help='Suppress HTTPS certificate validation.')
general.add_option(
+ '--prefer-insecure', '--prefer-unsecure', action='store_true', dest='prefer_insecure',
+ help='Use an unencrypted connection to retrieve information about the video. (Currently supported only for YouTube)')
+ general.add_option(
'--cache-dir', dest='cachedir', default=get_cachedir(), metavar='DIR',
help='Location in the filesystem where youtube-dl can store some downloaded information permanently. By default $XDG_CACHE_HOME/youtube-dl or ~/.cache/youtube-dl . At the moment, only YouTube player files (for videos with obfuscated signatures) are cached, but that may change.')
general.add_option(
@@ -242,14 +266,17 @@ def parseOpts(overrideArguments=None):
general.add_option(
'--bidi-workaround', dest='bidi_workaround', action='store_true',
help=u'Work around terminals that lack bidirectional text support. Requires bidiv or fribidi executable in PATH')
- general.add_option('--default-search',
- dest='default_search', metavar='PREFIX',
- help='Use this prefix for unqualified URLs. For example "gvsearch2:" downloads two videos from google videos for youtube-dl "large apple". By default (with value "auto") youtube-dl guesses.')
+ general.add_option(
+ '--default-search',
+ dest='default_search', metavar='PREFIX',
+ help='Use this prefix for unqualified URLs. For example "gvsearch2:" downloads two videos from google videos for youtube-dl "large apple". By default (with value "auto") youtube-dl guesses.')
general.add_option(
'--ignore-config',
action='store_true',
help='Do not read configuration files. When given in the global configuration file /etc/youtube-dl.conf: do not read the user configuration in ~/.config/youtube-dl.conf (%APPDATA%/youtube-dl/config.txt on Windows)')
-
+ general.add_option(
+ '--encoding', dest='encoding', metavar='ENCODING',
+ help='Force the specified encoding (experimental)')
selection.add_option(
'--playlist-start',
@@ -309,7 +336,7 @@ def parseOpts(overrideArguments=None):
video_format.add_option('-f', '--format',
action='store', dest='format', metavar='FORMAT', default=None,
- help='video format code, specify the order of preference using slashes: "-f 22/17/18". "-f mp4" and "-f flv" are also supported. You can also use the special names "best", "bestaudio", "worst", and "worstaudio". By default, youtube-dl will pick the best quality.')
+ help='video format code, specify the order of preference using slashes: "-f 22/17/18". "-f mp4" and "-f flv" are also supported. You can also use the special names "best", "bestvideo", "bestaudio", "worst", "worstvideo" and "worstaudio". By default, youtube-dl will pick the best quality.')
video_format.add_option('--all-formats',
action='store_const', dest='format', help='download all available video formats', const='all')
video_format.add_option('--prefer-free-formats',
@@ -352,6 +379,10 @@ def parseOpts(overrideArguments=None):
verbosity.add_option('-q', '--quiet',
action='store_true', dest='quiet', help='activates quiet mode', default=False)
+ verbosity.add_option(
+ '--no-warnings',
+ dest='no_warnings', action='store_true', default=False,
+ help='Ignore warnings')
verbosity.add_option('-s', '--simulate',
action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
verbosity.add_option('--skip-download',
@@ -379,7 +410,7 @@ def parseOpts(overrideArguments=None):
help='simulate, quiet but print output format', default=False)
verbosity.add_option('-j', '--dump-json',
action='store_true', dest='dumpjson',
- help='simulate, quiet but print JSON information', default=False)
+ help='simulate, quiet but print JSON information. See --output for a description of available keys.', default=False)
verbosity.add_option('--newline',
action='store_true', dest='progress_with_newline', help='output progress bar as new lines', default=False)
verbosity.add_option('--no-progress',
@@ -424,6 +455,8 @@ def parseOpts(overrideArguments=None):
'%(extractor)s for the provider (youtube, metacafe, etc), '
'%(id)s for the video id, %(playlist)s for the playlist the video is in, '
'%(playlist_index)s for the position in the playlist and %% for a literal percent. '
+ '%(height)s and %(width)s for the width and height of the video format. '
+ '%(resolution)s for a textual description of the resolution of the video format. '
'Use - to output to stdout. Can also be used to download to a different directory, '
'for example with -o \'/my/downloads/%(uploader)s/%(title)s-%(id)s.%(ext)s\' .'))
filesystem.add_option('--autonumber-size',
@@ -479,6 +512,8 @@ def parseOpts(overrideArguments=None):
help='do not overwrite post-processed files; the post-processed files are overwritten by default')
postproc.add_option('--embed-subs', action='store_true', dest='embedsubtitles', default=False,
help='embed subtitles in the video (only for mp4 videos)')
+ postproc.add_option('--embed-thumbnail', action='store_true', dest='embedthumbnail', default=False,
+ help='embed thumbnail in the audio as cover art')
postproc.add_option('--add-metadata', action='store_true', dest='addmetadata', default=False,
help='write metadata to the video file')
postproc.add_option('--xattrs', action='store_true', dest='xattrs', default=False,
@@ -521,8 +556,6 @@ def parseOpts(overrideArguments=None):
write_string(u'[debug] System config: ' + repr(_hide_login_info(systemConf)) + '\n')
write_string(u'[debug] User config: ' + repr(_hide_login_info(userConf)) + '\n')
write_string(u'[debug] Command-line args: ' + repr(_hide_login_info(commandLineConf)) + '\n')
- write_string(u'[debug] Encodings: locale %r, fs %r, out %r, pref: %r\n' %
- (locale.getpreferredencoding(), sys.getfilesystemencoding(), sys.stdout.encoding, preferredencoding()))
return parser, opts, args
@@ -545,27 +578,35 @@ def _real_main(argv=None):
if opts.referer is not None:
std_headers['Referer'] = opts.referer
+ # Custom HTTP headers
+ if opts.headers is not None:
+ for h in opts.headers:
+ if h.find(':', 1) < 0:
+ parser.error(u'wrong header formatting, it should be key:value, not "%s"'%h)
+ key, value = h.split(':', 2)
+ if opts.verbose:
+ write_string(u'[debug] Adding header from command line option %s:%s\n'%(key, value))
+ std_headers[key] = value
+
# Dump user agent
if opts.dump_user_agent:
compat_print(std_headers['User-Agent'])
sys.exit(0)
# Batch file verification
- batchurls = []
+ batch_urls = []
if opts.batchfile is not None:
try:
if opts.batchfile == '-':
batchfd = sys.stdin
else:
- batchfd = open(opts.batchfile, 'r')
- batchurls = batchfd.readlines()
- batchurls = [x.strip() for x in batchurls]
- batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
+ batchfd = io.open(opts.batchfile, 'r', encoding='utf-8', errors='ignore')
+ batch_urls = read_batch_urls(batchfd)
if opts.verbose:
- write_string(u'[debug] Batch file urls: ' + repr(batchurls) + u'\n')
+ write_string(u'[debug] Batch file urls: ' + repr(batch_urls) + u'\n')
except IOError:
sys.exit(u'ERROR: batch file could not be read')
- all_urls = batchurls + args
+ all_urls = batch_urls + args
all_urls = [url.strip() for url in all_urls]
_enc = preferredencoding()
all_urls = [url.decode(_enc, 'ignore') if isinstance(url, bytes) else url for url in all_urls]
@@ -604,7 +645,7 @@ def _real_main(argv=None):
if opts.usetitle and opts.useid:
parser.error(u'using title conflicts with using video ID')
if opts.username is not None and opts.password is None:
- opts.password = getpass.getpass(u'Type account password and press return:')
+ opts.password = compat_getpass(u'Type account password and press [Return]: ')
if opts.ratelimit is not None:
numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
if numeric_limit is None:
@@ -642,13 +683,13 @@ def _real_main(argv=None):
if not opts.audioquality.isdigit():
parser.error(u'invalid audio quality specified')
if opts.recodevideo is not None:
- if opts.recodevideo not in ['mp4', 'flv', 'webm', 'ogg']:
+ if opts.recodevideo not in ['mp4', 'flv', 'webm', 'ogg', 'mkv']:
parser.error(u'invalid video recode format specified')
if opts.date is not None:
date = DateRange.day(opts.date)
else:
date = DateRange(opts.dateafter, opts.datebefore)
- if opts.default_search not in ('auto', None) and ':' not in opts.default_search:
+ if opts.default_search not in ('auto', 'auto_warning', None) and ':' not in opts.default_search:
parser.error(u'--default-search invalid; did you forget a colon (:) at the end?')
# Do not download videos when there are audio-only formats
@@ -671,7 +712,7 @@ def _real_main(argv=None):
or (opts.usetitle and u'%(title)s-%(id)s.%(ext)s')
or (opts.useid and u'%(id)s.%(ext)s')
or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
- or u'%(title)s-%(id)s.%(ext)s')
+ or DEFAULT_OUTTMPL)
if not os.path.splitext(outtmpl)[1] and opts.extractaudio:
parser.error(u'Cannot download a video and extract audio into the same'
u' file! Use "{0}.%(ext)s" instead of "{0}" as the output'
@@ -686,6 +727,7 @@ def _real_main(argv=None):
'password': opts.password,
'videopassword': opts.videopassword,
'quiet': (opts.quiet or any_printing),
+ 'no_warnings': opts.no_warnings,
'forceurl': opts.geturl,
'forcetitle': opts.gettitle,
'forceid': opts.getid,
@@ -749,6 +791,7 @@ def _real_main(argv=None):
'download_archive': download_archive_fn,
'cookiefile': opts.cookiefile,
'nocheckcertificate': opts.no_check_certificate,
+ 'prefer_insecure': opts.prefer_insecure,
'proxy': opts.proxy,
'socket_timeout': opts.socket_timeout,
'bidi_workaround': opts.bidi_workaround,
@@ -757,6 +800,7 @@ def _real_main(argv=None):
'include_ads': opts.include_ads,
'default_search': opts.default_search,
'youtube_include_dash_manifest': opts.youtube_include_dash_manifest,
+ 'encoding': opts.encoding,
}
with YoutubeDL(ydl_opts) as ydl:
@@ -775,6 +819,10 @@ def _real_main(argv=None):
ydl.add_post_processor(FFmpegEmbedSubtitlePP(subtitlesformat=opts.subtitlesformat))
if opts.xattrs:
ydl.add_post_processor(XAttrMetadataPP())
+ if opts.embedthumbnail:
+ if not opts.addmetadata:
+ ydl.add_post_processor(FFmpegAudioFixPP())
+ ydl.add_post_processor(AtomicParsleyPP())
# Update version
if opts.update_self:
diff --git a/youtube_dl/downloader/common.py b/youtube_dl/downloader/common.py
index 5a068aa..917f345 100644
--- a/youtube_dl/downloader/common.py
+++ b/youtube_dl/downloader/common.py
@@ -4,9 +4,10 @@ import sys
import time
from ..utils import (
+ compat_str,
encodeFilename,
- timeconvert,
format_bytes,
+ timeconvert,
)
@@ -173,7 +174,7 @@ class FileDownloader(object):
return
os.rename(encodeFilename(old_filename), encodeFilename(new_filename))
except (IOError, OSError) as err:
- self.report_error(u'unable to rename file: %s' % str(err))
+ self.report_error(u'unable to rename file: %s' % compat_str(err))
def try_utime(self, filename, last_modified_hdr):
"""Try to set the last-modified time of the given file."""
diff --git a/youtube_dl/downloader/f4m.py b/youtube_dl/downloader/f4m.py
index 2a870a7..e6be6ae 100644
--- a/youtube_dl/downloader/f4m.py
+++ b/youtube_dl/downloader/f4m.py
@@ -12,7 +12,6 @@ from .http import HttpFD
from ..utils import (
struct_pack,
struct_unpack,
- compat_urllib_request,
compat_urlparse,
format_bytes,
encodeFilename,
@@ -117,8 +116,8 @@ class FlvReader(io.BytesIO):
self.read_unsigned_char()
# flags
self.read(3)
- # BootstrapinfoVersion
- bootstrap_info_version = self.read_unsigned_int()
+
+ self.read_unsigned_int() # BootstrapinfoVersion
# Profile,Live,Update,Reserved
self.read(1)
# time scale
@@ -127,15 +126,15 @@ class FlvReader(io.BytesIO):
self.read_unsigned_long_long()
# SmpteTimeCodeOffset
self.read_unsigned_long_long()
- # MovieIdentifier
- movie_identifier = self.read_string()
+
+ self.read_string() # MovieIdentifier
server_count = self.read_unsigned_char()
# ServerEntryTable
for i in range(server_count):
self.read_string()
quality_count = self.read_unsigned_char()
# QualityEntryTable
- for i in range(server_count):
+ for i in range(quality_count):
self.read_string()
# DrmData
self.read_string()
@@ -298,6 +297,7 @@ class F4mFD(FileDownloader):
break
frags_filenames.append(frag_filename)
+ dest_stream.close()
self.report_finish(format_bytes(state['downloaded_bytes']), time.time() - start)
self.try_rename(tmpfilename, filename)
diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py
index fa98346..9d407fe 100644
--- a/youtube_dl/downloader/hls.py
+++ b/youtube_dl/downloader/hls.py
@@ -13,8 +13,10 @@ class HlsFD(FileDownloader):
self.report_destination(filename)
tmpfilename = self.temp_name(filename)
- args = ['-y', '-i', url, '-f', 'mp4', '-c', 'copy',
- '-bsf:a', 'aac_adtstoasc', tmpfilename]
+ args = [
+ '-y', '-i', url, '-f', 'mp4', '-c', 'copy',
+ '-bsf:a', 'aac_adtstoasc',
+ encodeFilename(tmpfilename, for_subprocess=True)]
for program in ['avconv', 'ffmpeg']:
try:
diff --git a/youtube_dl/downloader/http.py b/youtube_dl/downloader/http.py
index 748f9f3..f79e6a9 100644
--- a/youtube_dl/downloader/http.py
+++ b/youtube_dl/downloader/http.py
@@ -14,6 +14,8 @@ from ..utils import (
class HttpFD(FileDownloader):
+ _TEST_FILE_SIZE = 10241
+
def real_download(self, filename, info_dict):
url = info_dict['url']
tmpfilename = self.temp_name(filename)
@@ -23,11 +25,15 @@ class HttpFD(FileDownloader):
headers = {'Youtubedl-no-compression': 'True'}
if 'user_agent' in info_dict:
headers['Youtubedl-user-agent'] = info_dict['user_agent']
+ if 'http_referer' in info_dict:
+ headers['Referer'] = info_dict['http_referer']
basic_request = compat_urllib_request.Request(url, None, headers)
request = compat_urllib_request.Request(url, None, headers)
- if self.params.get('test', False):
- request.add_header('Range', 'bytes=0-10240')
+ is_test = self.params.get('test', False)
+
+ if is_test:
+ request.add_header('Range', 'bytes=0-%s' % str(self._TEST_FILE_SIZE - 1))
# Establish possible resume length
if os.path.isfile(encodeFilename(tmpfilename)):
@@ -49,7 +55,7 @@ class HttpFD(FileDownloader):
while count <= retries:
# Establish connection
try:
- data = compat_urllib_request.urlopen(request)
+ data = self.ydl.urlopen(request)
break
except (compat_urllib_error.HTTPError, ) as err:
if (err.code < 500 or err.code >= 600) and err.code != 416:
@@ -59,7 +65,7 @@ class HttpFD(FileDownloader):
# Unable to resume (requested range not satisfiable)
try:
# Open the connection again without the range header
- data = compat_urllib_request.urlopen(basic_request)
+ data = self.ydl.urlopen(basic_request)
content_length = data.info()['Content-Length']
except (compat_urllib_error.HTTPError, ) as err:
if err.code < 500 or err.code >= 600:
@@ -85,6 +91,7 @@ class HttpFD(FileDownloader):
else:
# The length does not match, we start the download over
self.report_unable_to_resume()
+ resume_len = 0
open_mode = 'wb'
break
# Retry
@@ -97,6 +104,15 @@ class HttpFD(FileDownloader):
return False
data_len = data.info().get('Content-length', None)
+
+ # Range HTTP header may be ignored/unsupported by a webserver
+ # (e.g. extractor/scivee.py, extractor/bambuser.py).
+ # However, for a test we still would like to download just a piece of a file.
+ # To achieve this we limit data_len to _TEST_FILE_SIZE and manually control
+ # block size when downloading a file.
+ if is_test and (data_len is None or int(data_len) > self._TEST_FILE_SIZE):
+ data_len = self._TEST_FILE_SIZE
+
if data_len is not None:
data_len = int(data_len) + resume_len
min_data_len = self.params.get("min_filesize", None)
@@ -115,7 +131,7 @@ class HttpFD(FileDownloader):
while True:
# Download and write
before = time.time()
- data_block = data.read(block_size)
+ data_block = data.read(block_size if not is_test else min(block_size, data_len - byte_counter))
after = time.time()
if len(data_block) == 0:
break
@@ -159,6 +175,9 @@ class HttpFD(FileDownloader):
'speed': speed,
})
+ if is_test and byte_counter == data_len:
+ break
+
# Apply rate limit
self.slow_down(start, byte_counter - resume_len)
diff --git a/youtube_dl/downloader/rtmp.py b/youtube_dl/downloader/rtmp.py
index e93c28d..cc6a841 100644
--- a/youtube_dl/downloader/rtmp.py
+++ b/youtube_dl/downloader/rtmp.py
@@ -1,3 +1,5 @@
+from __future__ import unicode_literals
+
import os
import re
import subprocess
@@ -8,6 +10,7 @@ from .common import FileDownloader
from ..utils import (
encodeFilename,
format_bytes,
+ compat_str,
)
@@ -22,7 +25,7 @@ class RtmpFD(FileDownloader):
proc_stderr_closed = False
while not proc_stderr_closed:
# read line from stderr
- line = u''
+ line = ''
while True:
char = proc.stderr.read(1)
if not char:
@@ -46,7 +49,7 @@ class RtmpFD(FileDownloader):
data_len = None
if percent > 0:
data_len = int(downloaded_data_len * 100 / percent)
- data_len_str = u'~' + format_bytes(data_len)
+ data_len_str = '~' + format_bytes(data_len)
self.report_progress(percent, data_len_str, speed, eta)
cursor_in_new_line = False
self._hook_progress({
@@ -76,12 +79,12 @@ class RtmpFD(FileDownloader):
})
elif self.params.get('verbose', False):
if not cursor_in_new_line:
- self.to_screen(u'')
+ self.to_screen('')
cursor_in_new_line = True
- self.to_screen(u'[rtmpdump] '+line)
+ self.to_screen('[rtmpdump] '+line)
proc.wait()
if not cursor_in_new_line:
- self.to_screen(u'')
+ self.to_screen('')
return proc.returncode
url = info_dict['url']
@@ -93,6 +96,7 @@ class RtmpFD(FileDownloader):
flash_version = info_dict.get('flash_version', None)
live = info_dict.get('rtmp_live', False)
conn = info_dict.get('rtmp_conn', None)
+ protocol = info_dict.get('rtmp_protocol', None)
self.report_destination(filename)
tmpfilename = self.temp_name(filename)
@@ -102,7 +106,7 @@ class RtmpFD(FileDownloader):
try:
subprocess.call(['rtmpdump', '-h'], stdout=(open(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
except (OSError, IOError):
- self.report_error(u'RTMP download detected but "rtmpdump" could not be run')
+ self.report_error('RTMP download detected but "rtmpdump" could not be run')
return False
# Download using rtmpdump. rtmpdump returns exit code 2 when
@@ -125,9 +129,14 @@ class RtmpFD(FileDownloader):
basic_args += ['--flashVer', flash_version]
if live:
basic_args += ['--live']
- if conn:
+ if isinstance(conn, list):
+ for entry in conn:
+ basic_args += ['--conn', entry]
+ elif isinstance(conn, compat_str):
basic_args += ['--conn', conn]
- args = basic_args + [[], ['--resume', '--skip', '1']][self.params.get('continuedl', False)]
+ if protocol is not None:
+ basic_args += ['--protocol', protocol]
+ args = basic_args + [[], ['--resume', '--skip', '1']][not live and self.params.get('continuedl', False)]
if sys.platform == 'win32' and sys.version_info < (3, 0):
# Windows subprocess module does not actually support Unicode
@@ -150,26 +159,35 @@ class RtmpFD(FileDownloader):
shell_quote = lambda args: ' '.join(map(pipes.quote, str_args))
except ImportError:
shell_quote = repr
- self.to_screen(u'[debug] rtmpdump command line: ' + shell_quote(str_args))
+ self.to_screen('[debug] rtmpdump command line: ' + shell_quote(str_args))
+
+ RD_SUCCESS = 0
+ RD_FAILED = 1
+ RD_INCOMPLETE = 2
+ RD_NO_CONNECT = 3
retval = run_rtmpdump(args)
- while (retval == 2 or retval == 1) and not test:
+ if retval == RD_NO_CONNECT:
+ self.report_error('[rtmpdump] Could not connect to RTMP server.')
+ return False
+
+ while (retval == RD_INCOMPLETE or retval == RD_FAILED) and not test and not live:
prevsize = os.path.getsize(encodeFilename(tmpfilename))
- self.to_screen(u'[rtmpdump] %s bytes' % prevsize)
+ self.to_screen('[rtmpdump] %s bytes' % prevsize)
time.sleep(5.0) # This seems to be needed
- retval = run_rtmpdump(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
+ retval = run_rtmpdump(basic_args + ['-e'] + [[], ['-k', '1']][retval == RD_FAILED])
cursize = os.path.getsize(encodeFilename(tmpfilename))
- if prevsize == cursize and retval == 1:
+ if prevsize == cursize and retval == RD_FAILED:
break
# Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
- if prevsize == cursize and retval == 2 and cursize > 1024:
- self.to_screen(u'[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
- retval = 0
+ if prevsize == cursize and retval == RD_INCOMPLETE and cursize > 1024:
+ self.to_screen('[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
+ retval = RD_SUCCESS
break
- if retval == 0 or (test and retval == 2):
+ if retval == RD_SUCCESS or (test and retval == RD_INCOMPLETE):
fsize = os.path.getsize(encodeFilename(tmpfilename))
- self.to_screen(u'[rtmpdump] %s bytes' % fsize)
+ self.to_screen('[rtmpdump] %s bytes' % fsize)
self.try_rename(tmpfilename, filename)
self._hook_progress({
'downloaded_bytes': fsize,
@@ -179,6 +197,6 @@ class RtmpFD(FileDownloader):
})
return True
else:
- self.to_stderr(u"\n")
- self.report_error(u'rtmpdump exited with code %d' % retval)
+ self.to_stderr('\n')
+ self.report_error('rtmpdump exited with code %d' % retval)
return False
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index 7253718..15a42ce 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -1,6 +1,8 @@
from .academicearth import AcademicEarthCourseIE
from .addanime import AddAnimeIE
+from .aftonbladet import AftonbladetIE
from .anitube import AnitubeIE
+from .aol import AolIE
from .aparat import AparatIE
from .appletrailers import AppleTrailersIE
from .archiveorg import ArchiveOrgIE
@@ -9,29 +11,39 @@ from .arte import (
ArteTvIE,
ArteTVPlus7IE,
ArteTVCreativeIE,
+ ArteTVConcertIE,
ArteTVFutureIE,
ArteTVDDCIE,
+ ArteTVEmbedIE,
)
from .auengine import AUEngineIE
from .bambuser import BambuserIE, BambuserChannelIE
from .bandcamp import BandcampIE, BandcampAlbumIE
from .bbccouk import BBCCoUkIE
+from .bilibili import BiliBiliIE
from .blinkx import BlinkxIE
from .bliptv import BlipTVIE, BlipTVUserIE
from .bloomberg import BloombergIE
+from .br import BRIE
from .breakcom import BreakIE
from .brightcove import BrightcoveIE
+from .byutv import BYUtvIE
from .c56 import C56IE
+from .canal13cl import Canal13clIE
from .canalplus import CanalplusIE
from .canalc2 import Canalc2IE
from .cbs import CBSIE
+from .cbsnews import CBSNewsIE
+from .ceskatelevize import CeskaTelevizeIE
from .channel9 import Channel9IE
from .chilloutzone import ChilloutzoneIE
from .cinemassacre import CinemassacreIE
from .clipfish import ClipfishIE
from .cliphunter import CliphunterIE
from .clipsyndicate import ClipsyndicateIE
+from .clubic import ClubicIE
from .cmt import CMTIE
+from .cnet import CNETIE
from .cnn import (
CNNIE,
CNNBlogsIE,
@@ -49,31 +61,36 @@ from .dailymotion import (
DailymotionUserIE,
)
from .daum import DaumIE
-from .depositfiles import DepositFilesIE
from .dotsub import DotsubIE
from .dreisat import DreiSatIE
from .defense import DefenseGouvFrIE
from .discovery import DiscoveryIE
+from .divxstage import DivxStageIE
from .dropbox import DropboxIE
from .ebaumsworld import EbaumsWorldIE
from .ehow import EHowIE
from .eighttracks import EightTracksIE
from .eitb import EitbIE
from .elpais import ElPaisIE
+from .empflix import EmpflixIE
+from .engadget import EngadgetIE
from .escapist import EscapistIE
from .everyonesmixtape import EveryonesMixtapeIE
from .exfm import ExfmIE
from .extremetube import ExtremeTubeIE
from .facebook import FacebookIE
from .faz import FazIE
+from .fc2 import FC2IE
from .firstpost import FirstpostIE
from .firsttv import FirstTVIE
+from .fivemin import FiveMinIE
from .fktv import (
FKTVIE,
FKTVPosteckeIE,
)
from .flickr import FlickrIE
from .fourtube import FourTubeIE
+from .franceculture import FranceCultureIE
from .franceinter import FranceInterIE
from .francetv import (
PluzzIE,
@@ -88,15 +105,18 @@ from .funnyordie import FunnyOrDieIE
from .gamekings import GamekingsIE
from .gamespot import GameSpotIE
from .gametrailers import GametrailersIE
+from .gdcvault import GDCVaultIE
from .generic import GenericIE
from .googleplus import GooglePlusIE
from .googlesearch import GoogleSearchIE
from .hark import HarkIE
from .helsinki import HelsinkiIE
+from .hentaistigma import HentaiStigmaIE
from .hotnewhiphop import HotNewHipHopIE
from .howcast import HowcastIE
from .huffpost import HuffPostIE
from .hypem import HypemIE
+from .iconosquare import IconosquareIE
from .ign import IGNIE, OneUPIE
from .imdb import (
ImdbIE,
@@ -104,7 +124,7 @@ from .imdb import (
)
from .ina import InaIE
from .infoq import InfoQIE
-from .instagram import InstagramIE
+from .instagram import InstagramIE, InstagramUserIE
from .internetvideoarchive import InternetVideoArchiveIE
from .iprima import IPrimaIE
from .ivi import (
@@ -122,6 +142,7 @@ from .khanacademy import KhanAcademyIE
from .kickstarter import KickStarterIE
from .keek import KeekIE
from .kontrtube import KontrTubeIE
+from .ku6 import Ku6IE
from .la7 import LA7IE
from .lifenews import LifeNewsIE
from .liveleak import LiveLeakIE
@@ -132,45 +153,67 @@ from .lynda import (
)
from .m6 import M6IE
from .macgamestore import MacGameStoreIE
+from .mailru import MailRuIE
from .malemotion import MalemotionIE
from .mdr import MDRIE
from .metacafe import MetacafeIE
from .metacritic import MetacriticIE
-from .mit import TechTVMITIE, MITIE
+from .mit import TechTVMITIE, MITIE, OCWMITIE
from .mixcloud import MixcloudIE
from .mpora import MporaIE
from .mofosex import MofosexIE
from .mooshare import MooshareIE
+from .morningstar import MorningstarIE
+from .motorsport import MotorsportIE
+from .moviezine import MoviezineIE
+from .movshare import MovShareIE
from .mtv import (
MTVIE,
MTVIggyIE,
)
+from .musicplayon import MusicPlayOnIE
from .muzu import MuzuTVIE
from .myspace import MySpaceIE
from .myspass import MySpassIE
from .myvideo import MyVideoIE
from .naver import NaverIE
from .nba import NBAIE
-from .nbc import NBCNewsIE
+from .nbc import (
+ NBCIE,
+ NBCNewsIE,
+)
from .ndr import NDRIE
from .ndtv import NDTVIE
from .newgrounds import NewgroundsIE
+from .newstube import NewstubeIE
from .nfb import NFBIE
from .nhl import NHLIE, NHLVideocenterIE
from .niconico import NiconicoIE
from .ninegag import NineGagIE
+from .noco import NocoIE
from .normalboots import NormalbootsIE
-from .novamov import NovamovIE
+from .novamov import NovaMovIE
from .nowness import NownessIE
from .nowvideo import NowVideoIE
+from .nrk import (
+ NRKIE,
+ NRKTVIE,
+)
+from .ntv import NTVIE
+from .nytimes import NYTimesIE
+from .nuvid import NuvidIE
+from .oe1 import OE1IE
from .ooyala import OoyalaIE
from .orf import ORFIE
+from .parliamentliveuk import ParliamentLiveUKIE
from .pbs import PBSIE
from .photobucket import PhotobucketIE
+from .playvid import PlayvidIE
from .podomatic import PodomaticIE
from .pornhd import PornHdIE
from .pornhub import PornHubIE
from .pornotube import PornotubeIE
+from .prosiebensat1 import ProSiebenSat1IE
from .pyvideo import PyvideoIE
from .radiofrance import RadioFranceIE
from .rbmaradio import RBMARadioIE
@@ -179,17 +222,23 @@ from .ringtv import RingTVIE
from .ro220 import Ro220IE
from .rottentomatoes import RottenTomatoesIE
from .roxwel import RoxwelIE
+from .rtbf import RTBFIE
from .rtlnow import RTLnowIE
+from .rts import RTSIE
+from .rtve import RTVEALaCartaIE
from .rutube import (
RutubeIE,
RutubeChannelIE,
RutubeMovieIE,
RutubePersonIE,
)
+from .rutv import RUTVIE
+from .savefrom import SaveFromIE
+from .scivee import SciVeeIE
from .servingsys import ServingSysIE
from .sina import SinaIE
-from .slashdot import SlashdotIE
from .slideshare import SlideshareIE
+from .slutload import SlutloadIE
from .smotri import (
SmotriIE,
SmotriCommunityIE,
@@ -197,7 +246,12 @@ from .smotri import (
SmotriBroadcastIE,
)
from .sohu import SohuIE
-from .soundcloud import SoundcloudIE, SoundcloudSetIE, SoundcloudUserIE
+from .soundcloud import (
+ SoundcloudIE,
+ SoundcloudSetIE,
+ SoundcloudUserIE,
+ SoundcloudPlaylistIE
+)
from .southparkstudios import (
SouthParkStudiosIE,
SouthparkDeIE,
@@ -205,41 +259,62 @@ from .southparkstudios import (
from .space import SpaceIE
from .spankwire import SpankwireIE
from .spiegel import SpiegelIE
+from .spiegeltv import SpiegeltvIE
from .spike import SpikeIE
from .stanfordoc import StanfordOpenClassroomIE
-from .statigram import StatigramIE
from .steam import SteamIE
from .streamcloud import StreamcloudIE
from .streamcz import StreamCZIE
+from .swrmediathek import SWRMediathekIE
from .syfy import SyfyIE
from .sztvhu import SztvHuIE
+from .tagesschau import TagesschauIE
+from .teachertube import (
+ TeacherTubeIE,
+ TeacherTubeClassroomIE,
+)
+from .teachingchannel import TeachingChannelIE
from .teamcoco import TeamcocoIE
from .techtalks import TechTalksIE
from .ted import TEDIE
+from .testurl import TestURLIE
from .tf1 import TF1IE
from .theplatform import ThePlatformIE
from .thisav import ThisAVIE
from .tinypic import TinyPicIE
+from .tlc import TlcIE, TlcDeIE
from .toutv import TouTvIE
+from .toypics import ToypicsUserIE, ToypicsIE
from .traileraddict import TrailerAddictIE
from .trilulilu import TriluliluIE
+from .trutube import TruTubeIE
from .tube8 import Tube8IE
from .tudou import TudouIE
from .tumblr import TumblrIE
from .tutv import TutvIE
+from .tvigle import TvigleIE
from .tvp import TvpIE
+from .udemy import (
+ UdemyIE,
+ UdemyCourseIE
+)
from .unistra import UnistraIE
+from .urort import UrortIE
from .ustream import UstreamIE, UstreamChannelIE
from .vbox7 import Vbox7IE
from .veehd import VeeHDIE
from .veoh import VeohIE
from .vesti import VestiIE
from .vevo import VevoIE
-from .vice import ViceIE
+from .vh1 import VH1IE
from .viddler import ViddlerIE
+from .videobam import VideoBamIE
from .videodetective import VideoDetectiveIE
+from .videolecturesnet import VideoLecturesNetIE
from .videofyme import VideofyMeIE
from .videopremium import VideoPremiumIE
+from .videott import VideoTtIE
+from .videoweed import VideoWeedIE
from .vimeo import (
VimeoIE,
VimeoChannelIE,
@@ -247,20 +322,32 @@ from .vimeo import (
VimeoAlbumIE,
VimeoGroupsIE,
VimeoReviewIE,
+ VimeoWatchLaterIE,
+)
+from .vine import (
+ VineIE,
+ VineUserIE,
)
-from .vine import VineIE
from .viki import VikiIE
from .vk import VKIE
from .vube import VubeIE
+from .vuclip import VuClipIE
+from .washingtonpost import WashingtonPostIE
from .wat import WatIE
+from .wdr import (
+ WDRIE,
+ WDRMobileIE,
+ WDRMausIE,
+)
from .weibo import WeiboIE
from .wimp import WimpIE
from .wistia import WistiaIE
from .worldstarhiphop import WorldStarHipHopIE
+from .xbef import XBefIE
from .xhamster import XHamsterIE
from .xnxx import XNXXIE
from .xvideos import XVideosIE
-from .xtube import XTubeIE
+from .xtube import XTubeUserIE, XTubeIE
from .yahoo import (
YahooIE,
YahooNewsIE,
@@ -271,19 +358,20 @@ from .youku import YoukuIE
from .youporn import YouPornIE
from .youtube import (
YoutubeIE,
+ YoutubeChannelIE,
+ YoutubeFavouritesIE,
+ YoutubeHistoryIE,
YoutubePlaylistIE,
- YoutubeSearchIE,
+ YoutubeRecommendedIE,
YoutubeSearchDateIE,
- YoutubeUserIE,
- YoutubeChannelIE,
+ YoutubeSearchIE,
+ YoutubeSearchURLIE,
YoutubeShowIE,
YoutubeSubscriptionsIE,
- YoutubeRecommendedIE,
+ YoutubeTopListIE,
YoutubeTruncatedURLIE,
+ YoutubeUserIE,
YoutubeWatchLaterIE,
- YoutubeFavouritesIE,
- YoutubeHistoryIE,
- YoutubeTopListIE,
)
from .zdf import ZDFIE
diff --git a/youtube_dl/extractor/academicearth.py b/youtube_dl/extractor/academicearth.py
index 72f81d0..59d3bbb 100644
--- a/youtube_dl/extractor/academicearth.py
+++ b/youtube_dl/extractor/academicearth.py
@@ -5,7 +5,7 @@ from .common import InfoExtractor
class AcademicEarthCourseIE(InfoExtractor):
- _VALID_URL = r'^https?://(?:www\.)?academicearth\.org/(?:courses|playlists)/(?P<id>[^?#/]+)'
+ _VALID_URL = r'^https?://(?:www\.)?academicearth\.org/playlists/(?P<id>[^?#/]+)'
IE_NAME = 'AcademicEarth:Course'
def _real_extract(self, url):
@@ -14,12 +14,12 @@ class AcademicEarthCourseIE(InfoExtractor):
webpage = self._download_webpage(url, playlist_id)
title = self._html_search_regex(
- r'<h1 class="playlist-name">(.*?)</h1>', webpage, u'title')
+ r'<h1 class="playlist-name"[^>]*?>(.*?)</h1>', webpage, u'title')
description = self._html_search_regex(
- r'<p class="excerpt">(.*?)</p>',
+ r'<p class="excerpt"[^>]*?>(.*?)</p>',
webpage, u'description', fatal=False)
urls = re.findall(
- r'<h3 class="lecture-title"><a target="_blank" href="([^"]+)">',
+ r'<li class="lecture-preview">\s*?<a target="_blank" href="([^"]+)">',
webpage)
entries = [self.url_result(u) for u in urls]
diff --git a/youtube_dl/extractor/addanime.py b/youtube_dl/extractor/addanime.py
index a3a1b99..fcf2960 100644
--- a/youtube_dl/extractor/addanime.py
+++ b/youtube_dl/extractor/addanime.py
@@ -1,3 +1,5 @@
+from __future__ import unicode_literals
+
import re
from .common import InfoExtractor
@@ -14,14 +16,14 @@ from ..utils import (
class AddAnimeIE(InfoExtractor):
_VALID_URL = r'^http://(?:\w+\.)?add-anime\.net/watch_video\.php\?(?:.*?)v=(?P<video_id>[\w_]+)(?:.*)'
- IE_NAME = u'AddAnime'
_TEST = {
- u'url': u'http://www.add-anime.net/watch_video.php?v=24MR3YO5SAS9',
- u'file': u'24MR3YO5SAS9.mp4',
- u'md5': u'72954ea10bc979ab5e2eb288b21425a0',
- u'info_dict': {
- u"description": u"One Piece 606",
- u"title": u"One Piece 606"
+ 'url': 'http://www.add-anime.net/watch_video.php?v=24MR3YO5SAS9',
+ 'md5': '72954ea10bc979ab5e2eb288b21425a0',
+ 'info_dict': {
+ 'id': '24MR3YO5SAS9',
+ 'ext': 'mp4',
+ 'description': 'One Piece 606',
+ 'title': 'One Piece 606',
}
}
@@ -38,10 +40,10 @@ class AddAnimeIE(InfoExtractor):
redir_webpage = ee.cause.read().decode('utf-8')
action = self._search_regex(
r'<form id="challenge-form" action="([^"]+)"',
- redir_webpage, u'Redirect form')
+ redir_webpage, 'Redirect form')
vc = self._search_regex(
r'<input type="hidden" name="jschl_vc" value="([^"]+)"/>',
- redir_webpage, u'redirect vc value')
+ redir_webpage, 'redirect vc value')
av = re.search(
r'a\.value = ([0-9]+)[+]([0-9]+)[*]([0-9]+);',
redir_webpage)
@@ -52,19 +54,19 @@ class AddAnimeIE(InfoExtractor):
parsed_url = compat_urllib_parse_urlparse(url)
av_val = av_res + len(parsed_url.netloc)
confirm_url = (
- parsed_url.scheme + u'://' + parsed_url.netloc +
+ parsed_url.scheme + '://' + parsed_url.netloc +
action + '?' +
compat_urllib_parse.urlencode({
'jschl_vc': vc, 'jschl_answer': compat_str(av_val)}))
self._download_webpage(
confirm_url, video_id,
- note=u'Confirming after redirect')
+ note='Confirming after redirect')
webpage = self._download_webpage(url, video_id)
formats = []
for format_id in ('normal', 'hq'):
rex = r"var %s_video_file = '(.*?)';" % re.escape(format_id)
- video_url = self._search_regex(rex, webpage, u'video file URLx',
+ video_url = self._search_regex(rex, webpage, 'video file URLx',
fatal=False)
if not video_url:
continue
@@ -72,14 +74,13 @@ class AddAnimeIE(InfoExtractor):
'format_id': format_id,
'url': video_url,
})
- if not formats:
- raise ExtractorError(u'Cannot find any video format!')
+ self._sort_formats(formats)
video_title = self._og_search_title(webpage)
video_description = self._og_search_description(webpage)
return {
'_type': 'video',
- 'id': video_id,
+ 'id': video_id,
'formats': formats,
'title': video_title,
'description': video_description
diff --git a/youtube_dl/extractor/aftonbladet.py b/youtube_dl/extractor/aftonbladet.py
new file mode 100644
index 0000000..cfc7370
--- /dev/null
+++ b/youtube_dl/extractor/aftonbladet.py
@@ -0,0 +1,66 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class AftonbladetIE(InfoExtractor):
+ _VALID_URL = r'^http://tv\.aftonbladet\.se/webbtv.+?(?P<video_id>article[0-9]+)\.ab(?:$|[?#])'
+ _TEST = {
+ 'url': 'http://tv.aftonbladet.se/webbtv/nyheter/vetenskap/rymden/article36015.ab',
+ 'info_dict': {
+ 'id': 'article36015',
+ 'ext': 'mp4',
+ 'title': 'Vulkanutbrott i rymden - nu släpper NASA bilderna',
+ 'description': 'Jupiters måne mest aktiv av alla himlakroppar',
+ 'timestamp': 1394142732,
+ 'upload_date': '20140306',
+ },
+ }
+
+ def _real_extract(self, url):
+ mobj = re.search(self._VALID_URL, url)
+
+ video_id = mobj.group('video_id')
+ webpage = self._download_webpage(url, video_id)
+
+ # find internal video meta data
+ meta_url = 'http://aftonbladet-play.drlib.aptoma.no/video/%s.json'
+ internal_meta_id = self._html_search_regex(
+ r'data-aptomaId="([\w\d]+)"', webpage, 'internal_meta_id')
+ internal_meta_url = meta_url % internal_meta_id
+ internal_meta_json = self._download_json(
+ internal_meta_url, video_id, 'Downloading video meta data')
+
+ # find internal video formats
+ format_url = 'http://aftonbladet-play.videodata.drvideo.aptoma.no/actions/video/?id=%s'
+ internal_video_id = internal_meta_json['videoId']
+ internal_formats_url = format_url % internal_video_id
+ internal_formats_json = self._download_json(
+ internal_formats_url, video_id, 'Downloading video formats')
+
+ formats = []
+ for fmt in internal_formats_json['formats']['http']['pseudostreaming']['mp4']:
+ p = fmt['paths'][0]
+ formats.append({
+ 'url': 'http://%s:%d/%s/%s' % (p['address'], p['port'], p['path'], p['filename']),
+ 'ext': 'mp4',
+ 'width': fmt['width'],
+ 'height': fmt['height'],
+ 'tbr': fmt['bitrate'],
+ 'protocol': 'http',
+ })
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': internal_meta_json['title'],
+ 'formats': formats,
+ 'thumbnail': internal_meta_json['imageUrl'],
+ 'description': internal_meta_json['shortPreamble'],
+ 'timestamp': internal_meta_json['timePublished'],
+ 'duration': internal_meta_json['duration'],
+ 'view_count': internal_meta_json['views'],
+ }
diff --git a/youtube_dl/extractor/aol.py b/youtube_dl/extractor/aol.py
new file mode 100644
index 0000000..a7bfe5a
--- /dev/null
+++ b/youtube_dl/extractor/aol.py
@@ -0,0 +1,65 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from .fivemin import FiveMinIE
+
+
+class AolIE(InfoExtractor):
+ IE_NAME = 'on.aol.com'
+ _VALID_URL = r'''(?x)
+ (?:
+ aol-video:|
+ http://on\.aol\.com/
+ (?:
+ video/.*-|
+ playlist/(?P<playlist_display_id>[^/?#]+?)-(?P<playlist_id>[0-9]+)[?#].*_videoid=
+ )
+ )
+ (?P<id>[0-9]+)
+ (?:$|\?)
+ '''
+
+ _TEST = {
+ 'url': 'http://on.aol.com/video/u-s--official-warns-of-largest-ever-irs-phone-scam-518167793?icid=OnHomepageC2Wide_MustSee_Img',
+ 'md5': '18ef68f48740e86ae94b98da815eec42',
+ 'info_dict': {
+ 'id': '518167793',
+ 'ext': 'mp4',
+ 'title': 'U.S. Official Warns Of \'Largest Ever\' IRS Phone Scam',
+ },
+ 'add_ie': ['FiveMin'],
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+
+ playlist_id = mobj.group('playlist_id')
+ if playlist_id and not self._downloader.params.get('noplaylist'):
+ self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
+
+ webpage = self._download_webpage(url, playlist_id)
+ title = self._html_search_regex(
+ r'<h1 class="video-title[^"]*">(.+?)</h1>', webpage, 'title')
+ playlist_html = self._search_regex(
+ r"(?s)<ul\s+class='video-related[^']*'>(.*?)</ul>", webpage,
+ 'playlist HTML')
+ entries = [{
+ '_type': 'url',
+ 'url': 'aol-video:%s' % m.group('id'),
+ 'ie_key': 'Aol',
+ } for m in re.finditer(
+ r"<a\s+href='.*videoid=(?P<id>[0-9]+)'\s+class='video-thumb'>",
+ playlist_html)]
+
+ return {
+ '_type': 'playlist',
+ 'id': playlist_id,
+ 'display_id': mobj.group('playlist_display_id'),
+ 'title': title,
+ 'entries': entries,
+ }
+
+ return FiveMinIE._build_result(video_id)
diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py
index 922cede..dc8657b 100644
--- a/youtube_dl/extractor/appletrailers.py
+++ b/youtube_dl/extractor/appletrailers.py
@@ -6,7 +6,6 @@ import json
from .common import InfoExtractor
from ..utils import (
compat_urlparse,
- determine_ext,
)
@@ -16,9 +15,10 @@ class AppleTrailersIE(InfoExtractor):
"url": "http://trailers.apple.com/trailers/wb/manofsteel/",
"playlist": [
{
- "file": "manofsteel-trailer4.mov",
"md5": "d97a8e575432dbcb81b7c3acb741f8a8",
"info_dict": {
+ "id": "manofsteel-trailer4",
+ "ext": "mov",
"duration": 111,
"title": "Trailer 4",
"upload_date": "20130523",
@@ -26,9 +26,10 @@ class AppleTrailersIE(InfoExtractor):
},
},
{
- "file": "manofsteel-trailer3.mov",
"md5": "b8017b7131b721fb4e8d6f49e1df908c",
"info_dict": {
+ "id": "manofsteel-trailer3",
+ "ext": "mov",
"duration": 182,
"title": "Trailer 3",
"upload_date": "20130417",
@@ -36,9 +37,10 @@ class AppleTrailersIE(InfoExtractor):
},
},
{
- "file": "manofsteel-trailer.mov",
"md5": "d0f1e1150989b9924679b441f3404d48",
"info_dict": {
+ "id": "manofsteel-trailer",
+ "ext": "mov",
"duration": 148,
"title": "Trailer",
"upload_date": "20121212",
@@ -46,15 +48,16 @@ class AppleTrailersIE(InfoExtractor):
},
},
{
- "file": "manofsteel-teaser.mov",
"md5": "5fe08795b943eb2e757fa95cb6def1cb",
"info_dict": {
+ "id": "manofsteel-teaser",
+ "ext": "mov",
"duration": 93,
"title": "Teaser",
"upload_date": "20120721",
"uploader_id": "wb",
},
- }
+ },
]
}
@@ -65,16 +68,16 @@ class AppleTrailersIE(InfoExtractor):
movie = mobj.group('movie')
uploader_id = mobj.group('company')
- playlist_url = compat_urlparse.urljoin(url, u'includes/playlists/itunes.inc')
+ playlist_url = compat_urlparse.urljoin(url, 'includes/playlists/itunes.inc')
def fix_html(s):
- s = re.sub(r'(?s)<script[^<]*?>.*?</script>', u'', s)
+ s = re.sub(r'(?s)<script[^<]*?>.*?</script>', '', s)
s = re.sub(r'<img ([^<]*?)>', r'<img \1/>', s)
# The ' in the onClick attributes are not escaped, it couldn't be parsed
# like: http://trailers.apple.com/trailers/wb/gravity/
def _clean_json(m):
- return u'iTunes.playURL(%s);' % m.group(1).replace('\'', '&#39;')
+ return 'iTunes.playURL(%s);' % m.group(1).replace('\'', '&#39;')
s = re.sub(self._JSON_RE, _clean_json, s)
- s = u'<html>' + s + u'</html>'
+ s = '<html>' + s + u'</html>'
return s
doc = self._download_xml(playlist_url, movie, transform_source=fix_html)
@@ -82,7 +85,7 @@ class AppleTrailersIE(InfoExtractor):
for li in doc.findall('./div/ul/li'):
on_click = li.find('.//a').attrib['onClick']
trailer_info_json = self._search_regex(self._JSON_RE,
- on_click, u'trailer info')
+ on_click, 'trailer info')
trailer_info = json.loads(trailer_info_json)
title = trailer_info['title']
video_id = movie + '-' + re.sub(r'[^a-zA-Z0-9]', '', title).lower()
@@ -98,8 +101,7 @@ class AppleTrailersIE(InfoExtractor):
first_url = trailer_info['url']
trailer_id = first_url.split('/')[-1].rpartition('_')[0].lower()
settings_json_url = compat_urlparse.urljoin(url, 'includes/settings/%s.json' % trailer_id)
- settings_json = self._download_webpage(settings_json_url, trailer_id, u'Downloading settings json')
- settings = json.loads(settings_json)
+ settings = self._download_json(settings_json_url, trailer_id, 'Downloading settings json')
formats = []
for format in settings['metadata']['sizes']:
@@ -107,7 +109,6 @@ class AppleTrailersIE(InfoExtractor):
format_url = re.sub(r'_(\d*p.mov)', r'_h\1', format['src'])
formats.append({
'url': format_url,
- 'ext': determine_ext(format_url),
'format': format['type'],
'width': format['width'],
'height': int(format['height']),
diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py
index b88f71b..c6d22c0 100644
--- a/youtube_dl/extractor/ard.py
+++ b/youtube_dl/extractor/ard.py
@@ -38,15 +38,19 @@ class ARDIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
title = self._html_search_regex(
- r'<h1(?:\s+class="boxTopHeadline")?>(.*?)</h1>', webpage, 'title')
+ [r'<h1(?:\s+class="boxTopHeadline")?>(.*?)</h1>',
+ r'<meta name="dcterms.title" content="(.*?)"/>',
+ r'<h4 class="headline">(.*?)</h4>'],
+ webpage, 'title')
description = self._html_search_meta(
'dcterms.abstract', webpage, 'description')
thumbnail = self._og_search_thumbnail(webpage)
- streams = [
- mo.groupdict()
- for mo in re.finditer(
- r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)', webpage)]
+
+ media_info = self._download_json(
+ 'http://www.ardmediathek.de/play/media/%s' % video_id, video_id)
+ # The second element of the _mediaArray contains the standard http urls
+ streams = media_info['_mediaArray'][1]['_mediaStreamArray']
if not streams:
if '"fsk"' in webpage:
raise ExtractorError('This video is only available after 20:00')
@@ -54,21 +58,12 @@ class ARDIE(InfoExtractor):
formats = []
for s in streams:
format = {
- 'quality': int(s['quality']),
+ 'quality': s['_quality'],
+ 'url': s['_stream'],
}
- if s.get('rtmp_url'):
- format['protocol'] = 'rtmp'
- format['url'] = s['rtmp_url']
- format['playpath'] = s['video_url']
- else:
- format['url'] = s['video_url']
-
- quality_name = self._search_regex(
- r'[,.]([a-zA-Z0-9_-]+),?\.mp4', format['url'],
- 'quality name', default='NA')
- format['format_id'] = '%s-%s-%s-%s' % (
- determine_ext(format['url']), quality_name, s['media_type'],
- s['quality'])
+
+ format['format_id'] = '%s-%s' % (
+ determine_ext(format['url']), format['quality'])
formats.append(format)
diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py
index 7cf3785..b528a9e 100644
--- a/youtube_dl/extractor/arte.py
+++ b/youtube_dl/extractor/arte.py
@@ -2,7 +2,6 @@
from __future__ import unicode_literals
import re
-import json
from .common import InfoExtractor
from ..utils import (
@@ -19,115 +18,46 @@ from ..utils import (
# is different for each one. The videos usually expire in 7 days, so we can't
# add tests.
-class ArteTvIE(InfoExtractor):
- _VIDEOS_URL = r'(?:http://)?videos\.arte\.tv/(?P<lang>fr|de)/.*-(?P<id>.*?)\.html'
- _LIVEWEB_URL = r'(?:http://)?liveweb\.arte\.tv/(?P<lang>fr|de)/(?P<subpage>.+?)/(?P<name>.+)'
- _LIVE_URL = r'index-[0-9]+\.html$'
+class ArteTvIE(InfoExtractor):
+ _VALID_URL = r'http://videos\.arte\.tv/(?P<lang>fr|de)/.*-(?P<id>.*?)\.html'
IE_NAME = 'arte.tv'
- @classmethod
- def suitable(cls, url):
- return any(re.match(regex, url) for regex in (cls._VIDEOS_URL, cls._LIVEWEB_URL))
-
- # TODO implement Live Stream
- # from ..utils import compat_urllib_parse
- # def extractLiveStream(self, url):
- # video_lang = url.split('/')[-4]
- # info = self.grep_webpage(
- # url,
- # r'src="(.*?/videothek_js.*?\.js)',
- # 0,
- # [
- # (1, 'url', 'Invalid URL: %s' % url)
- # ]
- # )
- # http_host = url.split('/')[2]
- # next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
- # info = self.grep_webpage(
- # next_url,
- # r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
- # '(http://.*?\.swf).*?' +
- # '(rtmp://.*?)\'',
- # re.DOTALL,
- # [
- # (1, 'path', 'could not extract video path: %s' % url),
- # (2, 'player', 'could not extract video player: %s' % url),
- # (3, 'url', 'could not extract video url: %s' % url)
- # ]
- # )
- # video_url = '%s/%s' % (info.get('url'), info.get('path'))
-
def _real_extract(self, url):
- mobj = re.match(self._VIDEOS_URL, url)
- if mobj is not None:
- id = mobj.group('id')
- lang = mobj.group('lang')
- return self._extract_video(url, id, lang)
-
- mobj = re.match(self._LIVEWEB_URL, url)
- if mobj is not None:
- name = mobj.group('name')
- lang = mobj.group('lang')
- return self._extract_liveweb(url, name, lang)
-
- if re.search(self._LIVE_URL, url) is not None:
- raise ExtractorError(u'Arte live streams are not yet supported, sorry')
- # self.extractLiveStream(url)
- # return
-
- def _extract_video(self, url, video_id, lang):
- """Extract from videos.arte.tv"""
+ mobj = re.match(self._VALID_URL, url)
+ lang = mobj.group('lang')
+ video_id = mobj.group('id')
+
ref_xml_url = url.replace('/videos/', '/do_delegate/videos/')
ref_xml_url = ref_xml_url.replace('.html', ',view,asPlayerXml.xml')
- ref_xml_doc = self._download_xml(ref_xml_url, video_id, note=u'Downloading metadata')
+ ref_xml_doc = self._download_xml(
+ ref_xml_url, video_id, note='Downloading metadata')
config_node = find_xpath_attr(ref_xml_doc, './/video', 'lang', lang)
config_xml_url = config_node.attrib['ref']
- config_xml = self._download_webpage(config_xml_url, video_id, note=u'Downloading configuration')
-
- video_urls = list(re.finditer(r'<url quality="(?P<quality>.*?)">(?P<url>.*?)</url>', config_xml))
- def _key(m):
- quality = m.group('quality')
- if quality == 'hd':
- return 2
- else:
- return 1
- # We pick the best quality
- video_urls = sorted(video_urls, key=_key)
- video_url = list(video_urls)[-1].group('url')
-
- title = self._html_search_regex(r'<name>(.*?)</name>', config_xml, 'title')
- thumbnail = self._html_search_regex(r'<firstThumbnailUrl>(.*?)</firstThumbnailUrl>',
- config_xml, 'thumbnail')
- return {'id': video_id,
- 'title': title,
- 'thumbnail': thumbnail,
- 'url': video_url,
- 'ext': 'flv',
- }
-
- def _extract_liveweb(self, url, name, lang):
- """Extract form http://liveweb.arte.tv/"""
- webpage = self._download_webpage(url, name)
- video_id = self._search_regex(r'eventId=(\d+?)("|&)', webpage, 'event id')
- config_doc = self._download_xml('http://download.liveweb.arte.tv/o21/liveweb/events/event-%s.xml' % video_id,
- video_id, 'Downloading information')
- event_doc = config_doc.find('event')
- url_node = event_doc.find('video').find('urlHd')
- if url_node is None:
- url_node = event_doc.find('urlSd')
-
- return {'id': video_id,
- 'title': event_doc.find('name%s' % lang.capitalize()).text,
- 'url': url_node.text.replace('MP4', 'mp4'),
- 'ext': 'flv',
- 'thumbnail': self._og_search_thumbnail(webpage),
- }
+ config = self._download_xml(
+ config_xml_url, video_id, note='Downloading configuration')
+
+ formats = [{
+ 'forma_id': q.attrib['quality'],
+ 'url': q.text,
+ 'ext': 'flv',
+ 'quality': 2 if q.attrib['quality'] == 'hd' else 1,
+ } for q in config.findall('./urls/url')]
+ self._sort_formats(formats)
+
+ title = config.find('.//name').text
+ thumbnail = config.find('.//firstThumbnailUrl').text
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'thumbnail': thumbnail,
+ 'formats': formats,
+ }
class ArteTVPlus7IE(InfoExtractor):
IE_NAME = 'arte.tv:+7'
- _VALID_URL = r'https?://www\.arte.tv/guide/(?P<lang>fr|de)/(?:(?:sendungen|emissions)/)?(?P<id>.*?)/(?P<name>.*?)(\?.*)?'
+ _VALID_URL = r'https?://(?:www\.)?arte\.tv/guide/(?P<lang>fr|de)/(?:(?:sendungen|emissions)/)?(?P<id>.*?)/(?P<name>.*?)(\?.*)?'
@classmethod
def _extract_url_info(cls, url):
@@ -144,13 +74,12 @@ class ArteTVPlus7IE(InfoExtractor):
return self._extract_from_webpage(webpage, video_id, lang)
def _extract_from_webpage(self, webpage, video_id, lang):
- json_url = self._html_search_regex(r'arte_vp_url="(.*?)"', webpage, 'json url')
+ json_url = self._html_search_regex(
+ r'arte_vp_url="(.*?)"', webpage, 'json vp url')
return self._extract_from_json_url(json_url, video_id, lang)
def _extract_from_json_url(self, json_url, video_id, lang):
- json_info = self._download_webpage(json_url, video_id, 'Downloading info json')
- self.report_extraction(video_id)
- info = json.loads(json_info)
+ info = self._download_json(json_url, video_id)
player_info = info['videoJsonPlayer']
info_dict = {
@@ -172,6 +101,8 @@ class ArteTVPlus7IE(InfoExtractor):
l = 'F'
elif lang == 'de':
l = 'A'
+ else:
+ l = lang
regexes = [r'VO?%s' % l, r'VO?.-ST%s' % l]
return any(re.match(r, f['versionCode']) for r in regexes)
# Some formats may not be in the same language as the url
@@ -190,14 +121,19 @@ class ArteTVPlus7IE(InfoExtractor):
return ['HQ', 'MQ', 'EQ', 'SQ'].index(f['quality'])
else:
def sort_key(f):
+ versionCode = f.get('versionCode')
+ if versionCode is None:
+ versionCode = ''
return (
# Sort first by quality
- int(f.get('height',-1)),
- int(f.get('bitrate',-1)),
+ int(f.get('height', -1)),
+ int(f.get('bitrate', -1)),
# The original version with subtitles has lower relevance
- re.match(r'VO-ST(F|A)', f.get('versionCode', '')) is None,
+ re.match(r'VO-ST(F|A)', versionCode) is None,
# The version with sourds/mal subtitles has also lower relevance
- re.match(r'VO?(F|A)-STM\1', f.get('versionCode', '')) is None,
+ re.match(r'VO?(F|A)-STM\1', versionCode) is None,
+ # Prefer http downloads over m3u8
+ 0 if f['url'].endswith('m3u8') else 1,
)
formats = sorted(formats, key=sort_key)
def _format(format_info):
@@ -238,8 +174,9 @@ class ArteTVCreativeIE(ArteTVPlus7IE):
_TEST = {
'url': 'http://creative.arte.tv/de/magazin/agentur-amateur-corporate-design',
- 'file': '050489-002.mp4',
'info_dict': {
+ 'id': '050489-002',
+ 'ext': 'mp4',
'title': 'Agentur Amateur / Agence Amateur #2 : Corporate Design',
},
}
@@ -251,8 +188,9 @@ class ArteTVFutureIE(ArteTVPlus7IE):
_TEST = {
'url': 'http://future.arte.tv/fr/sujet/info-sciences#article-anchor-7081',
- 'file': '050940-003.mp4',
'info_dict': {
+ 'id': '050940-003',
+ 'ext': 'mp4',
'title': 'Les champignons au secours de la planète',
},
}
@@ -266,7 +204,7 @@ class ArteTVFutureIE(ArteTVPlus7IE):
class ArteTVDDCIE(ArteTVPlus7IE):
IE_NAME = 'arte.tv:ddc'
- _VALID_URL = r'http?://ddc\.arte\.tv/(?P<lang>emission|folge)/(?P<id>.+)'
+ _VALID_URL = r'https?://ddc\.arte\.tv/(?P<lang>emission|folge)/(?P<id>.+)'
def _real_extract(self, url):
video_id, lang = self._extract_url_info(url)
@@ -280,3 +218,39 @@ class ArteTVDDCIE(ArteTVPlus7IE):
javascriptPlayerGenerator = self._download_webpage(script_url, video_id, 'Download javascript player generator')
json_url = self._search_regex(r"json_url=(.*)&rendering_place.*", javascriptPlayerGenerator, 'json url')
return self._extract_from_json_url(json_url, video_id, lang)
+
+
+class ArteTVConcertIE(ArteTVPlus7IE):
+ IE_NAME = 'arte.tv:concert'
+ _VALID_URL = r'https?://concert\.arte\.tv/(?P<lang>de|fr)/(?P<id>.+)'
+
+ _TEST = {
+ 'url': 'http://concert.arte.tv/de/notwist-im-pariser-konzertclub-divan-du-monde',
+ 'md5': '9ea035b7bd69696b67aa2ccaaa218161',
+ 'info_dict': {
+ 'id': '186',
+ 'ext': 'mp4',
+ 'title': 'The Notwist im Pariser Konzertclub "Divan du Monde"',
+ 'upload_date': '20140128',
+ 'description': 'md5:486eb08f991552ade77439fe6d82c305',
+ },
+ }
+
+
+class ArteTVEmbedIE(ArteTVPlus7IE):
+ IE_NAME = 'arte.tv:embed'
+ _VALID_URL = r'''(?x)
+ http://www\.arte\.tv
+ /playerv2/embed\.php\?json_url=
+ (?P<json_url>
+ http://arte\.tv/papi/tvguide/videos/stream/player/
+ (?P<lang>[^/]+)/(?P<id>[^/]+)[^&]*
+ )
+ '''
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+ lang = mobj.group('lang')
+ json_url = mobj.group('json_url')
+ return self._extract_from_json_url(json_url, video_id, lang)
diff --git a/youtube_dl/extractor/auengine.py b/youtube_dl/extractor/auengine.py
index c6f30e6..20bf125 100644
--- a/youtube_dl/extractor/auengine.py
+++ b/youtube_dl/extractor/auengine.py
@@ -11,22 +11,24 @@ from ..utils import (
class AUEngineIE(InfoExtractor):
+ _VALID_URL = r'http://(?:www\.)?auengine\.com/embed\.php\?.*?file=(?P<id>[^&]+).*?'
+
_TEST = {
'url': 'http://auengine.com/embed.php?file=lfvlytY6&w=650&h=370',
- 'file': 'lfvlytY6.mp4',
'md5': '48972bdbcf1a3a2f5533e62425b41d4f',
'info_dict': {
+ 'id': 'lfvlytY6',
+ 'ext': 'mp4',
'title': '[Commie]The Legend of the Legendary Heroes - 03 - Replication Eye (Alpha Stigma)[F9410F5A]'
}
}
- _VALID_URL = r'(?:http://)?(?:www\.)?auengine\.com/embed\.php\?.*?file=([^&]+).*?'
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group(1)
+ video_id = mobj.group('id')
+
webpage = self._download_webpage(url, video_id)
- title = self._html_search_regex(r'<title>(?P<title>.+?)</title>',
- webpage, 'title')
+ title = self._html_search_regex(r'<title>(?P<title>.+?)</title>', webpage, 'title')
title = title.strip()
links = re.findall(r'\s(?:file|url):\s*["\']([^\'"]+)["\']', webpage)
links = map(compat_urllib_parse.unquote, links)
@@ -39,14 +41,15 @@ class AUEngineIE(InfoExtractor):
elif '/videos/' in link:
video_url = link
if not video_url:
- raise ExtractorError(u'Could not find video URL')
+ raise ExtractorError('Could not find video URL')
ext = '.' + determine_ext(video_url)
if ext == title[-len(ext):]:
title = title[:-len(ext)]
return {
- 'id': video_id,
- 'url': video_url,
- 'title': title,
+ 'id': video_id,
+ 'url': video_url,
+ 'title': title,
'thumbnail': thumbnail,
+ 'http_referer': 'http://www.auengine.com/flowplayer/flowplayer.commercial-3.2.14.swf',
}
diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py
index 886b0df..dcbbdef 100644
--- a/youtube_dl/extractor/bandcamp.py
+++ b/youtube_dl/extractor/bandcamp.py
@@ -12,14 +12,14 @@ from ..utils import (
class BandcampIE(InfoExtractor):
- _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
+ _VALID_URL = r'https?://.*?\.bandcamp\.com/track/(?P<title>.*)'
_TESTS = [{
'url': 'http://youtube-dl.bandcamp.com/track/youtube-dl-test-song',
'file': '1812978515.mp3',
'md5': 'c557841d5e50261777a6585648adf439',
'info_dict': {
"title": "youtube-dl \"'/\\\u00e4\u21ad - youtube-dl test song \"'/\\\u00e4\u21ad",
- "duration": 10,
+ "duration": 9.8485,
},
'_skip': 'There is a limit of 200 free downloads / month for the test song'
}]
@@ -28,36 +28,32 @@ class BandcampIE(InfoExtractor):
mobj = re.match(self._VALID_URL, url)
title = mobj.group('title')
webpage = self._download_webpage(url, title)
- # We get the link to the free download page
m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
- if m_download is None:
+ if not m_download:
m_trackinfo = re.search(r'trackinfo: (.+),\s*?\n', webpage)
if m_trackinfo:
json_code = m_trackinfo.group(1)
- data = json.loads(json_code)
- d = data[0]
+ data = json.loads(json_code)[0]
- duration = int(round(d['duration']))
formats = []
- for format_id, format_url in d['file'].items():
- ext, _, abr_str = format_id.partition('-')
-
+ for format_id, format_url in data['file'].items():
+ ext, abr_str = format_id.split('-', 1)
formats.append({
'format_id': format_id,
'url': format_url,
- 'ext': format_id.partition('-')[0],
+ 'ext': ext,
'vcodec': 'none',
- 'acodec': format_id.partition('-')[0],
- 'abr': int(format_id.partition('-')[2]),
+ 'acodec': ext,
+ 'abr': int(abr_str),
})
self._sort_formats(formats)
return {
- 'id': compat_str(d['id']),
- 'title': d['title'],
+ 'id': compat_str(data['id']),
+ 'title': data['title'],
'formats': formats,
- 'duration': duration,
+ 'duration': float(data['duration']),
}
else:
raise ExtractorError('No free songs found')
@@ -67,11 +63,9 @@ class BandcampIE(InfoExtractor):
r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
webpage, re.MULTILINE | re.DOTALL).group('id')
- download_webpage = self._download_webpage(download_link, video_id,
- 'Downloading free downloads page')
- # We get the dictionary of the track from some javascrip code
- info = re.search(r'items: (.*?),$',
- download_webpage, re.MULTILINE).group(1)
+ download_webpage = self._download_webpage(download_link, video_id, 'Downloading free downloads page')
+ # We get the dictionary of the track from some javascript code
+ info = re.search(r'items: (.*?),$', download_webpage, re.MULTILINE).group(1)
info = json.loads(info)[0]
# We pick mp3-320 for now, until format selection can be easily implemented.
mp3_info = info['downloads']['mp3-320']
@@ -100,7 +94,7 @@ class BandcampIE(InfoExtractor):
class BandcampAlbumIE(InfoExtractor):
IE_NAME = 'Bandcamp:album'
- _VALID_URL = r'http://.*?\.bandcamp\.com/album/(?P<title>.*)'
+ _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<title>[^?#]+))'
_TEST = {
'url': 'http://blazo.bandcamp.com/album/jazz-format-mixtape-vol-1',
@@ -123,13 +117,15 @@ class BandcampAlbumIE(InfoExtractor):
'params': {
'playlistend': 2
},
- 'skip': 'Bancamp imposes download limits. See test_playlists:test_bandcamp_album for the playlist test'
+ 'skip': 'Bandcamp imposes download limits. See test_playlists:test_bandcamp_album for the playlist test'
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
+ playlist_id = mobj.group('subdomain')
title = mobj.group('title')
- webpage = self._download_webpage(url, title)
+ display_id = title or playlist_id
+ webpage = self._download_webpage(url, display_id)
tracks_paths = re.findall(r'<a href="(.*?)" itemprop="url">', webpage)
if not tracks_paths:
raise ExtractorError('The page doesn\'t contain any tracks')
@@ -139,6 +135,8 @@ class BandcampAlbumIE(InfoExtractor):
title = self._search_regex(r'album_title : "(.*?)"', webpage, 'title')
return {
'_type': 'playlist',
+ 'id': playlist_id,
+ 'display_id': display_id,
'title': title,
'entries': entries,
}
diff --git a/youtube_dl/extractor/bbccouk.py b/youtube_dl/extractor/bbccouk.py
index 6d785c0..75e608f 100644
--- a/youtube_dl/extractor/bbccouk.py
+++ b/youtube_dl/extractor/bbccouk.py
@@ -13,13 +13,13 @@ class BBCCoUkIE(SubtitlesInfoExtractor):
_TESTS = [
{
- 'url': 'http://www.bbc.co.uk/programmes/p01q7wz1',
+ 'url': 'http://www.bbc.co.uk/programmes/b039g8p7',
'info_dict': {
- 'id': 'p01q7wz4',
+ 'id': 'b039d07m',
'ext': 'flv',
- 'title': 'Friction: Blu Mar Ten guest mix: Blu Mar Ten - Guest Mix',
- 'description': 'Blu Mar Ten deliver a Guest Mix for Friction.',
- 'duration': 1936,
+ 'title': 'Kaleidoscope: Leonard Cohen',
+ 'description': 'md5:db4755d7a665ae72343779f7dacb402c',
+ 'duration': 1740,
},
'params': {
# rtmp download
@@ -38,7 +38,8 @@ class BBCCoUkIE(SubtitlesInfoExtractor):
'params': {
# rtmp download
'skip_download': True,
- }
+ },
+ 'skip': 'Episode is no longer available on BBC iPlayer Radio',
},
{
'url': 'http://www.bbc.co.uk/iplayer/episode/b03vhd1f/The_Voice_UK_Series_3_Blind_Auditions_5/',
@@ -161,6 +162,11 @@ class BBCCoUkIE(SubtitlesInfoExtractor):
mobj = re.match(self._VALID_URL, url)
group_id = mobj.group('id')
+ webpage = self._download_webpage(url, group_id, 'Downloading video page')
+ if re.search(r'id="emp-error" class="notinuk">', webpage):
+ raise ExtractorError('Currently BBC iPlayer TV programmes are available to play in the UK only',
+ expected=True)
+
playlist = self._download_xml('http://www.bbc.co.uk/iplayer/playlist/%s' % group_id, group_id,
'Downloading playlist XML')
diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py
new file mode 100644
index 0000000..45067b9
--- /dev/null
+++ b/youtube_dl/extractor/bilibili.py
@@ -0,0 +1,106 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ compat_parse_qs,
+ ExtractorError,
+ int_or_none,
+ unified_strdate,
+)
+
+
+class BiliBiliIE(InfoExtractor):
+ _VALID_URL = r'http://www\.bilibili\.tv/video/av(?P<id>[0-9]+)/'
+
+ _TEST = {
+ 'url': 'http://www.bilibili.tv/video/av1074402/',
+ 'md5': '2c301e4dab317596e837c3e7633e7d86',
+ 'info_dict': {
+ 'id': '1074402',
+ 'ext': 'flv',
+ 'title': '【金坷垃】金泡沫',
+ 'duration': 308,
+ 'upload_date': '20140420',
+ 'thumbnail': 're:^https?://.+\.jpg',
+ },
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+
+ webpage = self._download_webpage(url, video_id)
+ video_code = self._search_regex(
+ r'(?s)<div itemprop="video".*?>(.*?)</div>', webpage, 'video code')
+
+ title = self._html_search_meta(
+ 'media:title', video_code, 'title', fatal=True)
+ duration_str = self._html_search_meta(
+ 'duration', video_code, 'duration')
+ if duration_str is None:
+ duration = None
+ else:
+ duration_mobj = re.match(
+ r'^T(?:(?P<hours>[0-9]+)H)?(?P<minutes>[0-9]+)M(?P<seconds>[0-9]+)S$',
+ duration_str)
+ duration = (
+ int_or_none(duration_mobj.group('hours'), default=0) * 3600 +
+ int(duration_mobj.group('minutes')) * 60 +
+ int(duration_mobj.group('seconds')))
+ upload_date = unified_strdate(self._html_search_meta(
+ 'uploadDate', video_code, fatal=False))
+ thumbnail = self._html_search_meta(
+ 'thumbnailUrl', video_code, 'thumbnail', fatal=False)
+
+ player_params = compat_parse_qs(self._html_search_regex(
+ r'<iframe .*?class="player" src="https://secure.bilibili.tv/secure,([^"]+)"',
+ webpage, 'player params'))
+
+ if 'cid' in player_params:
+ cid = player_params['cid'][0]
+
+ lq_doc = self._download_xml(
+ 'http://interface.bilibili.cn/v_cdn_play?cid=%s' % cid,
+ video_id,
+ note='Downloading LQ video info'
+ )
+ lq_durl = lq_doc.find('.//durl')
+ formats = [{
+ 'format_id': 'lq',
+ 'quality': 1,
+ 'url': lq_durl.find('./url').text,
+ 'filesize': int_or_none(
+ lq_durl.find('./size'), get_attr='text'),
+ }]
+
+ hq_doc = self._download_xml(
+ 'http://interface.bilibili.cn/playurl?cid=%s' % cid,
+ video_id,
+ note='Downloading HQ video info',
+ fatal=False,
+ )
+ if hq_doc is not False:
+ hq_durl = hq_doc.find('.//durl')
+ formats.append({
+ 'format_id': 'hq',
+ 'quality': 2,
+ 'ext': 'flv',
+ 'url': hq_durl.find('./url').text,
+ 'filesize': int_or_none(
+ hq_durl.find('./size'), get_attr='text'),
+ })
+ else:
+ raise ExtractorError('Unsupported player parameters: %r' % (player_params,))
+
+ self._sort_formats(formats)
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'formats': formats,
+ 'duration': duration,
+ 'upload_date': upload_date,
+ 'thumbnail': thumbnail,
+ }
diff --git a/youtube_dl/extractor/blinkx.py b/youtube_dl/extractor/blinkx.py
index 96408e4..38ccd95 100644
--- a/youtube_dl/extractor/blinkx.py
+++ b/youtube_dl/extractor/blinkx.py
@@ -1,6 +1,5 @@
from __future__ import unicode_literals
-import datetime
import json
import re
@@ -19,15 +18,16 @@ class BlinkxIE(InfoExtractor):
'file': '8aQUy7GV.mp4',
'md5': '2e9a07364af40163a908edbf10bb2492',
'info_dict': {
- "title": "Police Car Rolls Away",
- "uploader": "stupidvideos.com",
- "upload_date": "20131215",
- "description": "A police car gently rolls away from a fight. Maybe it felt weird being around a confrontation and just had to get out of there!",
- "duration": 14.886,
- "thumbnails": [{
- "width": 100,
- "height": 76,
- "url": "http://cdn.blinkx.com/stream/b/41/StupidVideos/20131215/1873969261/1873969261_tn_0.jpg",
+ 'title': 'Police Car Rolls Away',
+ 'uploader': 'stupidvideos.com',
+ 'upload_date': '20131215',
+ 'timestamp': 1387068000,
+ 'description': 'A police car gently rolls away from a fight. Maybe it felt weird being around a confrontation and just had to get out of there!',
+ 'duration': 14.886,
+ 'thumbnails': [{
+ 'width': 100,
+ 'height': 76,
+ 'url': 'http://cdn.blinkx.com/stream/b/41/StupidVideos/20131215/1873969261/1873969261_tn_0.jpg',
}],
},
}
@@ -41,9 +41,6 @@ class BlinkxIE(InfoExtractor):
'video=%s' % video_id)
data_json = self._download_webpage(api_url, display_id)
data = json.loads(data_json)['api']['results'][0]
- dt = datetime.datetime.fromtimestamp(data['pubdate_epoch'])
- pload_date = dt.strftime('%Y%m%d')
-
duration = None
thumbnails = []
formats = []
@@ -64,10 +61,7 @@ class BlinkxIE(InfoExtractor):
vcodec = remove_start(m['vcodec'], 'ff')
acodec = remove_start(m['acodec'], 'ff')
tbr = (int(m['vbr']) + int(m['abr'])) // 1000
- format_id = (u'%s-%sk-%s' %
- (vcodec,
- tbr,
- m['w']))
+ format_id = u'%s-%sk-%s' % (vcodec, tbr, m['w'])
formats.append({
'format_id': format_id,
'url': m['link'],
@@ -88,7 +82,7 @@ class BlinkxIE(InfoExtractor):
'title': data['title'],
'formats': formats,
'uploader': data['channel_name'],
- 'upload_date': pload_date,
+ 'timestamp': data['pubdate_epoch'],
'description': data.get('description'),
'thumbnails': thumbnails,
'duration': duration,
diff --git a/youtube_dl/extractor/bliptv.py b/youtube_dl/extractor/bliptv.py
index a26001b..d4da089 100644
--- a/youtube_dl/extractor/bliptv.py
+++ b/youtube_dl/extractor/bliptv.py
@@ -1,102 +1,124 @@
from __future__ import unicode_literals
-import datetime
import re
from .common import InfoExtractor
from .subtitles import SubtitlesInfoExtractor
from ..utils import (
- compat_str,
compat_urllib_request,
-
unescapeHTML,
+ parse_iso8601,
+ compat_urlparse,
+ clean_html,
+ compat_str,
)
class BlipTVIE(SubtitlesInfoExtractor):
- """Information extractor for blip.tv"""
-
- _VALID_URL = r'https?://(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(?P<presumptive_id>.+)$'
-
- _TESTS = [{
- 'url': 'http://blip.tv/cbr/cbr-exclusive-gotham-city-imposters-bats-vs-jokerz-short-3-5796352',
- 'md5': 'c6934ad0b6acf2bd920720ec888eb812',
- 'info_dict': {
- 'id': '5779306',
- 'ext': 'mov',
- 'upload_date': '20111205',
- 'description': 'md5:9bc31f227219cde65e47eeec8d2dc596',
- 'uploader': 'Comic Book Resources - CBR TV',
- 'title': 'CBR EXCLUSIVE: "Gotham City Imposters" Bats VS Jokerz Short 3',
- }
- }, {
- # https://github.com/rg3/youtube-dl/pull/2274
- 'note': 'Video with subtitles',
- 'url': 'http://blip.tv/play/h6Uag5OEVgI.html',
- 'md5': '309f9d25b820b086ca163ffac8031806',
- 'info_dict': {
- 'id': '6586561',
- 'ext': 'mp4',
- 'uploader': 'Red vs. Blue',
- 'description': 'One-Zero-One',
- 'upload_date': '20130614',
- 'title': 'Red vs. Blue Season 11 Episode 1',
+ _VALID_URL = r'https?://(?:\w+\.)?blip\.tv/(?:(?:.+-|rss/flash/)(?P<id>\d+)|((?:play/|api\.swf#)(?P<lookup_id>[\da-zA-Z]+)))'
+
+ _TESTS = [
+ {
+ 'url': 'http://blip.tv/cbr/cbr-exclusive-gotham-city-imposters-bats-vs-jokerz-short-3-5796352',
+ 'md5': 'c6934ad0b6acf2bd920720ec888eb812',
+ 'info_dict': {
+ 'id': '5779306',
+ 'ext': 'mov',
+ 'title': 'CBR EXCLUSIVE: "Gotham City Imposters" Bats VS Jokerz Short 3',
+ 'description': 'md5:9bc31f227219cde65e47eeec8d2dc596',
+ 'timestamp': 1323138843,
+ 'upload_date': '20111206',
+ 'uploader': 'cbr',
+ 'uploader_id': '679425',
+ 'duration': 81,
+ }
+ },
+ {
+ # https://github.com/rg3/youtube-dl/pull/2274
+ 'note': 'Video with subtitles',
+ 'url': 'http://blip.tv/play/h6Uag5OEVgI.html',
+ 'md5': '309f9d25b820b086ca163ffac8031806',
+ 'info_dict': {
+ 'id': '6586561',
+ 'ext': 'mp4',
+ 'title': 'Red vs. Blue Season 11 Episode 1',
+ 'description': 'One-Zero-One',
+ 'timestamp': 1371261608,
+ 'upload_date': '20130615',
+ 'uploader': 'redvsblue',
+ 'uploader_id': '792887',
+ 'duration': 279,
+ }
}
- }]
+ ]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- presumptive_id = mobj.group('presumptive_id')
+ lookup_id = mobj.group('lookup_id')
# See https://github.com/rg3/youtube-dl/issues/857
- embed_mobj = re.match(r'https?://(?:\w+\.)?blip\.tv/(?:play/|api\.swf#)([a-zA-Z0-9]+)', url)
- if embed_mobj:
- info_url = 'http://blip.tv/play/%s.x?p=1' % embed_mobj.group(1)
- info_page = self._download_webpage(info_url, embed_mobj.group(1))
- video_id = self._search_regex(
- r'data-episode-id="([0-9]+)', info_page, 'video_id')
- return self.url_result('http://blip.tv/a/a-' + video_id, 'BlipTV')
-
- cchar = '&' if '?' in url else '?'
- json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
- request = compat_urllib_request.Request(json_url)
- request.add_header('User-Agent', 'iTunes/10.6.1')
-
- json_data = self._download_json(request, video_id=presumptive_id)
-
- if 'Post' in json_data:
- data = json_data['Post']
+ if lookup_id:
+ info_page = self._download_webpage(
+ 'http://blip.tv/play/%s.x?p=1' % lookup_id, lookup_id, 'Resolving lookup id')
+ video_id = self._search_regex(r'data-episode-id="([0-9]+)', info_page, 'video_id')
else:
- data = json_data
+ video_id = mobj.group('id')
+
+ rss = self._download_xml('http://blip.tv/rss/flash/%s' % video_id, video_id, 'Downloading video RSS')
+
+ def blip(s):
+ return '{http://blip.tv/dtd/blip/1.0}%s' % s
+
+ def media(s):
+ return '{http://search.yahoo.com/mrss/}%s' % s
+
+ def itunes(s):
+ return '{http://www.itunes.com/dtds/podcast-1.0.dtd}%s' % s
+
+ item = rss.find('channel/item')
+
+ video_id = item.find(blip('item_id')).text
+ title = item.find('./title').text
+ description = clean_html(compat_str(item.find(blip('puredescription')).text))
+ timestamp = parse_iso8601(item.find(blip('datestamp')).text)
+ uploader = item.find(blip('user')).text
+ uploader_id = item.find(blip('userid')).text
+ duration = int(item.find(blip('runtime')).text)
+ media_thumbnail = item.find(media('thumbnail'))
+ thumbnail = media_thumbnail.get('url') if media_thumbnail is not None else item.find(itunes('image')).text
+ categories = [category.text for category in item.findall('category')]
- video_id = compat_str(data['item_id'])
- upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
- subtitles = {}
formats = []
- if 'additionalMedia' in data:
- for f in data['additionalMedia']:
- if f.get('file_type_srt') == 1:
- LANGS = {
- 'english': 'en',
- }
- lang = f['role'].rpartition('-')[-1].strip().lower()
- langcode = LANGS.get(lang, lang)
- subtitles[langcode] = f['url']
- continue
- if not int(f['media_width']): # filter m3u8
- continue
+ subtitles = {}
+
+ media_group = item.find(media('group'))
+ for media_content in media_group.findall(media('content')):
+ url = media_content.get('url')
+ role = media_content.get(blip('role'))
+ msg = self._download_webpage(
+ url + '?showplayer=20140425131715&referrer=http://blip.tv&mask=7&skin=flashvars&view=url',
+ video_id, 'Resolving URL for %s' % role)
+ real_url = compat_urlparse.parse_qs(msg)['message'][0]
+
+ media_type = media_content.get('type')
+ if media_type == 'text/srt' or url.endswith('.srt'):
+ LANGS = {
+ 'english': 'en',
+ }
+ lang = role.rpartition('-')[-1].strip().lower()
+ langcode = LANGS.get(lang, lang)
+ subtitles[langcode] = url
+ elif media_type.startswith('video/'):
formats.append({
- 'url': f['url'],
- 'format_id': f['role'],
- 'width': int(f['media_width']),
- 'height': int(f['media_height']),
+ 'url': real_url,
+ 'format_id': role,
+ 'format_note': media_type,
+ 'vcodec': media_content.get(blip('vcodec')),
+ 'acodec': media_content.get(blip('acodec')),
+ 'filesize': media_content.get('filesize'),
+ 'width': int(media_content.get('width')),
+ 'height': int(media_content.get('height')),
})
- else:
- formats.append({
- 'url': data['media']['url'],
- 'width': int(data['media']['width']),
- 'height': int(data['media']['height']),
- })
self._sort_formats(formats)
# subtitles
@@ -107,12 +129,14 @@ class BlipTVIE(SubtitlesInfoExtractor):
return {
'id': video_id,
- 'uploader': data['display_name'],
- 'upload_date': upload_date,
- 'title': data['title'],
- 'thumbnail': data['thumbnailUrl'],
- 'description': data['description'],
- 'user_agent': 'iTunes/10.6.1',
+ 'title': title,
+ 'description': description,
+ 'timestamp': timestamp,
+ 'uploader': uploader,
+ 'uploader_id': uploader_id,
+ 'duration': duration,
+ 'thumbnail': thumbnail,
+ 'categories': categories,
'formats': formats,
'subtitles': video_subtitles,
}
diff --git a/youtube_dl/extractor/bloomberg.py b/youtube_dl/extractor/bloomberg.py
index 2415ce4..25fb79e 100644
--- a/youtube_dl/extractor/bloomberg.py
+++ b/youtube_dl/extractor/bloomberg.py
@@ -1,22 +1,21 @@
+from __future__ import unicode_literals
+
import re
from .common import InfoExtractor
-from .ooyala import OoyalaIE
class BloombergIE(InfoExtractor):
_VALID_URL = r'https?://www\.bloomberg\.com/video/(?P<name>.+?)\.html'
_TEST = {
- u'url': u'http://www.bloomberg.com/video/shah-s-presentation-on-foreign-exchange-strategies-qurhIVlJSB6hzkVi229d8g.html',
- u'file': u'12bzhqZTqQHmmlA8I-i0NpzJgcG5NNYX.mp4',
- u'info_dict': {
- u'title': u'Shah\'s Presentation on Foreign-Exchange Strategies',
- u'description': u'md5:abc86e5236f9f0e4866c59ad36736686',
- },
- u'params': {
- # Requires ffmpeg (m3u8 manifest)
- u'skip_download': True,
+ 'url': 'http://www.bloomberg.com/video/shah-s-presentation-on-foreign-exchange-strategies-qurhIVlJSB6hzkVi229d8g.html',
+ 'md5': '7bf08858ff7c203c870e8a6190e221e5',
+ 'info_dict': {
+ 'id': 'qurhIVlJSB6hzkVi229d8g',
+ 'ext': 'flv',
+ 'title': 'Shah\'s Presentation on Foreign-Exchange Strategies',
+ 'description': 'md5:0681e0d30dcdfc6abf34594961d8ea88',
},
}
@@ -24,7 +23,16 @@ class BloombergIE(InfoExtractor):
mobj = re.match(self._VALID_URL, url)
name = mobj.group('name')
webpage = self._download_webpage(url, name)
- embed_code = self._search_regex(
- r'<source src="https?://[^/]+/[^/]+/[^/]+/([^/]+)', webpage,
- 'embed code')
- return OoyalaIE._build_url_result(embed_code)
+ f4m_url = self._search_regex(
+ r'<source src="(https?://[^"]+\.f4m.*?)"', webpage,
+ 'f4m url')
+ title = re.sub(': Video$', '', self._og_search_title(webpage))
+
+ return {
+ 'id': name.split('-')[-1],
+ 'title': title,
+ 'url': f4m_url,
+ 'ext': 'flv',
+ 'description': self._og_search_description(webpage),
+ 'thumbnail': self._og_search_thumbnail(webpage),
+ }
diff --git a/youtube_dl/extractor/br.py b/youtube_dl/extractor/br.py
new file mode 100644
index 0000000..b5b56ff
--- /dev/null
+++ b/youtube_dl/extractor/br.py
@@ -0,0 +1,139 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ int_or_none,
+)
+
+
+class BRIE(InfoExtractor):
+ IE_DESC = 'Bayerischer Rundfunk Mediathek'
+ _VALID_URL = r'https?://(?:www\.)?br\.de/(?:[a-z0-9\-]+/)+(?P<id>[a-z0-9\-]+)\.html'
+ _BASE_URL = 'http://www.br.de'
+
+ _TESTS = [
+ {
+ 'url': 'http://www.br.de/mediathek/video/anselm-gruen-114.html',
+ 'md5': 'c4f83cf0f023ba5875aba0bf46860df2',
+ 'info_dict': {
+ 'id': '2c8d81c5-6fb7-4a74-88d4-e768e5856532',
+ 'ext': 'mp4',
+ 'title': 'Feiern und Verzichten',
+ 'description': 'Anselm Grün: Feiern und Verzichten',
+ 'uploader': 'BR/Birgit Baier',
+ 'upload_date': '20140301',
+ }
+ },
+ {
+ 'url': 'http://www.br.de/mediathek/video/sendungen/unter-unserem-himmel/unter-unserem-himmel-alpen-ueber-den-pass-100.html',
+ 'md5': 'ab451b09d861dbed7d7cc9ab0be19ebe',
+ 'info_dict': {
+ 'id': '2c060e69-3a27-4e13-b0f0-668fac17d812',
+ 'ext': 'mp4',
+ 'title': 'Über den Pass',
+ 'description': 'Die Eroberung der Alpen: Über den Pass',
+ }
+ },
+ {
+ 'url': 'http://www.br.de/nachrichten/schaeuble-haushaltsentwurf-bundestag-100.html',
+ 'md5': '3db0df1a9a9cd9fa0c70e6ea8aa8e820',
+ 'info_dict': {
+ 'id': 'c6aae3de-2cf9-43f2-957f-f17fef9afaab',
+ 'ext': 'aac',
+ 'title': '"Keine neuen Schulden im nächsten Jahr"',
+ 'description': 'Haushaltsentwurf: "Keine neuen Schulden im nächsten Jahr"',
+ }
+ },
+ {
+ 'url': 'http://www.br.de/radio/bayern1/service/team/videos/team-video-erdelt100.html',
+ 'md5': 'dbab0aef2e047060ea7a21fc1ce1078a',
+ 'info_dict': {
+ 'id': '6ba73750-d405-45d3-861d-1ce8c524e059',
+ 'ext': 'mp4',
+ 'title': 'Umweltbewusster Häuslebauer',
+ 'description': 'Uwe Erdelt: Umweltbewusster Häuslebauer',
+ }
+ },
+ {
+ 'url': 'http://www.br.de/fernsehen/br-alpha/sendungen/kant-fuer-anfaenger/kritik-der-reinen-vernunft/kant-kritik-01-metaphysik100.html',
+ 'md5': '23bca295f1650d698f94fc570977dae3',
+ 'info_dict': {
+ 'id': 'd982c9ce-8648-4753-b358-98abb8aec43d',
+ 'ext': 'mp4',
+ 'title': 'Folge 1 - Metaphysik',
+ 'description': 'Kant für Anfänger: Folge 1 - Metaphysik',
+ 'uploader': 'Eva Maria Steimle',
+ 'upload_date': '20140117',
+ }
+ },
+ ]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ display_id = mobj.group('id')
+ page = self._download_webpage(url, display_id)
+ xml_url = self._search_regex(
+ r"return BRavFramework\.register\(BRavFramework\('avPlayer_(?:[a-f0-9-]{36})'\)\.setup\({dataURL:'(/(?:[a-z0-9\-]+/)+[a-z0-9/~_.-]+)'}\)\);", page, 'XMLURL')
+ xml = self._download_xml(self._BASE_URL + xml_url, None)
+
+ medias = []
+
+ for xml_media in xml.findall('video') + xml.findall('audio'):
+ media = {
+ 'id': xml_media.get('externalId'),
+ 'title': xml_media.find('title').text,
+ 'formats': self._extract_formats(xml_media.find('assets')),
+ 'thumbnails': self._extract_thumbnails(xml_media.find('teaserImage/variants')),
+ 'description': ' '.join(xml_media.find('shareTitle').text.splitlines()),
+ 'webpage_url': xml_media.find('permalink').text
+ }
+ if xml_media.find('author').text:
+ media['uploader'] = xml_media.find('author').text
+ if xml_media.find('broadcastDate').text:
+ media['upload_date'] = ''.join(reversed(xml_media.find('broadcastDate').text.split('.')))
+ medias.append(media)
+
+ if len(medias) > 1:
+ self._downloader.report_warning(
+ 'found multiple medias; please '
+ 'report this with the video URL to http://yt-dl.org/bug')
+ if not medias:
+ raise ExtractorError('No media entries found')
+ return medias[0]
+
+ def _extract_formats(self, assets):
+
+ def text_or_none(asset, tag):
+ elem = asset.find(tag)
+ return None if elem is None else elem.text
+
+ formats = [{
+ 'url': text_or_none(asset, 'downloadUrl'),
+ 'ext': text_or_none(asset, 'mediaType'),
+ 'format_id': asset.get('type'),
+ 'width': int_or_none(text_or_none(asset, 'frameWidth')),
+ 'height': int_or_none(text_or_none(asset, 'frameHeight')),
+ 'tbr': int_or_none(text_or_none(asset, 'bitrateVideo')),
+ 'abr': int_or_none(text_or_none(asset, 'bitrateAudio')),
+ 'vcodec': text_or_none(asset, 'codecVideo'),
+ 'acodec': text_or_none(asset, 'codecAudio'),
+ 'container': text_or_none(asset, 'mediaType'),
+ 'filesize': int_or_none(text_or_none(asset, 'size')),
+ } for asset in assets.findall('asset')
+ if asset.find('downloadUrl') is not None]
+
+ self._sort_formats(formats)
+ return formats
+
+ def _extract_thumbnails(self, variants):
+ thumbnails = [{
+ 'url': self._BASE_URL + variant.find('url').text,
+ 'width': int_or_none(variant.find('width').text),
+ 'height': int_or_none(variant.find('height').text),
+ } for variant in variants.findall('variant')]
+ thumbnails.sort(key=lambda x: x['width'] * x['height'], reverse=True)
+ return thumbnails
diff --git a/youtube_dl/extractor/breakcom.py b/youtube_dl/extractor/breakcom.py
index 8ec6dda..1bfc9f3 100644
--- a/youtube_dl/extractor/breakcom.py
+++ b/youtube_dl/extractor/breakcom.py
@@ -23,13 +23,14 @@ class BreakIE(InfoExtractor):
video_id = mobj.group(1).split("-")[-1]
embed_url = 'http://www.break.com/embed/%s' % video_id
webpage = self._download_webpage(embed_url, video_id)
- info_json = self._search_regex(r'var embedVars = ({.*?});', webpage,
- 'info json', flags=re.DOTALL)
+ info_json = self._search_regex(r'var embedVars = ({.*})\s*?</script>',
+ webpage, 'info json', flags=re.DOTALL)
info = json.loads(info_json)
video_url = info['videoUri']
- m_youtube = re.search(r'(https?://www\.youtube\.com/watch\?v=.*)', video_url)
- if m_youtube is not None:
- return self.url_result(m_youtube.group(1), 'Youtube')
+ youtube_id = info.get('youtubeId')
+ if youtube_id:
+ return self.url_result(youtube_id, 'Youtube')
+
final_url = video_url + '?' + info['AuthToken']
return {
'id': video_id,
diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py
index 83eec84..3c02c29 100644
--- a/youtube_dl/extractor/brightcove.py
+++ b/youtube_dl/extractor/brightcove.py
@@ -87,7 +87,7 @@ class BrightcoveIE(InfoExtractor):
object_str = object_str.replace('<--', '<!--')
object_str = fix_xml_ampersands(object_str)
- object_doc = xml.etree.ElementTree.fromstring(object_str)
+ object_doc = xml.etree.ElementTree.fromstring(object_str.encode('utf-8'))
fv_el = find_xpath_attr(object_doc, './param', 'name', 'flashVars')
if fv_el is not None:
@@ -140,7 +140,11 @@ class BrightcoveIE(InfoExtractor):
url_m = re.search(r'<meta\s+property="og:video"\s+content="(http://c.brightcove.com/[^"]+)"', webpage)
if url_m:
- return [unescapeHTML(url_m.group(1))]
+ url = unescapeHTML(url_m.group(1))
+ # Some sites don't add it, we can't download with this url, for example:
+ # http://www.ktvu.com/videos/news/raw-video-caltrain-releases-video-of-man-almost/vCTZdY/
+ if 'playerKey' in url:
+ return [url]
matches = re.findall(
r'''(?sx)<object
diff --git a/youtube_dl/extractor/byutv.py b/youtube_dl/extractor/byutv.py
new file mode 100644
index 0000000..cf19b7b
--- /dev/null
+++ b/youtube_dl/extractor/byutv.py
@@ -0,0 +1,48 @@
+from __future__ import unicode_literals
+
+import json
+import re
+
+from .common import InfoExtractor
+from ..utils import ExtractorError
+
+
+class BYUtvIE(InfoExtractor):
+ _VALID_URL = r'^https?://(?:www\.)?byutv.org/watch/[0-9a-f-]+/(?P<video_id>[^/?#]+)'
+ _TEST = {
+ 'url': 'http://www.byutv.org/watch/44e80f7b-e3ba-43ba-8c51-b1fd96c94a79/granite-flats-talking',
+ 'info_dict': {
+ 'id': 'granite-flats-talking',
+ 'ext': 'mp4',
+ 'description': 'md5:4e9a7ce60f209a33eca0ac65b4918e1c',
+ 'title': 'Talking',
+ 'thumbnail': 're:^https?://.*promo.*'
+ },
+ 'params': {
+ 'skip_download': True,
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('video_id')
+
+ webpage = self._download_webpage(url, video_id)
+ episode_code = self._search_regex(
+ r'(?s)episode:(.*?\}),\s*\n', webpage, 'episode information')
+ episode_json = re.sub(
+ r'(\n\s+)([a-zA-Z]+):\s+\'(.*?)\'', r'\1"\2": "\3"', episode_code)
+ ep = json.loads(episode_json)
+
+ if ep['providerType'] == 'Ooyala':
+ return {
+ '_type': 'url_transparent',
+ 'ie_key': 'Ooyala',
+ 'url': 'ooyala:%s' % ep['providerId'],
+ 'id': video_id,
+ 'title': ep['title'],
+ 'description': ep.get('description'),
+ 'thumbnail': ep.get('imageThumbnail'),
+ }
+ else:
+ raise ExtractorError('Unsupported provider %s' % ep['provider'])
diff --git a/youtube_dl/extractor/c56.py b/youtube_dl/extractor/c56.py
index 690bc7c..cb96c38 100644
--- a/youtube_dl/extractor/c56.py
+++ b/youtube_dl/extractor/c56.py
@@ -2,39 +2,46 @@
from __future__ import unicode_literals
import re
-import json
from .common import InfoExtractor
class C56IE(InfoExtractor):
- _VALID_URL = r'https?://((www|player)\.)?56\.com/(.+?/)?(v_|(play_album.+-))(?P<textid>.+?)\.(html|swf)'
+ _VALID_URL = r'https?://(?:(?:www|player)\.)?56\.com/(?:.+?/)?(?:v_|(?:play_album.+-))(?P<textid>.+?)\.(?:html|swf)'
IE_NAME = '56.com'
_TEST = {
'url': 'http://www.56.com/u39/v_OTM0NDA3MTY.html',
- 'file': '93440716.flv',
'md5': 'e59995ac63d0457783ea05f93f12a866',
'info_dict': {
+ 'id': '93440716',
+ 'ext': 'flv',
'title': '网事知多少 第32期:车怒',
+ 'duration': 283.813,
},
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url, flags=re.VERBOSE)
text_id = mobj.group('textid')
- info_page = self._download_webpage('http://vxml.56.com/json/%s/' % text_id,
- text_id, 'Downloading video info')
- info = json.loads(info_page)['info']
- formats = [{
- 'format_id': f['type'],
- 'filesize': int(f['filesize']),
- 'url': f['url']
- } for f in info['rfiles']]
+
+ page = self._download_json(
+ 'http://vxml.56.com/json/%s/' % text_id, text_id, 'Downloading video info')
+
+ info = page['info']
+
+ formats = [
+ {
+ 'format_id': f['type'],
+ 'filesize': int(f['filesize']),
+ 'url': f['url']
+ } for f in info['rfiles']
+ ]
self._sort_formats(formats)
return {
'id': info['vid'],
'title': info['Subject'],
+ 'duration': int(info['duration']) / 1000.0,
'formats': formats,
'thumbnail': info.get('bimg') or info.get('img'),
}
diff --git a/youtube_dl/extractor/canal13cl.py b/youtube_dl/extractor/canal13cl.py
new file mode 100644
index 0000000..93241fe
--- /dev/null
+++ b/youtube_dl/extractor/canal13cl.py
@@ -0,0 +1,48 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class Canal13clIE(InfoExtractor):
+ _VALID_URL = r'^http://(?:www\.)?13\.cl/(?:[^/?#]+/)*(?P<id>[^/?#]+)'
+ _TEST = {
+ 'url': 'http://www.13.cl/t13/nacional/el-circulo-de-hierro-de-michelle-bachelet-en-su-regreso-a-la-moneda',
+ 'md5': '4cb1fa38adcad8fea88487a078831755',
+ 'info_dict': {
+ 'id': '1403022125',
+ 'display_id': 'el-circulo-de-hierro-de-michelle-bachelet-en-su-regreso-a-la-moneda',
+ 'ext': 'mp4',
+ 'title': 'El "círculo de hierro" de Michelle Bachelet en su regreso a La Moneda',
+ 'description': '(Foto: Agencia Uno) En nueve días más, Michelle Bachelet va a asumir por segunda vez como presidenta de la República. Entre aquellos que la acompañarán hay caras que se repiten y otras que se consolidan en su entorno de colaboradores más cercanos.',
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ display_id = mobj.group('id')
+
+ webpage = self._download_webpage(url, display_id)
+
+ title = self._html_search_meta(
+ 'twitter:title', webpage, 'title', fatal=True)
+ description = self._html_search_meta(
+ 'twitter:description', webpage, 'description')
+ url = self._html_search_regex(
+ r'articuloVideo = \"(.*?)\"', webpage, 'url')
+ real_id = self._search_regex(
+ r'[^0-9]([0-9]{7,})[^0-9]', url, 'id', default=display_id)
+ thumbnail = self._html_search_regex(
+ r'articuloImagen = \"(.*?)\"', webpage, 'thumbnail')
+
+ return {
+ 'id': real_id,
+ 'display_id': display_id,
+ 'url': url,
+ 'title': title,
+ 'description': description,
+ 'ext': 'mp4',
+ 'thumbnail': thumbnail,
+ }
diff --git a/youtube_dl/extractor/canalc2.py b/youtube_dl/extractor/canalc2.py
index 3d8d7f9..c4fefef 100644
--- a/youtube_dl/extractor/canalc2.py
+++ b/youtube_dl/extractor/canalc2.py
@@ -1,4 +1,6 @@
# coding: utf-8
+from __future__ import unicode_literals
+
import re
from .common import InfoExtractor
@@ -9,11 +11,12 @@ class Canalc2IE(InfoExtractor):
_VALID_URL = r'http://.*?\.canalc2\.tv/video\.asp\?.*?idVideo=(?P<id>\d+)'
_TEST = {
- u'url': u'http://www.canalc2.tv/video.asp?idVideo=12163&voir=oui',
- u'file': u'12163.mp4',
- u'md5': u'060158428b650f896c542dfbb3d6487f',
- u'info_dict': {
- u'title': u'Terrasses du Numérique'
+ 'url': 'http://www.canalc2.tv/video.asp?idVideo=12163&voir=oui',
+ 'md5': '060158428b650f896c542dfbb3d6487f',
+ 'info_dict': {
+ 'id': '12163',
+ 'ext': 'mp4',
+ 'title': 'Terrasses du Numérique'
}
}
@@ -28,10 +31,11 @@ class Canalc2IE(InfoExtractor):
video_url = 'http://vod-flash.u-strasbg.fr:8080/' + file_name
title = self._html_search_regex(
- r'class="evenement8">(.*?)</a>', webpage, u'title')
-
- return {'id': video_id,
- 'ext': 'mp4',
- 'url': video_url,
- 'title': title,
- }
+ r'class="evenement8">(.*?)</a>', webpage, 'title')
+
+ return {
+ 'id': video_id,
+ 'ext': 'mp4',
+ 'url': video_url,
+ 'title': title,
+ }
diff --git a/youtube_dl/extractor/canalplus.py b/youtube_dl/extractor/canalplus.py
index 7cdcd83..0202078 100644
--- a/youtube_dl/extractor/canalplus.py
+++ b/youtube_dl/extractor/canalplus.py
@@ -1,53 +1,72 @@
# encoding: utf-8
+from __future__ import unicode_literals
+
import re
from .common import InfoExtractor
-from ..utils import unified_strdate
+from ..utils import (
+ unified_strdate,
+ url_basename,
+)
class CanalplusIE(InfoExtractor):
- _VALID_URL = r'https?://(www\.canalplus\.fr/.*?/(?P<path>.*)|player\.canalplus\.fr/#/(?P<id>\d+))'
+ _VALID_URL = r'https?://(?:www\.canalplus\.fr/.*?/(?P<path>.*)|player\.canalplus\.fr/#/(?P<id>[0-9]+))'
_VIDEO_INFO_TEMPLATE = 'http://service.canal-plus.com/video/rest/getVideosLiees/cplus/%s'
- IE_NAME = u'canalplus.fr'
+ IE_NAME = 'canalplus.fr'
_TEST = {
- u'url': u'http://www.canalplus.fr/c-infos-documentaires/pid1830-c-zapping.html?vid=922470',
- u'file': u'922470.flv',
- u'info_dict': {
- u'title': u'Zapping - 26/08/13',
- u'description': u'Le meilleur de toutes les chaînes, tous les jours.\nEmission du 26 août 2013',
- u'upload_date': u'20130826',
- },
- u'params': {
- u'skip_download': True,
+ 'url': 'http://www.canalplus.fr/c-infos-documentaires/pid1830-c-zapping.html?vid=922470',
+ 'md5': '3db39fb48b9685438ecf33a1078023e4',
+ 'info_dict': {
+ 'id': '922470',
+ 'ext': 'flv',
+ 'title': 'Zapping - 26/08/13',
+ 'description': 'Le meilleur de toutes les chaînes, tous les jours.\nEmission du 26 août 2013',
+ 'upload_date': '20130826',
},
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.groupdict().get('id')
+
+ # Beware, some subclasses do not define an id group
+ display_id = url_basename(mobj.group('path'))
+
if video_id is None:
- webpage = self._download_webpage(url, mobj.group('path'))
- video_id = self._search_regex(r'videoId = "(\d+)";', webpage, u'video id')
+ webpage = self._download_webpage(url, display_id)
+ video_id = self._search_regex(r'<canal:player videoId="(\d+)"', webpage, 'video id')
+
info_url = self._VIDEO_INFO_TEMPLATE % video_id
- doc = self._download_xml(info_url,video_id,
- u'Downloading video info')
+ doc = self._download_xml(info_url, video_id, 'Downloading video XML')
- self.report_extraction(video_id)
video_info = [video for video in doc if video.find('ID').text == video_id][0]
- infos = video_info.find('INFOS')
media = video_info.find('MEDIA')
- formats = [media.find('VIDEOS/%s' % format)
- for format in ['BAS_DEBIT', 'HAUT_DEBIT', 'HD']]
- video_url = [format.text for format in formats if format is not None][-1]
-
- return {'id': video_id,
- 'title': u'%s - %s' % (infos.find('TITRAGE/TITRE').text,
- infos.find('TITRAGE/SOUS_TITRE').text),
- 'url': video_url,
- 'ext': 'flv',
- 'upload_date': unified_strdate(infos.find('PUBLICATION/DATE').text),
- 'thumbnail': media.find('IMAGES/GRAND').text,
- 'description': infos.find('DESCRIPTION').text,
- 'view_count': int(infos.find('NB_VUES').text),
- }
+ infos = video_info.find('INFOS')
+
+ preferences = ['MOBILE', 'BAS_DEBIT', 'HAUT_DEBIT', 'HD', 'HLS', 'HDS']
+
+ formats = [
+ {
+ 'url': fmt.text + '?hdcore=2.11.3' if fmt.tag == 'HDS' else fmt.text,
+ 'format_id': fmt.tag,
+ 'ext': 'mp4' if fmt.tag == 'HLS' else 'flv',
+ 'preference': preferences.index(fmt.tag) if fmt.tag in preferences else -1,
+ } for fmt in media.find('VIDEOS') if fmt.text
+ ]
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'display_id': display_id,
+ 'title': '%s - %s' % (infos.find('TITRAGE/TITRE').text,
+ infos.find('TITRAGE/SOUS_TITRE').text),
+ 'upload_date': unified_strdate(infos.find('PUBLICATION/DATE').text),
+ 'thumbnail': media.find('IMAGES/GRAND').text,
+ 'description': infos.find('DESCRIPTION').text,
+ 'view_count': int(infos.find('NB_VUES').text),
+ 'like_count': int(infos.find('NB_LIKES').text),
+ 'comment_count': int(infos.find('NB_COMMENTS').text),
+ 'formats': formats,
+ } \ No newline at end of file
diff --git a/youtube_dl/extractor/cbsnews.py b/youtube_dl/extractor/cbsnews.py
new file mode 100644
index 0000000..0bce793
--- /dev/null
+++ b/youtube_dl/extractor/cbsnews.py
@@ -0,0 +1,87 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+import re
+import json
+
+from .common import InfoExtractor
+
+
+class CBSNewsIE(InfoExtractor):
+ IE_DESC = 'CBS News'
+ _VALID_URL = r'http://(?:www\.)?cbsnews\.com/(?:[^/]+/)+(?P<id>[\da-z_-]+)'
+
+ _TESTS = [
+ {
+ 'url': 'http://www.cbsnews.com/news/tesla-and-spacex-elon-musks-industrial-empire/',
+ 'info_dict': {
+ 'id': 'tesla-and-spacex-elon-musks-industrial-empire',
+ 'ext': 'flv',
+ 'title': 'Tesla and SpaceX: Elon Musk\'s industrial empire',
+ 'thumbnail': 'http://beta.img.cbsnews.com/i/2014/03/30/60147937-2f53-4565-ad64-1bdd6eb64679/60-0330-pelley-640x360.jpg',
+ 'duration': 791,
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ },
+ {
+ 'url': 'http://www.cbsnews.com/videos/fort-hood-shooting-army-downplays-mental-illness-as-cause-of-attack/',
+ 'info_dict': {
+ 'id': 'fort-hood-shooting-army-downplays-mental-illness-as-cause-of-attack',
+ 'ext': 'flv',
+ 'title': 'Fort Hood shooting: Army downplays mental illness as cause of attack',
+ 'thumbnail': 'http://cbsnews2.cbsistatic.com/hub/i/r/2014/04/04/0c9fbc66-576b-41ca-8069-02d122060dd2/thumbnail/140x90/6dad7a502f88875ceac38202984b6d58/en-0404-werner-replace-640x360.jpg',
+ 'duration': 205,
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ },
+ ]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+
+ webpage = self._download_webpage(url, video_id)
+
+ video_info = json.loads(self._html_search_regex(
+ r'(?:<ul class="media-list items" id="media-related-items"><li data-video-info|<div id="cbsNewsVideoPlayer" data-video-player-options)=\'({.+?})\'',
+ webpage, 'video JSON info'))
+
+ item = video_info['item'] if 'item' in video_info else video_info
+ title = item.get('articleTitle') or item.get('hed')
+ duration = item.get('duration')
+ thumbnail = item.get('mediaImage') or item.get('thumbnail')
+
+ formats = []
+ for format_id in ['RtmpMobileLow', 'RtmpMobileHigh', 'Hls', 'RtmpDesktop']:
+ uri = item.get('media' + format_id + 'URI')
+ if not uri:
+ continue
+ fmt = {
+ 'url': uri,
+ 'format_id': format_id,
+ }
+ if uri.startswith('rtmp'):
+ fmt.update({
+ 'app': 'ondemand?auth=cbs',
+ 'play_path': 'mp4:' + uri.split('<break>')[-1],
+ 'player_url': 'http://www.cbsnews.com/[[IMPORT]]/vidtech.cbsinteractive.com/player/3_3_0/CBSI_PLAYER_HD.swf',
+ 'page_url': 'http://www.cbsnews.com',
+ 'ext': 'flv',
+ })
+ elif uri.endswith('.m3u8'):
+ fmt['ext'] = 'mp4'
+ formats.append(fmt)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'thumbnail': thumbnail,
+ 'duration': duration,
+ 'formats': formats,
+ } \ No newline at end of file
diff --git a/youtube_dl/extractor/ceskatelevize.py b/youtube_dl/extractor/ceskatelevize.py
new file mode 100644
index 0000000..90a3ddd
--- /dev/null
+++ b/youtube_dl/extractor/ceskatelevize.py
@@ -0,0 +1,126 @@
+# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ compat_urllib_request,
+ compat_urllib_parse,
+ compat_urllib_parse_urlparse,
+ ExtractorError,
+)
+
+
+class CeskaTelevizeIE(InfoExtractor):
+ _VALID_URL = r'https?://www\.ceskatelevize\.cz/(porady|ivysilani)/(.+/)?(?P<id>[^?#]+)'
+
+ _TESTS = [
+ {
+ 'url': 'http://www.ceskatelevize.cz/ivysilani/10532695142-prvni-republika/213512120230004-spanelska-chripka',
+ 'info_dict': {
+ 'id': '213512120230004',
+ 'ext': 'flv',
+ 'title': 'První republika: Španělská chřipka',
+ 'duration': 3107.4,
+ },
+ 'params': {
+ 'skip_download': True, # requires rtmpdump
+ },
+ 'skip': 'Works only from Czech Republic.',
+ },
+ {
+ 'url': 'http://www.ceskatelevize.cz/ivysilani/1030584952-tsatsiki-maminka-a-policajt',
+ 'info_dict': {
+ 'id': '20138143440',
+ 'ext': 'flv',
+ 'title': 'Tsatsiki, maminka a policajt',
+ 'duration': 6754.1,
+ },
+ 'params': {
+ 'skip_download': True, # requires rtmpdump
+ },
+ 'skip': 'Works only from Czech Republic.',
+ },
+ {
+ 'url': 'http://www.ceskatelevize.cz/ivysilani/10532695142-prvni-republika/bonus/14716-zpevacka-z-duparny-bobina',
+ 'info_dict': {
+ 'id': '14716',
+ 'ext': 'flv',
+ 'title': 'První republika: Zpěvačka z Dupárny Bobina',
+ 'duration': 90,
+ },
+ 'params': {
+ 'skip_download': True, # requires rtmpdump
+ },
+ },
+ ]
+
+ def _real_extract(self, url):
+ url = url.replace('/porady/', '/ivysilani/').replace('/video/', '')
+
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+
+ webpage = self._download_webpage(url, video_id)
+
+ NOT_AVAILABLE_STRING = 'This content is not available at your territory due to limited copyright.'
+ if '%s</p>' % NOT_AVAILABLE_STRING in webpage:
+ raise ExtractorError(NOT_AVAILABLE_STRING, expected=True)
+
+ typ = self._html_search_regex(r'getPlaylistUrl\(\[\{"type":"(.+?)","id":".+?"\}\],', webpage, 'type')
+ episode_id = self._html_search_regex(r'getPlaylistUrl\(\[\{"type":".+?","id":"(.+?)"\}\],', webpage, 'episode_id')
+
+ data = {
+ 'playlist[0][type]': typ,
+ 'playlist[0][id]': episode_id,
+ 'requestUrl': compat_urllib_parse_urlparse(url).path,
+ 'requestSource': 'iVysilani',
+ }
+
+ req = compat_urllib_request.Request('http://www.ceskatelevize.cz/ivysilani/ajax/get-playlist-url',
+ data=compat_urllib_parse.urlencode(data))
+
+ req.add_header('Content-type', 'application/x-www-form-urlencoded')
+ req.add_header('x-addr', '127.0.0.1')
+ req.add_header('X-Requested-With', 'XMLHttpRequest')
+ req.add_header('Referer', url)
+
+ playlistpage = self._download_json(req, video_id)
+
+ req = compat_urllib_request.Request(compat_urllib_parse.unquote(playlistpage['url']))
+ req.add_header('Referer', url)
+
+ playlist = self._download_xml(req, video_id)
+
+ formats = []
+ for i in playlist.find('smilRoot/body'):
+ if 'AD' not in i.attrib['id']:
+ base_url = i.attrib['base']
+ parsedurl = compat_urllib_parse_urlparse(base_url)
+ duration = i.attrib['duration']
+
+ for video in i.findall('video'):
+ if video.attrib['label'] != 'AD':
+ format_id = video.attrib['label']
+ play_path = video.attrib['src']
+ vbr = int(video.attrib['system-bitrate'])
+
+ formats.append({
+ 'format_id': format_id,
+ 'url': base_url,
+ 'vbr': vbr,
+ 'play_path': play_path,
+ 'app': parsedurl.path[1:] + '?' + parsedurl.query,
+ 'rtmp_live': True,
+ 'ext': 'flv',
+ })
+
+ self._sort_formats(formats)
+
+ return {
+ 'id': episode_id,
+ 'title': self._html_search_regex(r'<title>(.+?) — iVysílání — Česká televize</title>', webpage, 'title'),
+ 'duration': float(duration),
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/cinemassacre.py b/youtube_dl/extractor/cinemassacre.py
index f0d08ce..496271b 100644
--- a/youtube_dl/extractor/cinemassacre.py
+++ b/youtube_dl/extractor/cinemassacre.py
@@ -1,84 +1,94 @@
# encoding: utf-8
+from __future__ import unicode_literals
+
import re
from .common import InfoExtractor
from ..utils import (
ExtractorError,
+ int_or_none,
)
class CinemassacreIE(InfoExtractor):
- _VALID_URL = r'(?:http://)?(?:www\.)?(?P<url>cinemassacre\.com/(?P<date_Y>[0-9]{4})/(?P<date_m>[0-9]{2})/(?P<date_d>[0-9]{2})/.+?)(?:[/?].*)?'
- _TESTS = [{
- u'url': u'http://cinemassacre.com/2012/11/10/avgn-the-movie-trailer/',
- u'file': u'19911.flv',
- u'info_dict': {
- u'upload_date': u'20121110',
- u'title': u'“Angry Video Game Nerd: The Movie” – Trailer',
- u'description': u'md5:fb87405fcb42a331742a0dce2708560b',
- },
- u'params': {
- # rtmp download
- u'skip_download': True,
- },
- },
- {
- u'url': u'http://cinemassacre.com/2013/10/02/the-mummys-hand-1940',
- u'file': u'521be8ef82b16.flv',
- u'info_dict': {
- u'upload_date': u'20131002',
- u'title': u'The Mummy’s Hand (1940)',
- },
- u'params': {
- # rtmp download
- u'skip_download': True,
+ _VALID_URL = r'http://(?:www\.)?cinemassacre\.com/(?P<date_Y>[0-9]{4})/(?P<date_m>[0-9]{2})/(?P<date_d>[0-9]{2})/(?P<display_id>[^?#/]+)'
+ _TESTS = [
+ {
+ 'url': 'http://cinemassacre.com/2012/11/10/avgn-the-movie-trailer/',
+ 'md5': 'fde81fbafaee331785f58cd6c0d46190',
+ 'info_dict': {
+ 'id': '19911',
+ 'ext': 'mp4',
+ 'upload_date': '20121110',
+ 'title': '“Angry Video Game Nerd: The Movie” – Trailer',
+ 'description': 'md5:fb87405fcb42a331742a0dce2708560b',
+ },
},
- }]
+ {
+ 'url': 'http://cinemassacre.com/2013/10/02/the-mummys-hand-1940',
+ 'md5': 'd72f10cd39eac4215048f62ab477a511',
+ 'info_dict': {
+ 'id': '521be8ef82b16',
+ 'ext': 'mp4',
+ 'upload_date': '20131002',
+ 'title': 'The Mummy’s Hand (1940)',
+ },
+ }
+ ]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
+ display_id = mobj.group('display_id')
- webpage_url = u'http://' + mobj.group('url')
- webpage = self._download_webpage(webpage_url, None) # Don't know video id yet
+ webpage = self._download_webpage(url, display_id)
video_date = mobj.group('date_Y') + mobj.group('date_m') + mobj.group('date_d')
mobj = re.search(r'src="(?P<embed_url>http://player\.screenwavemedia\.com/play/[a-zA-Z]+\.php\?id=(?:Cinemassacre-)?(?P<video_id>.+?))"', webpage)
if not mobj:
- raise ExtractorError(u'Can\'t extract embed url and video id')
- playerdata_url = mobj.group(u'embed_url')
- video_id = mobj.group(u'video_id')
+ raise ExtractorError('Can\'t extract embed url and video id')
+ playerdata_url = mobj.group('embed_url')
+ video_id = mobj.group('video_id')
- video_title = self._html_search_regex(r'<title>(?P<title>.+?)\|',
- webpage, u'title')
- video_description = self._html_search_regex(r'<div class="entry-content">(?P<description>.+?)</div>',
- webpage, u'description', flags=re.DOTALL, fatal=False)
- if len(video_description) == 0:
- video_description = None
+ video_title = self._html_search_regex(
+ r'<title>(?P<title>.+?)\|', webpage, 'title')
+ video_description = self._html_search_regex(
+ r'<div class="entry-content">(?P<description>.+?)</div>',
+ webpage, 'description', flags=re.DOTALL, fatal=False)
- playerdata = self._download_webpage(playerdata_url, video_id)
- url = self._html_search_regex(r'\'streamer\': \'(?P<url>[^\']+)\'', playerdata, u'url')
+ playerdata = self._download_webpage(playerdata_url, video_id, 'Downloading player webpage')
+ video_thumbnail = self._search_regex(
+ r'image: \'(?P<thumbnail>[^\']+)\'', playerdata, 'thumbnail', fatal=False)
+ sd_url = self._search_regex(r'file: \'([^\']+)\', label: \'SD\'', playerdata, 'sd_file')
+ videolist_url = self._search_regex(r'file: \'([^\']+\.smil)\'}', playerdata, 'videolist_url')
- sd_file = self._html_search_regex(r'\'file\': \'(?P<sd_file>[^\']+)\'', playerdata, u'sd_file')
- hd_file = self._html_search_regex(r'\'?file\'?: "(?P<hd_file>[^"]+)"', playerdata, u'hd_file')
- video_thumbnail = self._html_search_regex(r'\'image\': \'(?P<thumbnail>[^\']+)\'', playerdata, u'thumbnail', fatal=False)
+ videolist = self._download_xml(videolist_url, video_id, 'Downloading videolist XML')
- formats = [
- {
- 'url': url,
- 'play_path': 'mp4:' + sd_file,
- 'rtmp_live': True, # workaround
- 'ext': 'flv',
- 'format': 'sd',
- 'format_id': 'sd',
- },
- {
- 'url': url,
- 'play_path': 'mp4:' + hd_file,
- 'rtmp_live': True, # workaround
- 'ext': 'flv',
- 'format': 'hd',
- 'format_id': 'hd',
- },
- ]
+ formats = []
+ baseurl = sd_url[:sd_url.rfind('/')+1]
+ for video in videolist.findall('.//video'):
+ src = video.get('src')
+ if not src:
+ continue
+ file_ = src.partition(':')[-1]
+ width = int_or_none(video.get('width'))
+ height = int_or_none(video.get('height'))
+ bitrate = int_or_none(video.get('system-bitrate'))
+ format = {
+ 'url': baseurl + file_,
+ 'format_id': src.rpartition('.')[0].rpartition('_')[-1],
+ }
+ if width or height:
+ format.update({
+ 'tbr': bitrate // 1000 if bitrate else None,
+ 'width': width,
+ 'height': height,
+ })
+ else:
+ format.update({
+ 'abr': bitrate // 1000 if bitrate else None,
+ 'vcodec': 'none',
+ })
+ formats.append(format)
+ self._sort_formats(formats)
return {
'id': video_id,
diff --git a/youtube_dl/extractor/clipfish.py b/youtube_dl/extractor/clipfish.py
index 43efb08..669919a 100644
--- a/youtube_dl/extractor/clipfish.py
+++ b/youtube_dl/extractor/clipfish.py
@@ -1,22 +1,28 @@
+from __future__ import unicode_literals
+
import re
import time
import xml.etree.ElementTree
from .common import InfoExtractor
-from ..utils import ExtractorError
+from ..utils import (
+ ExtractorError,
+ parse_duration,
+)
class ClipfishIE(InfoExtractor):
- IE_NAME = u'clipfish'
+ IE_NAME = 'clipfish'
_VALID_URL = r'^https?://(?:www\.)?clipfish\.de/.*?/video/(?P<id>[0-9]+)/'
_TEST = {
- u'url': u'http://www.clipfish.de/special/game-trailer/video/3966754/fifa-14-e3-2013-trailer/',
- u'file': u'3966754.mp4',
- u'md5': u'2521cd644e862936cf2e698206e47385',
- u'info_dict': {
- u'title': u'FIFA 14 - E3 2013 Trailer',
- u'duration': 82,
+ 'url': 'http://www.clipfish.de/special/game-trailer/video/3966754/fifa-14-e3-2013-trailer/',
+ 'md5': '2521cd644e862936cf2e698206e47385',
+ 'info_dict': {
+ 'id': '3966754',
+ 'ext': 'mp4',
+ 'title': 'FIFA 14 - E3 2013 Trailer',
+ 'duration': 82,
},
u'skip': 'Blocked in the US'
}
@@ -33,21 +39,10 @@ class ClipfishIE(InfoExtractor):
video_url = doc.find('filename').text
if video_url is None:
xml_bytes = xml.etree.ElementTree.tostring(doc)
- raise ExtractorError(u'Cannot find video URL in document %r' %
+ raise ExtractorError('Cannot find video URL in document %r' %
xml_bytes)
thumbnail = doc.find('imageurl').text
- duration_str = doc.find('duration').text
- m = re.match(
- r'^(?P<hours>[0-9]+):(?P<minutes>[0-9]{2}):(?P<seconds>[0-9]{2}):(?P<ms>[0-9]*)$',
- duration_str)
- if m:
- duration = (
- (int(m.group('hours')) * 60 * 60) +
- (int(m.group('minutes')) * 60) +
- (int(m.group('seconds')))
- )
- else:
- duration = None
+ duration = parse_duration(doc.find('duration').text)
return {
'id': video_id,
diff --git a/youtube_dl/extractor/clipsyndicate.py b/youtube_dl/extractor/clipsyndicate.py
index 9ab6a4a..02a1667 100644
--- a/youtube_dl/extractor/clipsyndicate.py
+++ b/youtube_dl/extractor/clipsyndicate.py
@@ -1,3 +1,5 @@
+from __future__ import unicode_literals
+
import re
from .common import InfoExtractor
@@ -11,13 +13,14 @@ class ClipsyndicateIE(InfoExtractor):
_VALID_URL = r'http://www\.clipsyndicate\.com/video/play(list/\d+)?/(?P<id>\d+)'
_TEST = {
- u'url': u'http://www.clipsyndicate.com/video/play/4629301/brick_briscoe',
- u'md5': u'4d7d549451bad625e0ff3d7bd56d776c',
- u'info_dict': {
- u'id': u'4629301',
- u'ext': u'mp4',
- u'title': u'Brick Briscoe',
- u'duration': 612,
+ 'url': 'http://www.clipsyndicate.com/video/play/4629301/brick_briscoe',
+ 'md5': '4d7d549451bad625e0ff3d7bd56d776c',
+ 'info_dict': {
+ 'id': '4629301',
+ 'ext': 'mp4',
+ 'title': 'Brick Briscoe',
+ 'duration': 612,
+ 'thumbnail': 're:^https?://.+\.jpg',
},
}
@@ -26,13 +29,13 @@ class ClipsyndicateIE(InfoExtractor):
video_id = mobj.group('id')
js_player = self._download_webpage(
'http://eplayer.clipsyndicate.com/embed/player.js?va_id=%s' % video_id,
- video_id, u'Downlaoding player')
+ video_id, 'Downlaoding player')
# it includes a required token
- flvars = self._search_regex(r'flvars: "(.*?)"', js_player, u'flvars')
+ flvars = self._search_regex(r'flvars: "(.*?)"', js_player, 'flvars')
pdoc = self._download_xml(
'http://eplayer.clipsyndicate.com/osmf/playlist?%s' % flvars,
- video_id, u'Downloading video info',
+ video_id, 'Downloading video info',
transform_source=fix_xml_ampersands)
track_doc = pdoc.find('trackList/track')
diff --git a/youtube_dl/extractor/clubic.py b/youtube_dl/extractor/clubic.py
new file mode 100644
index 0000000..14f215c
--- /dev/null
+++ b/youtube_dl/extractor/clubic.py
@@ -0,0 +1,58 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ clean_html,
+ qualities,
+)
+
+
+class ClubicIE(InfoExtractor):
+ _VALID_URL = r'http://(?:www\.)?clubic\.com/video/[^/]+/video.*-(?P<id>[0-9]+)\.html'
+
+ _TEST = {
+ 'url': 'http://www.clubic.com/video/clubic-week/video-clubic-week-2-0-le-fbi-se-lance-dans-la-photo-d-identite-448474.html',
+ 'md5': '1592b694ba586036efac1776b0b43cd3',
+ 'info_dict': {
+ 'id': '448474',
+ 'ext': 'mp4',
+ 'title': 'Clubic Week 2.0 : le FBI se lance dans la photo d\u0092identité',
+ 'description': 're:Gueule de bois chez Nokia. Le constructeur a indiqué cette.*',
+ 'thumbnail': 're:^http://img\.clubic\.com/.*\.jpg$',
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+
+ player_url = 'http://player.m6web.fr/v1/player/clubic/%s.html' % video_id
+ player_page = self._download_webpage(player_url, video_id)
+
+ config_json = self._search_regex(
+ r'(?m)M6\.Player\.config\s*=\s*(\{.+?\});$', player_page,
+ 'configuration')
+ config = json.loads(config_json)
+
+ video_info = config['videoInfo']
+ sources = config['sources']
+ quality_order = qualities(['sd', 'hq'])
+
+ formats = [{
+ 'format_id': src['streamQuality'],
+ 'url': src['src'],
+ 'quality': quality_order(src['streamQuality']),
+ } for src in sources]
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': video_info['title'],
+ 'formats': formats,
+ 'description': clean_html(video_info.get('description')),
+ 'thumbnail': config.get('poster'),
+ }
diff --git a/youtube_dl/extractor/cmt.py b/youtube_dl/extractor/cmt.py
index 88e0e9a..e96c59f 100644
--- a/youtube_dl/extractor/cmt.py
+++ b/youtube_dl/extractor/cmt.py
@@ -1,19 +1,19 @@
+from __future__ import unicode_literals
from .mtv import MTVIE
+
class CMTIE(MTVIE):
- IE_NAME = u'cmt.com'
+ IE_NAME = 'cmt.com'
_VALID_URL = r'https?://www\.cmt\.com/videos/.+?/(?P<videoid>[^/]+)\.jhtml'
_FEED_URL = 'http://www.cmt.com/sitewide/apps/player/embed/rss/'
- _TESTS = [
- {
- u'url': u'http://www.cmt.com/videos/garth-brooks/989124/the-call-featuring-trisha-yearwood.jhtml#artist=30061',
- u'md5': u'e6b7ef3c4c45bbfae88061799bbba6c2',
- u'info_dict': {
- u'id': u'989124',
- u'ext': u'mp4',
- u'title': u'Garth Brooks - "The Call (featuring Trisha Yearwood)"',
- u'description': u'Blame It All On My Roots',
- },
+ _TESTS = [{
+ 'url': 'http://www.cmt.com/videos/garth-brooks/989124/the-call-featuring-trisha-yearwood.jhtml#artist=30061',
+ 'md5': 'e6b7ef3c4c45bbfae88061799bbba6c2',
+ 'info_dict': {
+ 'id': '989124',
+ 'ext': 'mp4',
+ 'title': 'Garth Brooks - "The Call (featuring Trisha Yearwood)"',
+ 'description': 'Blame It All On My Roots',
},
- ]
+ }]
diff --git a/youtube_dl/extractor/cnet.py b/youtube_dl/extractor/cnet.py
new file mode 100644
index 0000000..a94f425
--- /dev/null
+++ b/youtube_dl/extractor/cnet.py
@@ -0,0 +1,75 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ int_or_none,
+)
+
+
+class CNETIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?cnet\.com/videos/(?P<id>[^/]+)/'
+ _TEST = {
+ 'url': 'http://www.cnet.com/videos/hands-on-with-microsofts-windows-8-1-update/',
+ 'md5': '041233212a0d06b179c87cbcca1577b8',
+ 'info_dict': {
+ 'id': '56f4ea68-bd21-4852-b08c-4de5b8354c60',
+ 'ext': 'mp4',
+ 'title': 'Hands-on with Microsoft Windows 8.1 Update',
+ 'description': 'The new update to the Windows 8 OS brings improved performance for mouse and keyboard users.',
+ 'thumbnail': 're:^http://.*/flmswindows8.jpg$',
+ 'uploader_id': 'sarah.mitroff@cbsinteractive.com',
+ 'uploader': 'Sarah Mitroff',
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ display_id = mobj.group('id')
+
+ webpage = self._download_webpage(url, display_id)
+ data_json = self._html_search_regex(
+ r"<div class=\"cnetVideoPlayer\"\s+.*?data-cnet-video-options='([^']+)'",
+ webpage, 'data json')
+ data = json.loads(data_json)
+ vdata = data['video']
+ if not vdata:
+ vdata = data['videos'][0]
+ if not vdata:
+ raise ExtractorError('Cannot find video data')
+
+ video_id = vdata['id']
+ title = vdata['headline']
+ description = vdata.get('dek')
+ thumbnail = vdata.get('image', {}).get('path')
+ author = vdata.get('author')
+ if author:
+ uploader = '%s %s' % (author['firstName'], author['lastName'])
+ uploader_id = author.get('email')
+ else:
+ uploader = None
+ uploader_id = None
+
+ formats = [{
+ 'format_id': '%s-%s-%s' % (
+ f['type'], f['format'],
+ int_or_none(f.get('bitrate'), 1000, default='')),
+ 'url': f['uri'],
+ 'tbr': int_or_none(f.get('bitrate'), 1000),
+ } for f in vdata['files']['data']]
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'display_id': display_id,
+ 'title': title,
+ 'formats': formats,
+ 'description': description,
+ 'uploader': uploader,
+ 'uploader_id': uploader_id,
+ 'thumbnail': thumbnail,
+ }
diff --git a/youtube_dl/extractor/cnn.py b/youtube_dl/extractor/cnn.py
index b32cb89..dae40c1 100644
--- a/youtube_dl/extractor/cnn.py
+++ b/youtube_dl/extractor/cnn.py
@@ -79,8 +79,11 @@ class CNNIE(InfoExtractor):
self._sort_formats(formats)
- thumbnails = sorted([((int(t.attrib['height']),int(t.attrib['width'])), t.text) for t in info.findall('images/image')])
- thumbs_dict = [{'resolution': res, 'url': t_url} for (res, t_url) in thumbnails]
+ thumbnails = [{
+ 'height': int(t.attrib['height']),
+ 'width': int(t.attrib['width']),
+ 'url': t.text,
+ } for t in info.findall('images/image')]
metas_el = info.find('metas')
upload_date = (
@@ -93,8 +96,7 @@ class CNNIE(InfoExtractor):
'id': info.attrib['id'],
'title': info.find('headline').text,
'formats': formats,
- 'thumbnail': thumbnails[-1][1],
- 'thumbnails': thumbs_dict,
+ 'thumbnails': thumbnails,
'description': info.find('description').text,
'duration': duration,
'upload_date': upload_date,
diff --git a/youtube_dl/extractor/collegehumor.py b/youtube_dl/extractor/collegehumor.py
index 10c925d..6f866e7 100644
--- a/youtube_dl/extractor/collegehumor.py
+++ b/youtube_dl/extractor/collegehumor.py
@@ -17,8 +17,9 @@ class CollegeHumorIE(InfoExtractor):
'id': '6902724',
'ext': 'mp4',
'title': 'Comic-Con Cosplay Catastrophe',
- 'description': 'Fans get creative this year',
+ 'description': "Fans get creative this year at San Diego. Too creative. And yes, that's really Joss Whedon.",
'age_limit': 13,
+ 'duration': 187,
},
},
{
@@ -28,22 +29,22 @@ class CollegeHumorIE(InfoExtractor):
'id': '3505939',
'ext': 'mp4',
'title': 'Font Conference',
- 'description': 'This video wasn\'t long enough,',
+ 'description': "This video wasn't long enough, so we made it double-spaced.",
'age_limit': 10,
'duration': 179,
},
},
# embedded youtube video
{
- 'url': 'http://www.collegehumor.com/embed/6950457',
+ 'url': 'http://www.collegehumor.com/embed/6950306',
'info_dict': {
- 'id': 'W5gMp3ZjYg4',
+ 'id': 'Z-bao9fg6Yc',
'ext': 'mp4',
- 'title': 'Funny Dogs Protecting Babies Compilation 2014 [NEW HD]',
- 'uploader': 'Funnyplox TV',
- 'uploader_id': 'funnyploxtv',
- 'description': 'md5:7ded37421526d54afdf005e25bc2b7a3',
- 'upload_date': '20140128',
+ 'title': 'Young Americans Think President John F. Kennedy Died THIS MORNING IN A CAR ACCIDENT!!!',
+ 'uploader': 'Mark Dice',
+ 'uploader_id': 'MarkDice',
+ 'description': 'md5:62c3dab9351fac7bb44b53b69511d87f',
+ 'upload_date': '20140127',
},
'params': {
'skip_download': True,
@@ -87,6 +88,7 @@ class CollegeHumorIE(InfoExtractor):
self._sort_formats(formats)
duration = int_or_none(vdata.get('duration'), 1000)
+ like_count = int_or_none(vdata.get('likes'))
return {
'id': video_id,
@@ -96,4 +98,5 @@ class CollegeHumorIE(InfoExtractor):
'formats': formats,
'age_limit': age_limit,
'duration': duration,
+ 'like_count': like_count,
}
diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py
index ed3986f..ba4d73a 100644
--- a/youtube_dl/extractor/comedycentral.py
+++ b/youtube_dl/extractor/comedycentral.py
@@ -7,21 +7,21 @@ from .mtv import MTVServicesInfoExtractor
from ..utils import (
compat_str,
compat_urllib_parse,
-
ExtractorError,
+ float_or_none,
unified_strdate,
)
class ComedyCentralIE(MTVServicesInfoExtractor):
- _VALID_URL = r'''(?x)https?://(?:www\.)?comedycentral\.com/
+ _VALID_URL = r'''(?x)https?://(?:www\.)?(comedycentral|cc)\.com/
(video-clips|episodes|cc-studios|video-collections)
/(?P<title>.*)'''
_FEED_URL = 'http://comedycentral.com/feeds/mrss/'
_TEST = {
'url': 'http://www.comedycentral.com/video-clips/kllhuv/stand-up-greg-fitzsimmons--uncensored---too-good-of-a-mother',
- 'md5': '4167875aae411f903b751a21f357f1ee',
+ 'md5': 'c4f48e9eda1b16dd10add0744344b6d8',
'info_dict': {
'id': 'cef0cbb3-e776-4bc9-b62e-8016deccb354',
'ext': 'mp4',
@@ -32,31 +32,34 @@ class ComedyCentralIE(MTVServicesInfoExtractor):
class ComedyCentralShowsIE(InfoExtractor):
- IE_DESC = 'The Daily Show / Colbert Report'
+ IE_DESC = 'The Daily Show / The Colbert Report'
# urls can be abbreviations like :thedailyshow or :colbert
# urls for episodes like:
# or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
# or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
# or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
- _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
- |(https?://)?(www\.)?
- (?P<showname>thedailyshow|colbertnation)\.com/
- (full-episodes/(?P<episode>.*)|
+ _VALID_URL = r'''(?x)^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
+ |https?://(:www\.)?
+ (?P<showname>thedailyshow|thecolbertreport)\.(?:cc\.)?com/
+ ((?:full-)?episodes/(?:[0-9a-z]{6}/)?(?P<episode>.*)|
(?P<clip>
- (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
- |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))|
+ (?:(?:guests/[^/]+|videos|video-playlists|special-editions)/[^/]+/(?P<videotitle>[^/?#]+))
+ |(the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
+ |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*))
+ )|
(?P<interview>
- extended-interviews/(?P<interID>[0-9]+)/playlist_tds_extended_(?P<interview_title>.*?)/.*?)))
- $"""
+ extended-interviews/(?P<interID>[0-9a-z]+)/(?:playlist_tds_extended_)?(?P<interview_title>.*?)(/.*?)?)))
+ (?:[?#].*|$)'''
_TEST = {
- 'url': 'http://www.thedailyshow.com/watch/thu-december-13-2012/kristen-stewart',
- 'file': '422212.mp4',
+ 'url': 'http://thedailyshow.cc.com/watch/thu-december-13-2012/kristen-stewart',
'md5': '4e2f5cb088a83cd8cdb7756132f9739d',
'info_dict': {
- "upload_date": "20121214",
- "description": "Kristen Stewart",
- "uploader": "thedailyshow",
- "title": "thedailyshow-kristen-stewart part 1"
+ 'id': 'ab9ab3e7-5a98-4dbe-8b21-551dc0523d55',
+ 'ext': 'mp4',
+ 'upload_date': '20121213',
+ 'description': 'Kristen Stewart learns to let loose in "On the Road."',
+ 'uploader': 'thedailyshow',
+ 'title': 'thedailyshow kristen-stewart part 1',
}
}
@@ -79,11 +82,6 @@ class ComedyCentralShowsIE(InfoExtractor):
'400': (384, 216),
}
- @classmethod
- def suitable(cls, url):
- """Receives a URL and returns True if suitable for this IE."""
- return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
-
@staticmethod
def _transform_rtmp_url(rtmp_video_url):
m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp\.comedystor/.*)$', rtmp_video_url)
@@ -99,14 +97,16 @@ class ComedyCentralShowsIE(InfoExtractor):
if mobj.group('shortname'):
if mobj.group('shortname') in ('tds', 'thedailyshow'):
- url = 'http://www.thedailyshow.com/full-episodes/'
+ url = 'http://thedailyshow.cc.com/full-episodes/'
else:
- url = 'http://www.colbertnation.com/full-episodes/'
+ url = 'http://thecolbertreport.cc.com/full-episodes/'
mobj = re.match(self._VALID_URL, url, re.VERBOSE)
assert mobj is not None
if mobj.group('clip'):
- if mobj.group('showname') == 'thedailyshow':
+ if mobj.group('videotitle'):
+ epTitle = mobj.group('videotitle')
+ elif mobj.group('showname') == 'thedailyshow':
epTitle = mobj.group('tdstitle')
else:
epTitle = mobj.group('cntitle')
@@ -120,9 +120,9 @@ class ComedyCentralShowsIE(InfoExtractor):
epTitle = mobj.group('showname')
else:
epTitle = mobj.group('episode')
+ show_name = mobj.group('showname')
- self.report_extraction(epTitle)
- webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
+ webpage, htmlHandle = self._download_webpage_handle(url, epTitle)
if dlNewest:
url = htmlHandle.geturl()
mobj = re.match(self._VALID_URL, url, re.VERBOSE)
@@ -130,71 +130,86 @@ class ComedyCentralShowsIE(InfoExtractor):
raise ExtractorError('Invalid redirected URL: ' + url)
if mobj.group('episode') == '':
raise ExtractorError('Redirected URL is still not specific: ' + url)
- epTitle = mobj.group('episode')
+ epTitle = mobj.group('episode').rpartition('/')[-1]
mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
-
if len(mMovieParams) == 0:
# The Colbert Report embeds the information in a without
# a URL prefix; so extract the alternate reference
# and then add the URL prefix manually.
- altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
+ altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video|playlist).*?:.*?)"', webpage)
if len(altMovieParams) == 0:
raise ExtractorError('unable to find Flash URL in webpage ' + url)
else:
mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
uri = mMovieParams[0][1]
- indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
- idoc = self._download_xml(indexUrl, epTitle,
- 'Downloading show index',
- 'unable to download episode index')
-
- results = []
-
- itemEls = idoc.findall('.//item')
- for partNum,itemEl in enumerate(itemEls):
- mediaId = itemEl.findall('./guid')[0].text
- shortMediaId = mediaId.split(':')[-1]
- showId = mediaId.split(':')[-2].replace('.com', '')
- officialTitle = itemEl.findall('./title')[0].text
- officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)
-
- configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
- compat_urllib_parse.urlencode({'uri': mediaId}))
- cdoc = self._download_xml(configUrl, epTitle,
- 'Downloading configuration for %s' % shortMediaId)
+ # Correct cc.com in uri
+ uri = re.sub(r'(episode:[^.]+)(\.cc)?\.com', r'\1.cc.com', uri)
+
+ index_url = 'http://%s.cc.com/feeds/mrss?%s' % (show_name, compat_urllib_parse.urlencode({'uri': uri}))
+ idoc = self._download_xml(
+ index_url, epTitle,
+ 'Downloading show index', 'Unable to download episode index')
+
+ title = idoc.find('./channel/title').text
+ description = idoc.find('./channel/description').text
+
+ entries = []
+ item_els = idoc.findall('.//item')
+ for part_num, itemEl in enumerate(item_els):
+ upload_date = unified_strdate(itemEl.findall('./pubDate')[0].text)
+ thumbnail = itemEl.find('.//{http://search.yahoo.com/mrss/}thumbnail').attrib.get('url')
+
+ content = itemEl.find('.//{http://search.yahoo.com/mrss/}content')
+ duration = float_or_none(content.attrib.get('duration'))
+ mediagen_url = content.attrib['url']
+ guid = itemEl.find('./guid').text.rpartition(':')[-1]
+
+ cdoc = self._download_xml(
+ mediagen_url, epTitle,
+ 'Downloading configuration for segment %d / %d' % (part_num + 1, len(item_els)))
turls = []
for rendition in cdoc.findall('.//rendition'):
finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
turls.append(finfo)
- if len(turls) == 0:
- self._downloader.report_error('unable to download ' + mediaId + ': No videos found')
- continue
-
formats = []
for format, rtmp_video_url in turls:
w, h = self._video_dimensions.get(format, (None, None))
formats.append({
+ 'format_id': 'vhttp-%s' % format,
'url': self._transform_rtmp_url(rtmp_video_url),
'ext': self._video_extensions.get(format, 'mp4'),
- 'format_id': format,
'height': h,
'width': w,
})
+ formats.append({
+ 'format_id': 'rtmp-%s' % format,
+ 'url': rtmp_video_url.replace('viacomccstrm', 'viacommtvstrm'),
+ 'ext': self._video_extensions.get(format, 'mp4'),
+ 'height': h,
+ 'width': w,
+ })
+ self._sort_formats(formats)
- effTitle = showId + '-' + epTitle + ' part ' + compat_str(partNum+1)
- results.append({
- 'id': shortMediaId,
+ virtual_id = show_name + ' ' + epTitle + ' part ' + compat_str(part_num + 1)
+ entries.append({
+ 'id': guid,
+ 'title': virtual_id,
'formats': formats,
- 'uploader': showId,
- 'upload_date': officialDate,
- 'title': effTitle,
- 'thumbnail': None,
- 'description': compat_str(officialTitle),
+ 'uploader': show_name,
+ 'upload_date': upload_date,
+ 'duration': duration,
+ 'thumbnail': thumbnail,
+ 'description': description,
})
- return results
+ return {
+ '_type': 'playlist',
+ 'entries': entries,
+ 'title': show_name + ' ' + title,
+ 'description': description,
+ }
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index 84fca8b..49e7540 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -74,7 +74,7 @@ class InfoExtractor(object):
"http", "https", "rtsp", "rtmp", "m3u8" or so.
* preference Order number of this format. If this field is
present and not None, the formats get sorted
- by this field.
+ by this field, regardless of all other values.
-1 for default (order by other properties),
-2 or smaller for less than default.
* quality Order number of the video quality of this
@@ -88,12 +88,22 @@ class InfoExtractor(object):
The following fields are optional:
- thumbnails: A list of dictionaries (with the entries "resolution" and
- "url") for the varying thumbnails
+ display_id An alternative identifier for the video, not necessarily
+ unique, but available before title. Typically, id is
+ something like "4234987", title "Dancing naked mole rats",
+ and display_id "dancing-naked-mole-rats"
+ thumbnails: A list of dictionaries, with the following entries:
+ * "url"
+ * "width" (optional, int)
+ * "height" (optional, int)
+ * "resolution" (optional, string "{width}x{height"},
+ deprecated)
thumbnail: Full URL to a video thumbnail image.
description: One-line video description.
uploader: Full name of the video uploader.
+ timestamp: UNIX timestamp of the moment the video became available.
upload_date: Video upload date (YYYYMMDD).
+ If not explicitly set, calculated from timestamp.
uploader_id: Nickname or id of the video uploader.
location: Physical location of the video.
subtitles: The subtitle file contents as a dictionary in the format
@@ -107,6 +117,8 @@ class InfoExtractor(object):
webpage_url: The url to the video webpage, if given to youtube-dl it
should allow to get the same result again. (It will be set
by YoutubeDL if it's missing)
+ categories: A list of categories that the video falls in, for example
+ ["Sports", "Berlin"]
Unless mentioned otherwise, the fields should be Unicode strings.
@@ -114,9 +126,6 @@ class InfoExtractor(object):
_real_extract() methods and define a _VALID_URL regexp.
Probably, they should also be added to the list of extractors.
- _real_extract() must return a *list* of information dictionaries as
- described above.
-
Finally, the _WORKING attribute should be set to False for broken IEs
in order to warn the users and skip the tests.
"""
@@ -239,16 +248,31 @@ class InfoExtractor(object):
url = url_or_request.get_full_url()
except AttributeError:
url = url_or_request
- if len(url) > 200:
- h = u'___' + hashlib.md5(url.encode('utf-8')).hexdigest()
- url = url[:200 - len(h)] + h
- raw_filename = ('%s_%s.dump' % (video_id, url))
+ basen = '%s_%s' % (video_id, url)
+ if len(basen) > 240:
+ h = u'___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
+ basen = basen[:240 - len(h)] + h
+ raw_filename = basen + '.dump'
filename = sanitize_filename(raw_filename, restricted=True)
self.to_screen(u'Saving request to ' + filename)
with open(filename, 'wb') as outf:
outf.write(webpage_bytes)
- content = webpage_bytes.decode(encoding, 'replace')
+ try:
+ content = webpage_bytes.decode(encoding, 'replace')
+ except LookupError:
+ content = webpage_bytes.decode('utf-8', 'replace')
+
+ if (u'<title>Access to this site is blocked</title>' in content and
+ u'Websense' in content[:512]):
+ msg = u'Access to this webpage has been blocked by Websense filtering software in your network.'
+ blocked_iframe = self._html_search_regex(
+ r'<iframe src="([^"]+)"', content,
+ u'Websense information URL', default=None)
+ if blocked_iframe:
+ msg += u' Visit %s for more details' % blocked_iframe
+ raise ExtractorError(msg, expected=True)
+
return (content, urlh)
def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
@@ -262,9 +286,12 @@ class InfoExtractor(object):
def _download_xml(self, url_or_request, video_id,
note=u'Downloading XML', errnote=u'Unable to download XML',
- transform_source=None):
+ transform_source=None, fatal=True):
"""Return the xml as an xml.etree.ElementTree.Element"""
- xml_string = self._download_webpage(url_or_request, video_id, note, errnote)
+ xml_string = self._download_webpage(
+ url_or_request, video_id, note, errnote, fatal=fatal)
+ if xml_string is False:
+ return xml_string
if transform_source:
xml_string = transform_source(xml_string)
return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
@@ -432,14 +459,14 @@ class InfoExtractor(object):
if secure: regexes = self._og_regexes('video:secure_url') + regexes
return self._html_search_regex(regexes, html, name, **kargs)
- def _html_search_meta(self, name, html, display_name=None):
+ def _html_search_meta(self, name, html, display_name=None, fatal=False):
if display_name is None:
display_name = name
return self._html_search_regex(
r'''(?ix)<meta
(?=[^>]+(?:itemprop|name|property)=["\']%s["\'])
[^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
- html, display_name, fatal=False)
+ html, display_name, fatal=fatal)
def _dc_search_uploader(self, html):
return self._html_search_meta('dc.creator', html, 'uploader')
@@ -528,6 +555,23 @@ class InfoExtractor(object):
)
formats.sort(key=_formats_key)
+ def http_scheme(self):
+ """ Either "https:" or "https:", depending on the user's preferences """
+ return (
+ 'http:'
+ if self._downloader.params.get('prefer_insecure', False)
+ else 'https:')
+
+ def _proto_relative_url(self, url, scheme=None):
+ if url is None:
+ return url
+ if url.startswith('//'):
+ if scheme is None:
+ scheme = self.http_scheme()
+ return scheme + url
+ else:
+ return url
+
class SearchInfoExtractor(InfoExtractor):
"""
@@ -571,3 +615,4 @@ class SearchInfoExtractor(InfoExtractor):
@property
def SEARCH_KEY(self):
return self._SEARCH_KEY
+
diff --git a/youtube_dl/extractor/condenast.py b/youtube_dl/extractor/condenast.py
index 91c1c13..ffbe490 100644
--- a/youtube_dl/extractor/condenast.py
+++ b/youtube_dl/extractor/condenast.py
@@ -28,16 +28,18 @@ class CondeNastIE(InfoExtractor):
'glamour': 'Glamour',
'wmagazine': 'W Magazine',
'vanityfair': 'Vanity Fair',
+ 'cnevids': 'Condé Nast',
}
- _VALID_URL = r'http://(video|www)\.(?P<site>%s)\.com/(?P<type>watch|series|video)/(?P<id>.+)' % '|'.join(_SITES.keys())
+ _VALID_URL = r'http://(video|www|player)\.(?P<site>%s)\.com/(?P<type>watch|series|video|embed)/(?P<id>[^/?#]+)' % '|'.join(_SITES.keys())
IE_DESC = 'Condé Nast media group: %s' % ', '.join(sorted(_SITES.values()))
_TEST = {
'url': 'http://video.wired.com/watch/3d-printed-speakers-lit-with-led',
- 'file': '5171b343c2b4c00dd0c1ccb3.mp4',
'md5': '1921f713ed48aabd715691f774c451f7',
'info_dict': {
+ 'id': '5171b343c2b4c00dd0c1ccb3',
+ 'ext': 'mp4',
'title': '3D Printed Speakers Lit With LED',
'description': 'Check out these beautiful 3D printed LED speakers. You can\'t actually buy them, but LumiGeek is working on a board that will let you make you\'re own.',
}
@@ -55,12 +57,16 @@ class CondeNastIE(InfoExtractor):
entries = [self.url_result(build_url(path), 'CondeNast') for path in paths]
return self.playlist_result(entries, playlist_title=title)
- def _extract_video(self, webpage):
- description = self._html_search_regex([r'<div class="cne-video-description">(.+?)</div>',
- r'<div class="video-post-content">(.+?)</div>',
- ],
- webpage, 'description',
- fatal=False, flags=re.DOTALL)
+ def _extract_video(self, webpage, url_type):
+ if url_type != 'embed':
+ description = self._html_search_regex(
+ [
+ r'<div class="cne-video-description">(.+?)</div>',
+ r'<div class="video-post-content">(.+?)</div>',
+ ],
+ webpage, 'description', fatal=False, flags=re.DOTALL)
+ else:
+ description = None
params = self._search_regex(r'var params = {(.+?)}[;,]', webpage,
'player params', flags=re.DOTALL)
video_id = self._search_regex(r'videoId: [\'"](.+?)[\'"]', params, 'video id')
@@ -99,12 +105,12 @@ class CondeNastIE(InfoExtractor):
mobj = re.match(self._VALID_URL, url)
site = mobj.group('site')
url_type = mobj.group('type')
- id = mobj.group('id')
+ item_id = mobj.group('id')
- self.to_screen(u'Extracting from %s with the Condé Nast extractor' % self._SITES[site])
- webpage = self._download_webpage(url, id)
+ self.to_screen('Extracting from %s with the Condé Nast extractor' % self._SITES[site])
+ webpage = self._download_webpage(url, item_id)
if url_type == 'series':
return self._extract_series(url, webpage)
else:
- return self._extract_video(webpage)
+ return self._extract_video(webpage, url_type)
diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py
index 920728e..026a917 100644
--- a/youtube_dl/extractor/crunchyroll.py
+++ b/youtube_dl/extractor/crunchyroll.py
@@ -1,7 +1,11 @@
# encoding: utf-8
from __future__ import unicode_literals
-import re, base64, zlib
+import re
+import json
+import base64
+import zlib
+
from hashlib import sha1
from math import pow, sqrt, floor
from .common import InfoExtractor
@@ -19,13 +23,15 @@ from ..aes import (
inc,
)
+
class CrunchyrollIE(InfoExtractor):
- _VALID_URL = r'(?:https?://)?(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.com/(?:[^/]*/[^/?&]*?|media/\?id=)(?P<video_id>[0-9]+))(?:[/?&]|$)'
- _TESTS = [{
+ _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.com/(?:[^/]*/[^/?&]*?|media/\?id=)(?P<video_id>[0-9]+))(?:[/?&]|$)'
+ _TEST = {
'url': 'http://www.crunchyroll.com/wanna-be-the-strongest-in-the-world/episode-1-an-idol-wrestler-is-born-645513',
- 'file': '645513.flv',
#'md5': 'b1639fd6ddfaa43788c85f6d1dddd412',
'info_dict': {
+ 'id': '645513',
+ 'ext': 'flv',
'title': 'Wanna be the Strongest in the World Episode 1 – An Idol-Wrestler is Born!',
'description': 'md5:2d17137920c64f2f49981a7797d275ef',
'thumbnail': 'http://img1.ak.crunchyroll.com/i/spire1-tmb/20c6b5e10f1a47b10516877d3c039cae1380951166_full.jpg',
@@ -36,7 +42,7 @@ class CrunchyrollIE(InfoExtractor):
# rtmp
'skip_download': True,
},
- }]
+ }
_FORMAT_IDS = {
'360': ('60', '106'),
@@ -68,7 +74,7 @@ class CrunchyrollIE(InfoExtractor):
shaHash = bytes_to_intlist(sha1(prefix + str(num4).encode('ascii')).digest())
# Extend 160 Bit hash to 256 Bit
return shaHash + [0] * 12
-
+
key = obfuscate_key(id)
class Counter:
__value = iv
@@ -80,9 +86,8 @@ class CrunchyrollIE(InfoExtractor):
return zlib.decompress(decrypted_data)
def _convert_subtitles_to_srt(self, subtitles):
- i=1
output = ''
- for start, end, text in re.findall(r'<event [^>]*?start="([^"]+)" [^>]*?end="([^"]+)" [^>]*?text="([^"]+)"[^>]*?>', subtitles):
+ for i, (start, end, text) in enumerate(re.findall(r'<event [^>]*?start="([^"]+)" [^>]*?end="([^"]+)" [^>]*?text="([^"]+)"[^>]*?>', subtitles), 1):
start = start.replace('.', ',')
end = end.replace('.', ',')
text = clean_html(text)
@@ -90,7 +95,6 @@ class CrunchyrollIE(InfoExtractor):
if not text:
continue
output += '%d\n%s --> %s\n%s\n\n' % (i, start, end, text)
- i+=1
return output
def _real_extract(self,url):
@@ -108,6 +112,12 @@ class CrunchyrollIE(InfoExtractor):
if note_m:
raise ExtractorError(note_m)
+ mobj = re.search(r'Page\.messaging_box_controller\.addItems\(\[(?P<msg>{.+?})\]\)', webpage)
+ if mobj:
+ msg = json.loads(mobj.group('msg'))
+ if msg.get('type') == 'error':
+ raise ExtractorError('crunchyroll returned error: %s' % msg['message_body'], expected=True)
+
video_title = self._html_search_regex(r'<h1[^>]*>(.+?)</h1>', webpage, 'video_title', flags=re.DOTALL)
video_title = re.sub(r' {2,}', ' ', video_title)
video_description = self._html_search_regex(r'"description":"([^"]+)', webpage, 'video_description', default='')
@@ -123,7 +133,7 @@ class CrunchyrollIE(InfoExtractor):
playerdata_req.data = compat_urllib_parse.urlencode({'current_page': webpage_url})
playerdata_req.add_header('Content-Type', 'application/x-www-form-urlencoded')
playerdata = self._download_webpage(playerdata_req, video_id, note='Downloading media info')
-
+
stream_id = self._search_regex(r'<media_id>([^<]+)', playerdata, 'stream_id')
video_thumbnail = self._search_regex(r'<episode_image_url>([^<]+)', playerdata, 'thumbnail', fatal=False)
@@ -161,7 +171,7 @@ class CrunchyrollIE(InfoExtractor):
data = base64.b64decode(data)
subtitle = self._decrypt_subtitles(data, iv, id).decode('utf-8')
- lang_code = self._search_regex(r'lang_code=\'([^\']+)', subtitle, 'subtitle_lang_code', fatal=False)
+ lang_code = self._search_regex(r'lang_code=["\']([^"\']+)', subtitle, 'subtitle_lang_code', fatal=False)
if not lang_code:
continue
subtitles[lang_code] = self._convert_subtitles_to_srt(subtitle)
diff --git a/youtube_dl/extractor/cspan.py b/youtube_dl/extractor/cspan.py
index d65046f..b6552c5 100644
--- a/youtube_dl/extractor/cspan.py
+++ b/youtube_dl/extractor/cspan.py
@@ -4,15 +4,16 @@ import re
from .common import InfoExtractor
from ..utils import (
+ int_or_none,
unescapeHTML,
find_xpath_attr,
)
class CSpanIE(InfoExtractor):
- _VALID_URL = r'http://(?:www\.)?c-span\.org/video/\?(?P<id>\d+)'
+ _VALID_URL = r'http://(?:www\.)?c-span\.org/video/\?(?P<id>[0-9a-f]+)'
IE_DESC = 'C-SPAN'
- _TEST = {
+ _TESTS = [{
'url': 'http://www.c-span.org/video/?313572-1/HolderonV',
'md5': '8e44ce11f0f725527daccc453f553eb0',
'info_dict': {
@@ -22,13 +23,24 @@ class CSpanIE(InfoExtractor):
'description': 'Attorney General Eric Holder spoke to reporters following the Supreme Court decision in Shelby County v. Holder in which the court ruled that the preclearance provisions of the Voting Rights Act could not be enforced until Congress established new guidelines for review.',
},
'skip': 'Regularly fails on travis, for unknown reasons',
- }
+ }, {
+ 'url': 'http://www.c-span.org/video/?c4486943/cspan-international-health-care-models',
+ # For whatever reason, the served video alternates between
+ # two different ones
+ #'md5': 'dbb0f047376d457f2ab8b3929cbb2d0c',
+ 'info_dict': {
+ 'id': '340723',
+ 'ext': 'mp4',
+ 'title': 'International Health Care Models',
+ 'description': 'md5:7a985a2d595dba00af3d9c9f0783c967',
+ }
+ }]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
page_id = mobj.group('id')
webpage = self._download_webpage(url, page_id)
- video_id = self._search_regex(r'data-progid=\'(\d+)\'>', webpage, 'video id')
+ video_id = self._search_regex(r'progid=\'?([0-9]+)\'?>', webpage, 'video id')
description = self._html_search_regex(
[
@@ -43,18 +55,29 @@ class CSpanIE(InfoExtractor):
info_url = 'http://c-spanvideo.org/videoLibrary/assets/player/ajax-player.php?os=android&html5=program&id=' + video_id
data = self._download_json(info_url, video_id)
- url = unescapeHTML(data['video']['files'][0]['path']['#text'])
-
- doc = self._download_xml('http://www.c-span.org/common/services/flashXml.php?programid=' + video_id,
+ doc = self._download_xml(
+ 'http://www.c-span.org/common/services/flashXml.php?programid=' + video_id,
video_id)
- def find_string(s):
- return find_xpath_attr(doc, './/string', 'name', s).text
+ title = find_xpath_attr(doc, './/string', 'name', 'title').text
+ thumbnail = find_xpath_attr(doc, './/string', 'name', 'poster').text
+
+ files = data['video']['files']
+
+ entries = [{
+ 'id': '%s_%d' % (video_id, partnum + 1),
+ 'title': (
+ title if len(files) == 1 else
+ '%s part %d' % (title, partnum + 1)),
+ 'url': unescapeHTML(f['path']['#text']),
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'duration': int_or_none(f.get('length', {}).get('#text')),
+ } for partnum, f in enumerate(files)]
return {
+ '_type': 'playlist',
+ 'entries': entries,
+ 'title': title,
'id': video_id,
- 'title': find_string('title'),
- 'url': url,
- 'description': description,
- 'thumbnail': find_string('poster'),
}
diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py
index 6685c94..5521620 100644
--- a/youtube_dl/extractor/dailymotion.py
+++ b/youtube_dl/extractor/dailymotion.py
@@ -8,12 +8,11 @@ from .subtitles import SubtitlesInfoExtractor
from ..utils import (
compat_urllib_request,
compat_str,
- get_element_by_attribute,
- get_element_by_id,
orderedSet,
str_to_int,
-
+ int_or_none,
ExtractorError,
+ unescapeHTML,
)
class DailymotionBaseInfoExtractor(InfoExtractor):
@@ -124,7 +123,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):
if video_url is not None:
m_size = re.search(r'H264-(\d+)x(\d+)', video_url)
if m_size is not None:
- width, height = m_size.group(1), m_size.group(2)
+ width, height = map(int_or_none, (m_size.group(1), m_size.group(2)))
else:
width, height = None, None
formats.append({
@@ -179,7 +178,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):
class DailymotionPlaylistIE(DailymotionBaseInfoExtractor):
IE_NAME = u'dailymotion:playlist'
_VALID_URL = r'(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/playlist/(?P<id>.+?)/'
- _MORE_PAGES_INDICATOR = r'<div class="next">.*?<a.*?href="/playlist/.+?".*?>.*?</a>.*?</div>'
+ _MORE_PAGES_INDICATOR = r'(?s)<div class="pages[^"]*">.*?<a\s+class="[^"]*?icon-arrow_right[^"]*?"'
_PAGE_TEMPLATE = 'https://www.dailymotion.com/playlist/%s/%s'
def _extract_entries(self, id):
@@ -189,10 +188,9 @@ class DailymotionPlaylistIE(DailymotionBaseInfoExtractor):
webpage = self._download_webpage(request,
id, u'Downloading page %s' % pagenum)
- playlist_el = get_element_by_attribute(u'class', u'row video_list', webpage)
- video_ids.extend(re.findall(r'data-id="(.+?)"', playlist_el))
+ video_ids.extend(re.findall(r'data-xid="(.+?)"', webpage))
- if re.search(self._MORE_PAGES_INDICATOR, webpage, re.DOTALL) is None:
+ if re.search(self._MORE_PAGES_INDICATOR, webpage) is None:
break
return [self.url_result('http://www.dailymotion.com/video/%s' % video_id, 'Dailymotion')
for video_id in orderedSet(video_ids)]
@@ -202,26 +200,26 @@ class DailymotionPlaylistIE(DailymotionBaseInfoExtractor):
playlist_id = mobj.group('id')
webpage = self._download_webpage(url, playlist_id)
- return {'_type': 'playlist',
- 'id': playlist_id,
- 'title': get_element_by_id(u'playlist_name', webpage),
- 'entries': self._extract_entries(playlist_id),
- }
+ return {
+ '_type': 'playlist',
+ 'id': playlist_id,
+ 'title': self._og_search_title(webpage),
+ 'entries': self._extract_entries(playlist_id),
+ }
class DailymotionUserIE(DailymotionPlaylistIE):
IE_NAME = u'dailymotion:user'
- _VALID_URL = r'(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/user/(?P<user>[^/]+)'
- _MORE_PAGES_INDICATOR = r'<div class="next">.*?<a.*?href="/user/.+?".*?>.*?</a>.*?</div>'
+ _VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/user/(?P<user>[^/]+)'
_PAGE_TEMPLATE = 'http://www.dailymotion.com/user/%s/%s'
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
user = mobj.group('user')
webpage = self._download_webpage(url, user)
- full_user = self._html_search_regex(
- r'<a class="label" href="/%s".*?>(.*?)</' % re.escape(user),
- webpage, u'user', flags=re.DOTALL)
+ full_user = unescapeHTML(self._html_search_regex(
+ r'<a class="nav-image" title="([^"]+)" href="/%s">' % re.escape(user),
+ webpage, u'user', flags=re.DOTALL))
return {
'_type': 'playlist',
diff --git a/youtube_dl/extractor/daum.py b/youtube_dl/extractor/daum.py
index 4876ecb..6033cd9 100644
--- a/youtube_dl/extractor/daum.py
+++ b/youtube_dl/extractor/daum.py
@@ -1,25 +1,28 @@
# encoding: utf-8
+
+from __future__ import unicode_literals
+
import re
from .common import InfoExtractor
from ..utils import (
compat_urllib_parse,
- determine_ext,
)
class DaumIE(InfoExtractor):
_VALID_URL = r'https?://(?:m\.)?tvpot\.daum\.net/.*?clipid=(?P<id>\d+)'
- IE_NAME = u'daum.net'
+ IE_NAME = 'daum.net'
_TEST = {
- u'url': u'http://tvpot.daum.net/clip/ClipView.do?clipid=52554690',
- u'file': u'52554690.mp4',
- u'info_dict': {
- u'title': u'DOTA 2GETHER 시즌2 6회 - 2부',
- u'description': u'DOTA 2GETHER 시즌2 6회 - 2부',
- u'upload_date': u'20130831',
- u'duration': 3868,
+ 'url': 'http://tvpot.daum.net/clip/ClipView.do?clipid=52554690',
+ 'info_dict': {
+ 'id': '52554690',
+ 'ext': 'mp4',
+ 'title': 'DOTA 2GETHER 시즌2 6회 - 2부',
+ 'description': 'DOTA 2GETHER 시즌2 6회 - 2부',
+ 'upload_date': '20130831',
+ 'duration': 3868,
},
}
@@ -30,14 +33,14 @@ class DaumIE(InfoExtractor):
webpage = self._download_webpage(canonical_url, video_id)
full_id = self._search_regex(
r'<iframe src="http://videofarm.daum.net/controller/video/viewer/Video.html\?.*?vid=(.+?)[&"]',
- webpage, u'full id')
+ webpage, 'full id')
query = compat_urllib_parse.urlencode({'vid': full_id})
info = self._download_xml(
'http://tvpot.daum.net/clip/ClipInfoXml.do?' + query, video_id,
- u'Downloading video info')
+ 'Downloading video info')
urls = self._download_xml(
'http://videofarm.daum.net/controller/api/open/v1_2/MovieData.apixml?' + query,
- video_id, u'Downloading video formats info')
+ video_id, 'Downloading video formats info')
self.to_screen(u'%s: Getting video urls' % video_id)
formats = []
@@ -53,7 +56,6 @@ class DaumIE(InfoExtractor):
format_url = url_doc.find('result/url').text
formats.append({
'url': format_url,
- 'ext': determine_ext(format_url),
'format_id': profile,
})
diff --git a/youtube_dl/extractor/depositfiles.py b/youtube_dl/extractor/depositfiles.py
deleted file mode 100644
index 2c9fb5f..0000000
--- a/youtube_dl/extractor/depositfiles.py
+++ /dev/null
@@ -1,60 +0,0 @@
-import re
-import os
-import socket
-
-from .common import InfoExtractor
-from ..utils import (
- compat_http_client,
- compat_str,
- compat_urllib_error,
- compat_urllib_parse,
- compat_urllib_request,
-
- ExtractorError,
-)
-
-
-class DepositFilesIE(InfoExtractor):
- """Information extractor for depositfiles.com"""
-
- _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
-
- def _real_extract(self, url):
- file_id = url.split('/')[-1]
- # Rebuild url in english locale
- url = 'http://depositfiles.com/en/files/' + file_id
-
- # Retrieve file webpage with 'Free download' button pressed
- free_download_indication = {'gateway_result' : '1'}
- request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
- try:
- self.report_download_webpage(file_id)
- webpage = compat_urllib_request.urlopen(request).read()
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))
-
- # Search for the real file URL
- mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
- if (mobj is None) or (mobj.group(1) is None):
- # Try to figure out reason of the error.
- mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
- if (mobj is not None) and (mobj.group(1) is not None):
- restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
- raise ExtractorError(u'%s' % restriction_message)
- else:
- raise ExtractorError(u'Unable to extract download URL from: %s' % url)
-
- file_url = mobj.group(1)
- file_extension = os.path.splitext(file_url)[1][1:]
-
- # Search for file title
- file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')
-
- return [{
- 'id': file_id.decode('utf-8'),
- 'url': file_url.decode('utf-8'),
- 'uploader': None,
- 'upload_date': None,
- 'title': file_title,
- 'ext': file_extension.decode('utf-8'),
- }]
diff --git a/youtube_dl/extractor/discovery.py b/youtube_dl/extractor/discovery.py
index 885944c..2ae6ecc 100644
--- a/youtube_dl/extractor/discovery.py
+++ b/youtube_dl/extractor/discovery.py
@@ -10,9 +10,10 @@ class DiscoveryIE(InfoExtractor):
_VALID_URL = r'http://dsc\.discovery\.com\/[a-zA-Z0-9\-]*/[a-zA-Z0-9\-]*/videos/(?P<id>[a-zA-Z0-9\-]*)(.htm)?'
_TEST = {
'url': 'http://dsc.discovery.com/tv-shows/mythbusters/videos/mission-impossible-outtakes.htm',
- 'file': '614784.mp4',
'md5': 'e12614f9ee303a6ccef415cb0793eba2',
'info_dict': {
+ 'id': '614784',
+ 'ext': 'mp4',
'title': 'MythBusters: Mission Impossible Outtakes',
'description': ('Watch Jamie Hyneman and Adam Savage practice being'
' each other -- to the point of confusing Jamie\'s dog -- and '
@@ -34,7 +35,7 @@ class DiscoveryIE(InfoExtractor):
formats = []
for f in info['mp4']:
formats.append(
- {'url': f['src'], r'ext': r'mp4', 'tbr': int(f['bitrate'][:-1])})
+ {'url': f['src'], 'ext': 'mp4', 'tbr': int(f['bitrate'][:-1])})
return {
'id': info['contentId'],
diff --git a/youtube_dl/extractor/divxstage.py b/youtube_dl/extractor/divxstage.py
new file mode 100644
index 0000000..4ca3f37
--- /dev/null
+++ b/youtube_dl/extractor/divxstage.py
@@ -0,0 +1,27 @@
+from __future__ import unicode_literals
+
+from .novamov import NovaMovIE
+
+
+class DivxStageIE(NovaMovIE):
+ IE_NAME = 'divxstage'
+ IE_DESC = 'DivxStage'
+
+ _VALID_URL = NovaMovIE._VALID_URL_TEMPLATE % {'host': 'divxstage\.(?:eu|net|ch|co|at|ag)'}
+
+ _HOST = 'www.divxstage.eu'
+
+ _FILE_DELETED_REGEX = r'>This file no longer exists on our servers.<'
+ _TITLE_REGEX = r'<div class="video_det">\s*<strong>([^<]+)</strong>'
+ _DESCRIPTION_REGEX = r'<div class="video_det">\s*<strong>[^<]+</strong>\s*<p>([^<]+)</p>'
+
+ _TEST = {
+ 'url': 'http://www.divxstage.eu/video/57f238e2e5e01',
+ 'md5': '63969f6eb26533a1968c4d325be63e72',
+ 'info_dict': {
+ 'id': '57f238e2e5e01',
+ 'ext': 'flv',
+ 'title': 'youtubedl test video',
+ 'description': 'This is a test video for youtubedl.',
+ }
+ } \ No newline at end of file
diff --git a/youtube_dl/extractor/ehow.py b/youtube_dl/extractor/ehow.py
index 2bb77ae..f8f49a0 100644
--- a/youtube_dl/extractor/ehow.py
+++ b/youtube_dl/extractor/ehow.py
@@ -1,23 +1,25 @@
+from __future__ import unicode_literals
+
import re
from ..utils import (
compat_urllib_parse,
- determine_ext
)
from .common import InfoExtractor
class EHowIE(InfoExtractor):
- IE_NAME = u'eHow'
- _VALID_URL = r'(?:https?://)?(?:www\.)?ehow\.com/[^/_?]*_(?P<id>[0-9]+)'
+ IE_NAME = 'eHow'
+ _VALID_URL = r'https?://(?:www\.)?ehow\.com/[^/_?]*_(?P<id>[0-9]+)'
_TEST = {
- u'url': u'http://www.ehow.com/video_12245069_hardwood-flooring-basics.html',
- u'file': u'12245069.flv',
- u'md5': u'9809b4e3f115ae2088440bcb4efbf371',
- u'info_dict': {
- u"title": u"Hardwood Flooring Basics",
- u"description": u"Hardwood flooring may be time consuming, but its ultimately a pretty straightforward concept. Learn about hardwood flooring basics with help from a hardware flooring business owner in this free video...",
- u"uploader": u"Erick Nathan"
+ 'url': 'http://www.ehow.com/video_12245069_hardwood-flooring-basics.html',
+ 'md5': '9809b4e3f115ae2088440bcb4efbf371',
+ 'info_dict': {
+ 'id': '12245069',
+ 'ext': 'flv',
+ 'title': 'Hardwood Flooring Basics',
+ 'description': 'Hardwood flooring may be time consuming, but its ultimately a pretty straightforward concept. Learn about hardwood flooring basics with help from a hardware flooring business owner in this free video...',
+ 'uploader': 'Erick Nathan',
}
}
@@ -26,21 +28,16 @@ class EHowIE(InfoExtractor):
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
video_url = self._search_regex(r'(?:file|source)=(http[^\'"&]*)',
- webpage, u'video URL')
- final_url = compat_urllib_parse.unquote(video_url)
- uploader = self._search_regex(r'<meta name="uploader" content="(.+?)" />',
- webpage, u'uploader')
+ webpage, 'video URL')
+ final_url = compat_urllib_parse.unquote(video_url)
+ uploader = self._html_search_meta('uploader', webpage)
title = self._og_search_title(webpage).replace(' | eHow', '')
- ext = determine_ext(final_url)
return {
- '_type': 'video',
- 'id': video_id,
- 'url': final_url,
- 'ext': ext,
- 'title': title,
- 'thumbnail': self._og_search_thumbnail(webpage),
+ 'id': video_id,
+ 'url': final_url,
+ 'title': title,
+ 'thumbnail': self._og_search_thumbnail(webpage),
'description': self._og_search_description(webpage),
- 'uploader': uploader,
+ 'uploader': uploader,
}
-
diff --git a/youtube_dl/extractor/empflix.py b/youtube_dl/extractor/empflix.py
new file mode 100644
index 0000000..e695258
--- /dev/null
+++ b/youtube_dl/extractor/empflix.py
@@ -0,0 +1,54 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class EmpflixIE(InfoExtractor):
+ _VALID_URL = r'^https?://www\.empflix\.com/videos/.*?-(?P<id>[0-9]+)\.html'
+ _TEST = {
+ 'url': 'http://www.empflix.com/videos/Amateur-Finger-Fuck-33051.html',
+ 'md5': 'b1bc15b6412d33902d6e5952035fcabc',
+ 'info_dict': {
+ 'id': '33051',
+ 'ext': 'mp4',
+ 'title': 'Amateur Finger Fuck',
+ 'description': 'Amateur solo finger fucking.',
+ 'age_limit': 18,
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+
+ webpage = self._download_webpage(url, video_id)
+ age_limit = self._rta_search(webpage)
+
+ video_title = self._html_search_regex(
+ r'name="title" value="(?P<title>[^"]*)"', webpage, 'title')
+ video_description = self._html_search_regex(
+ r'name="description" value="([^"]*)"', webpage, 'description', fatal=False)
+
+ cfg_url = self._html_search_regex(
+ r'flashvars\.config = escape\("([^"]+)"',
+ webpage, 'flashvars.config')
+
+ cfg_xml = self._download_xml(
+ cfg_url, video_id, note='Downloading metadata')
+
+ formats = [
+ {
+ 'url': item.find('videoLink').text,
+ 'format_id': item.find('res').text,
+ } for item in cfg_xml.findall('./quality/item')
+ ]
+
+ return {
+ 'id': video_id,
+ 'title': video_title,
+ 'description': video_description,
+ 'formats': formats,
+ 'age_limit': age_limit,
+ }
diff --git a/youtube_dl/extractor/engadget.py b/youtube_dl/extractor/engadget.py
new file mode 100644
index 0000000..92ada81
--- /dev/null
+++ b/youtube_dl/extractor/engadget.py
@@ -0,0 +1,43 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from .fivemin import FiveMinIE
+from ..utils import (
+ url_basename,
+)
+
+
+class EngadgetIE(InfoExtractor):
+ _VALID_URL = r'''(?x)https?://www.engadget.com/
+ (?:video/5min/(?P<id>\d+)|
+ [\d/]+/.*?)
+ '''
+
+ _TEST = {
+ 'url': 'http://www.engadget.com/video/5min/518153925/',
+ 'md5': 'c6820d4828a5064447a4d9fc73f312c9',
+ 'info_dict': {
+ 'id': '518153925',
+ 'ext': 'mp4',
+ 'title': 'Samsung Galaxy Tab Pro 8.4 Review',
+ },
+ 'add_ie': ['FiveMin'],
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+
+ if video_id is not None:
+ return FiveMinIE._build_result(video_id)
+ else:
+ title = url_basename(url)
+ webpage = self._download_webpage(url, title)
+ ids = re.findall(r'<iframe[^>]+?playList=(\d+)', webpage)
+ return {
+ '_type': 'playlist',
+ 'title': title,
+ 'entries': [FiveMinIE._build_result(id) for id in ids]
+ }
diff --git a/youtube_dl/extractor/extremetube.py b/youtube_dl/extractor/extremetube.py
index 1c20e43..14a196f 100644
--- a/youtube_dl/extractor/extremetube.py
+++ b/youtube_dl/extractor/extremetube.py
@@ -1,4 +1,5 @@
-import os
+from __future__ import unicode_literals
+
import re
from .common import InfoExtractor
@@ -8,18 +9,23 @@ from ..utils import (
compat_urllib_parse,
)
+
class ExtremeTubeIE(InfoExtractor):
- _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>extremetube\.com/video/.+?(?P<videoid>[0-9]+))(?:[/?&]|$)'
- _TEST = {
- u'url': u'http://www.extremetube.com/video/music-video-14-british-euro-brit-european-cumshots-swallow-652431',
- u'file': u'652431.mp4',
- u'md5': u'1fb9228f5e3332ec8c057d6ac36f33e0',
- u'info_dict': {
- u"title": u"Music Video 14 british euro brit european cumshots swallow",
- u"uploader": u"unknown",
- u"age_limit": 18,
+ _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>extremetube\.com/.*?video/.+?(?P<videoid>[0-9]+))(?:[/?&]|$)'
+ _TESTS = [{
+ 'url': 'http://www.extremetube.com/video/music-video-14-british-euro-brit-european-cumshots-swallow-652431',
+ 'md5': '1fb9228f5e3332ec8c057d6ac36f33e0',
+ 'info_dict': {
+ 'id': '652431',
+ 'ext': 'mp4',
+ 'title': 'Music Video 14 british euro brit european cumshots swallow',
+ 'uploader': 'unknown',
+ 'age_limit': 18,
}
- }
+ }, {
+ 'url': 'http://www.extremetube.com/gay/video/abcde-1234',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
@@ -30,11 +36,14 @@ class ExtremeTubeIE(InfoExtractor):
req.add_header('Cookie', 'age_verified=1')
webpage = self._download_webpage(req, video_id)
- video_title = self._html_search_regex(r'<h1 [^>]*?title="([^"]+)"[^>]*>\1<', webpage, u'title')
- uploader = self._html_search_regex(r'>Posted by:(?=<)(?:\s|<[^>]*>)*(.+?)\|', webpage, u'uploader', fatal=False)
- video_url = compat_urllib_parse.unquote(self._html_search_regex(r'video_url=(.+?)&amp;', webpage, u'video_url'))
+ video_title = self._html_search_regex(
+ r'<h1 [^>]*?title="([^"]+)"[^>]*>', webpage, 'title')
+ uploader = self._html_search_regex(
+ r'>Posted by:(?=<)(?:\s|<[^>]*>)*(.+?)\|', webpage, 'uploader',
+ fatal=False)
+ video_url = compat_urllib_parse.unquote(self._html_search_regex(
+ r'video_url=(.+?)&amp;', webpage, 'video_url'))
path = compat_urllib_parse_urlparse(video_url).path
- extension = os.path.splitext(path)[1][1:]
format = path.split('/')[5].split('_')[:2]
format = "-".join(format)
@@ -43,7 +52,6 @@ class ExtremeTubeIE(InfoExtractor):
'title': video_title,
'uploader': uploader,
'url': video_url,
- 'ext': extension,
'format': format,
'format_id': format,
'age_limit': 18,
diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py
index 8f9154c..f0cd8f1 100644
--- a/youtube_dl/extractor/facebook.py
+++ b/youtube_dl/extractor/facebook.py
@@ -1,3 +1,5 @@
+from __future__ import unicode_literals
+
import json
import re
import socket
@@ -9,16 +11,15 @@ from ..utils import (
compat_urllib_error,
compat_urllib_parse,
compat_urllib_request,
+ urlencode_postdata,
ExtractorError,
)
class FacebookIE(InfoExtractor):
- """Information Extractor for Facebook"""
-
_VALID_URL = r'''(?x)
- (?:https?://)?(?:\w+\.)?facebook\.com/
+ https?://(?:\w+\.)?facebook\.com/
(?:[^#?]*\#!/)?
(?:video/video\.php|photo\.php|video/embed)\?(?:.*?)
(?:v|video_id)=(?P<id>[0-9]+)
@@ -26,21 +27,18 @@ class FacebookIE(InfoExtractor):
_LOGIN_URL = 'https://www.facebook.com/login.php?next=http%3A%2F%2Ffacebook.com%2Fhome.php&login_attempt=1'
_CHECKPOINT_URL = 'https://www.facebook.com/checkpoint/?next=http%3A%2F%2Ffacebook.com%2Fhome.php&_fb_noscript=1'
_NETRC_MACHINE = 'facebook'
- IE_NAME = u'facebook'
+ IE_NAME = 'facebook'
_TEST = {
- u'url': u'https://www.facebook.com/photo.php?v=120708114770723',
- u'file': u'120708114770723.mp4',
- u'md5': u'48975a41ccc4b7a581abd68651c1a5a8',
- u'info_dict': {
- u"duration": 279,
- u"title": u"PEOPLE ARE AWESOME 2013"
+ 'url': 'https://www.facebook.com/photo.php?v=120708114770723',
+ 'md5': '48975a41ccc4b7a581abd68651c1a5a8',
+ 'info_dict': {
+ 'id': '120708114770723',
+ 'ext': 'mp4',
+ 'duration': 279,
+ 'title': 'PEOPLE ARE AWESOME 2013',
}
}
- def report_login(self):
- """Report attempt to log in."""
- self.to_screen(u'Logging in')
-
def _login(self):
(useremail, password) = self._get_login_info()
if useremail is None:
@@ -48,11 +46,13 @@ class FacebookIE(InfoExtractor):
login_page_req = compat_urllib_request.Request(self._LOGIN_URL)
login_page_req.add_header('Cookie', 'locale=en_US')
- self.report_login()
- login_page = self._download_webpage(login_page_req, None, note=False,
- errnote=u'Unable to download login page')
- lsd = self._search_regex(r'"lsd":"(\w*?)"', login_page, u'lsd')
- lgnrnd = self._search_regex(r'name="lgnrnd" value="([^"]*?)"', login_page, u'lgnrnd')
+ login_page = self._download_webpage(login_page_req, None,
+ note='Downloading login page',
+ errnote='Unable to download login page')
+ lsd = self._search_regex(
+ r'<input type="hidden" name="lsd" value="([^"]*)"',
+ login_page, 'lsd')
+ lgnrnd = self._search_regex(r'name="lgnrnd" value="([^"]*?)"', login_page, 'lgnrnd')
login_form = {
'email': useremail,
@@ -65,27 +65,28 @@ class FacebookIE(InfoExtractor):
'timezone': '-60',
'trynum': '1',
}
- request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
+ request = compat_urllib_request.Request(self._LOGIN_URL, urlencode_postdata(login_form))
request.add_header('Content-Type', 'application/x-www-form-urlencoded')
try:
- login_results = compat_urllib_request.urlopen(request).read()
+ login_results = self._download_webpage(request, None,
+ note='Logging in', errnote='unable to fetch login page')
if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
- self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
+ self._downloader.report_warning('unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
return
check_form = {
- 'fb_dtsg': self._search_regex(r'"fb_dtsg":"(.*?)"', login_results, u'fb_dtsg'),
- 'nh': self._search_regex(r'name="nh" value="(\w*?)"', login_results, u'nh'),
+ 'fb_dtsg': self._search_regex(r'name="fb_dtsg" value="(.+?)"', login_results, 'fb_dtsg'),
+ 'h': self._search_regex(r'name="h" value="(\w*?)"', login_results, 'h'),
'name_action_selected': 'dont_save',
- 'submit[Continue]': self._search_regex(r'<input value="(.*?)" name="submit\[Continue\]"', login_results, u'continue'),
}
- check_req = compat_urllib_request.Request(self._CHECKPOINT_URL, compat_urllib_parse.urlencode(check_form))
+ check_req = compat_urllib_request.Request(self._CHECKPOINT_URL, urlencode_postdata(check_form))
check_req.add_header('Content-Type', 'application/x-www-form-urlencoded')
- check_response = compat_urllib_request.urlopen(check_req).read()
+ check_response = self._download_webpage(check_req, None,
+ note='Confirming login')
if re.search(r'id="checkpointSubmitButton"', check_response) is not None:
- self._downloader.report_warning(u'Unable to confirm login, you have to login in your brower and authorize the login.')
+ self._downloader.report_warning('Unable to confirm login, you have to login in your brower and authorize the login.')
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
+ self._downloader.report_warning('unable to log in: %s' % compat_str(err))
return
def _real_initialize(self):
@@ -93,8 +94,6 @@ class FacebookIE(InfoExtractor):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- if mobj is None:
- raise ExtractorError(u'Invalid URL: %s' % url)
video_id = mobj.group('id')
url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
@@ -107,10 +106,10 @@ class FacebookIE(InfoExtractor):
m_msg = re.search(r'class="[^"]*uiInterstitialContent[^"]*"><div>(.*?)</div>', webpage)
if m_msg is not None:
raise ExtractorError(
- u'The video is not available, Facebook said: "%s"' % m_msg.group(1),
+ 'The video is not available, Facebook said: "%s"' % m_msg.group(1),
expected=True)
else:
- raise ExtractorError(u'Cannot parse data')
+ raise ExtractorError('Cannot parse data')
data = dict(json.loads(m.group(1)))
params_raw = compat_urllib_parse.unquote(data['params'])
params = json.loads(params_raw)
@@ -119,19 +118,15 @@ class FacebookIE(InfoExtractor):
if not video_url:
video_url = video_data['sd_src']
if not video_url:
- raise ExtractorError(u'Cannot find video URL')
- video_duration = int(video_data['video_duration'])
- thumbnail = video_data['thumbnail_src']
+ raise ExtractorError('Cannot find video URL')
video_title = self._html_search_regex(
- r'<h2 class="uiHeaderTitle">([^<]*)</h2>', webpage, u'title')
+ r'<h2 class="uiHeaderTitle">([^<]*)</h2>', webpage, 'title')
- info = {
+ return {
'id': video_id,
'title': video_title,
'url': video_url,
- 'ext': 'mp4',
- 'duration': video_duration,
- 'thumbnail': thumbnail,
+ 'duration': int(video_data['video_duration']),
+ 'thumbnail': video_data['thumbnail_src'],
}
- return [info]
diff --git a/youtube_dl/extractor/fc2.py b/youtube_dl/extractor/fc2.py
new file mode 100644
index 0000000..18f91ef
--- /dev/null
+++ b/youtube_dl/extractor/fc2.py
@@ -0,0 +1,60 @@
+#! -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+
+import re
+import hashlib
+
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ compat_urllib_request,
+ compat_urlparse,
+)
+
+
+class FC2IE(InfoExtractor):
+ _VALID_URL = r'^http://video\.fc2\.com/((?P<lang>[^/]+)/)?content/(?P<id>[^/]+)'
+ IE_NAME = 'fc2'
+ _TEST = {
+ 'url': 'http://video.fc2.com/en/content/20121103kUan1KHs',
+ 'md5': 'a6ebe8ebe0396518689d963774a54eb7',
+ 'info_dict': {
+ 'id': '20121103kUan1KHs',
+ 'ext': 'flv',
+ 'title': 'Boxing again with Puff',
+ },
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+
+ webpage = self._download_webpage(url, video_id)
+ self._downloader.cookiejar.clear_session_cookies() # must clear
+
+ title = self._og_search_title(webpage)
+ thumbnail = self._og_search_thumbnail(webpage)
+ refer = url.replace('/content/', '/a/content/')
+
+ mimi = hashlib.md5((video_id + '_gGddgPfeaf_gzyr').encode('utf-8')).hexdigest()
+
+ info_url = (
+ "http://video.fc2.com/ginfo.php?mimi={1:s}&href={2:s}&v={0:s}&fversion=WIN%2011%2C6%2C602%2C180&from=2&otag=0&upid={0:s}&tk=null&".
+ format(video_id, mimi, compat_urllib_request.quote(refer, safe='').replace('.','%2E')))
+
+ info_webpage = self._download_webpage(
+ info_url, video_id, note='Downloading info page')
+ info = compat_urlparse.parse_qs(info_webpage)
+
+ if 'err_code' in info:
+ raise ExtractorError('Error code: %s' % info['err_code'][0])
+
+ video_url = info['filepath'][0] + '?mid=' + info['mid'][0]
+
+ return {
+ 'id': video_id,
+ 'title': info['title'][0],
+ 'url': video_url,
+ 'ext': 'flv',
+ 'thumbnail': thumbnail,
+ }
diff --git a/youtube_dl/extractor/firstpost.py b/youtube_dl/extractor/firstpost.py
index 7e3d1af..eccd8dd 100644
--- a/youtube_dl/extractor/firstpost.py
+++ b/youtube_dl/extractor/firstpost.py
@@ -6,7 +6,6 @@ from .common import InfoExtractor
class FirstpostIE(InfoExtractor):
- IE_NAME = 'Firstpost.com'
_VALID_URL = r'http://(?:www\.)?firstpost\.com/[^/]+/.*-(?P<id>[0-9]+)\.html'
_TEST = {
@@ -16,7 +15,6 @@ class FirstpostIE(InfoExtractor):
'id': '1025403',
'ext': 'mp4',
'title': 'India to launch indigenous aircraft carrier INS Vikrant today',
- 'description': 'Its flight deck is over twice the size of a football field, its power unit can light up the entire Kochi city and the cabling is enough to cover the distance between here to Delhi.',
}
}
@@ -24,15 +22,26 @@ class FirstpostIE(InfoExtractor):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
- webpage = self._download_webpage(url, video_id)
- video_url = self._html_search_regex(
- r'<div.*?name="div_video".*?flashvars="([^"]+)">',
- webpage, 'video URL')
+ data = self._download_xml(
+ 'http://www.firstpost.com/getvideoxml-%s.xml' % video_id, video_id,
+ 'Downloading video XML')
+
+ item = data.find('./playlist/item')
+ thumbnail = item.find('./image').text
+ title = item.find('./title').text
+
+ formats = [
+ {
+ 'url': details.find('./file').text,
+ 'format_id': details.find('./label').text.strip(),
+ 'width': int(details.find('./width').text.strip()),
+ 'height': int(details.find('./height').text.strip()),
+ } for details in item.findall('./source/file_details') if details.find('./file').text
+ ]
return {
'id': video_id,
- 'url': video_url,
- 'title': self._og_search_title(webpage),
- 'description': self._og_search_description(webpage),
- 'thumbnail': self._og_search_thumbnail(webpage),
+ 'title': title,
+ 'thumbnail': thumbnail,
+ 'formats': formats,
}
diff --git a/youtube_dl/extractor/fivemin.py b/youtube_dl/extractor/fivemin.py
new file mode 100644
index 0000000..3a50bab
--- /dev/null
+++ b/youtube_dl/extractor/fivemin.py
@@ -0,0 +1,88 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ compat_str,
+ compat_urllib_parse,
+ ExtractorError,
+)
+
+
+class FiveMinIE(InfoExtractor):
+ IE_NAME = '5min'
+ _VALID_URL = r'''(?x)
+ (?:https?://[^/]*?5min\.com/Scripts/PlayerSeed\.js\?(.*?&)?playList=|
+ 5min:)
+ (?P<id>\d+)
+ '''
+
+ _TESTS = [
+ {
+ # From http://www.engadget.com/2013/11/15/ipad-mini-retina-display-review/
+ 'url': 'http://pshared.5min.com/Scripts/PlayerSeed.js?sid=281&width=560&height=345&playList=518013791',
+ 'md5': '4f7b0b79bf1a470e5004f7112385941d',
+ 'info_dict': {
+ 'id': '518013791',
+ 'ext': 'mp4',
+ 'title': 'iPad Mini with Retina Display Review',
+ },
+ },
+ {
+ # From http://on.aol.com/video/how-to-make-a-next-level-fruit-salad-518086247
+ 'url': '5min:518086247',
+ 'md5': 'e539a9dd682c288ef5a498898009f69e',
+ 'info_dict': {
+ 'id': '518086247',
+ 'ext': 'mp4',
+ 'title': 'How to Make a Next-Level Fruit Salad',
+ },
+ },
+ ]
+
+ @classmethod
+ def _build_result(cls, video_id):
+ return cls.url_result('5min:%s' % video_id, cls.ie_key())
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+ embed_url = 'https://embed.5min.com/playerseed/?playList=%s' % video_id
+ embed_page = self._download_webpage(embed_url, video_id,
+ 'Downloading embed page')
+ sid = self._search_regex(r'sid=(\d+)', embed_page, 'sid')
+ query = compat_urllib_parse.urlencode({
+ 'func': 'GetResults',
+ 'playlist': video_id,
+ 'sid': sid,
+ 'isPlayerSeed': 'true',
+ 'url': embed_url,
+ })
+ response = self._download_json(
+ 'https://syn.5min.com/handlers/SenseHandler.ashx?' + query,
+ video_id)
+ if not response['success']:
+ err_msg = response['errorMessage']
+ if err_msg == 'ErrorVideoUserNotGeo':
+ msg = 'Video not available from your location'
+ else:
+ msg = 'Aol said: %s' % err_msg
+ raise ExtractorError(msg, expected=True, video_id=video_id)
+ info = response['binding'][0]
+
+ second_id = compat_str(int(video_id[:-2]) + 1)
+ formats = []
+ for quality, height in [(1, 320), (2, 480), (4, 720), (8, 1080)]:
+ if any(r['ID'] == quality for r in info['Renditions']):
+ formats.append({
+ 'format_id': compat_str(quality),
+ 'url': 'http://avideos.5min.com/%s/%s/%s_%s.mp4' % (second_id[-3:], second_id, video_id, quality),
+ 'height': height,
+ })
+
+ return {
+ 'id': video_id,
+ 'title': info['Title'],
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/fourtube.py b/youtube_dl/extractor/fourtube.py
index 8db7fc6..7d56b9b 100644
--- a/youtube_dl/extractor/fourtube.py
+++ b/youtube_dl/extractor/fourtube.py
@@ -8,8 +8,8 @@ from ..utils import (
unified_strdate,
str_to_int,
parse_duration,
+ clean_html,
)
-from youtube_dl.utils import clean_html
class FourTubeIE(InfoExtractor):
diff --git a/youtube_dl/extractor/franceculture.py b/youtube_dl/extractor/franceculture.py
new file mode 100644
index 0000000..898e0dd
--- /dev/null
+++ b/youtube_dl/extractor/franceculture.py
@@ -0,0 +1,77 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ compat_parse_qs,
+ compat_urlparse,
+)
+
+
+class FranceCultureIE(InfoExtractor):
+ _VALID_URL = r'(?P<baseurl>http://(?:www\.)?franceculture\.fr/)player/reecouter\?play=(?P<id>[0-9]+)'
+ _TEST = {
+ 'url': 'http://www.franceculture.fr/player/reecouter?play=4795174',
+ 'info_dict': {
+ 'id': '4795174',
+ 'ext': 'mp3',
+ 'title': 'Rendez-vous au pays des geeks',
+ 'vcodec': 'none',
+ 'uploader': 'Colette Fellous',
+ 'upload_date': '20140301',
+ 'duration': 3601,
+ 'thumbnail': r're:^http://www\.franceculture\.fr/.*/images/player/Carnet-nomade\.jpg$',
+ 'description': 'Avec :Jean-Baptiste Péretié pour son documentaire sur Arte "La revanche des « geeks », une enquête menée aux Etats-Unis dans la S ...',
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+ baseurl = mobj.group('baseurl')
+
+ webpage = self._download_webpage(url, video_id)
+ params_code = self._search_regex(
+ r"<param name='movie' value='/sites/all/modules/rf/rf_player/swf/loader.swf\?([^']+)' />",
+ webpage, 'parameter code')
+ params = compat_parse_qs(params_code)
+ video_url = compat_urlparse.urljoin(baseurl, params['urlAOD'][0])
+
+ title = self._html_search_regex(
+ r'<h1 class="title[^"]+">(.+?)</h1>', webpage, 'title')
+ uploader = self._html_search_regex(
+ r'(?s)<div id="emission".*?<span class="author">(.*?)</span>',
+ webpage, 'uploader', fatal=False)
+ thumbnail_part = self._html_search_regex(
+ r'(?s)<div id="emission".*?<img src="([^"]+)"', webpage,
+ 'thumbnail', fatal=False)
+ if thumbnail_part is None:
+ thumbnail = None
+ else:
+ thumbnail = compat_urlparse.urljoin(baseurl, thumbnail_part)
+ description = self._html_search_regex(
+ r'(?s)<p class="desc">(.*?)</p>', webpage, 'description')
+
+ info = json.loads(params['infoData'][0])[0]
+ duration = info.get('media_length')
+ upload_date_candidate = info.get('media_section5')
+ upload_date = (
+ upload_date_candidate
+ if (upload_date_candidate is not None and
+ re.match(r'[0-9]{8}$', upload_date_candidate))
+ else None)
+
+ return {
+ 'id': video_id,
+ 'url': video_url,
+ 'vcodec': 'none' if video_url.lower().endswith('.mp3') else None,
+ 'duration': duration,
+ 'uploader': uploader,
+ 'upload_date': upload_date,
+ 'title': title,
+ 'thumbnail': thumbnail,
+ 'description': description,
+ }
diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py
index 51eb97b..f3e0f38 100644
--- a/youtube_dl/extractor/francetv.py
+++ b/youtube_dl/extractor/francetv.py
@@ -48,24 +48,36 @@ class PluzzIE(FranceTVBaseInfoExtractor):
class FranceTvInfoIE(FranceTVBaseInfoExtractor):
IE_NAME = 'francetvinfo.fr'
- _VALID_URL = r'https?://www\.francetvinfo\.fr/replay.*/(?P<title>.+)\.html'
+ _VALID_URL = r'https?://www\.francetvinfo\.fr/.*/(?P<title>.+)\.html'
- _TEST = {
+ _TESTS = [{
'url': 'http://www.francetvinfo.fr/replay-jt/france-3/soir-3/jt-grand-soir-3-lundi-26-aout-2013_393427.html',
- 'file': '84981923.mp4',
'info_dict': {
+ 'id': '84981923',
+ 'ext': 'mp4',
'title': 'Soir 3',
},
'params': {
'skip_download': True,
},
- }
+ }, {
+ 'url': 'http://www.francetvinfo.fr/elections/europeennes/direct-europeennes-regardez-le-debat-entre-les-candidats-a-la-presidence-de-la-commission_600639.html',
+ 'info_dict': {
+ 'id': 'EV_20019',
+ 'ext': 'mp4',
+ 'title': 'Débat des candidats à la Commission européenne',
+ 'description': 'Débat des candidats à la Commission européenne',
+ },
+ 'params': {
+ 'skip_download': 'HLS (reqires ffmpeg)'
+ }
+ }]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
page_title = mobj.group('title')
webpage = self._download_webpage(url, page_title)
- video_id = self._search_regex(r'id-video=(\d+?)[@"]', webpage, 'video id')
+ video_id = self._search_regex(r'id-video=((?:[^0-9]*?_)?[0-9]+)[@"]', webpage, 'video id')
return self._extract_video(video_id)
diff --git a/youtube_dl/extractor/funnyordie.py b/youtube_dl/extractor/funnyordie.py
index 7c40e67..6e6b666 100644
--- a/youtube_dl/extractor/funnyordie.py
+++ b/youtube_dl/extractor/funnyordie.py
@@ -1,24 +1,35 @@
from __future__ import unicode_literals
+import json
import re
from .common import InfoExtractor
+from ..utils import ExtractorError
class FunnyOrDieIE(InfoExtractor):
- _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
- _TEST = {
+ _VALID_URL = r'https?://(?:www\.)?funnyordie\.com/(?P<type>embed|videos)/(?P<id>[0-9a-f]+)(?:$|[?#/])'
+ _TESTS = [{
'url': 'http://www.funnyordie.com/videos/0732f586d7/heart-shaped-box-literal-video-version',
- 'file': '0732f586d7.mp4',
- 'md5': 'f647e9e90064b53b6e046e75d0241fbd',
+ 'md5': 'bcd81e0c4f26189ee09be362ad6e6ba9',
'info_dict': {
- 'description': ('Lyrics changed to match the video. Spoken cameo '
- 'by Obscurus Lupa (from ThatGuyWithTheGlasses.com). Based on a '
- 'concept by Dustin McLean (DustFilms.com). Performed, edited, '
- 'and written by David A. Scott.'),
+ 'id': '0732f586d7',
+ 'ext': 'mp4',
'title': 'Heart-Shaped Box: Literal Video Version',
+ 'description': 'md5:ea09a01bc9a1c46d9ab696c01747c338',
+ 'thumbnail': 're:^http:.*\.jpg$',
},
- }
+ }, {
+ 'url': 'http://www.funnyordie.com/embed/e402820827',
+ 'md5': 'ff4d83318f89776ed0250634cfaa8d36',
+ 'info_dict': {
+ 'id': 'e402820827',
+ 'ext': 'mp4',
+ 'title': 'Please Use This Song (Jon Lajoie)',
+ 'description': 'md5:2ed27d364f5a805a6dba199faaf6681d',
+ 'thumbnail': 're:^http:.*\.jpg$',
+ },
+ }]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
@@ -26,14 +37,34 @@ class FunnyOrDieIE(InfoExtractor):
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
- video_url = self._search_regex(
- [r'type="video/mp4" src="(.*?)"', r'src="([^>]*?)" type=\'video/mp4\''],
- webpage, 'video URL', flags=re.DOTALL)
+ links = re.findall(r'<source src="([^"]+/v)\d+\.([^"]+)" type=\'video', webpage)
+ if not links:
+ raise ExtractorError('No media links available for %s' % video_id)
+
+ links.sort(key=lambda link: 1 if link[1] == 'mp4' else 0)
+
+ bitrates = self._html_search_regex(r'<source src="[^"]+/v,((?:\d+,)+)\.mp4\.csmil', webpage, 'video bitrates')
+ bitrates = [int(b) for b in bitrates.rstrip(',').split(',')]
+ bitrates.sort()
+
+ formats = []
+
+ for bitrate in bitrates:
+ for link in links:
+ formats.append({
+ 'url': '%s%d.%s' % (link[0], bitrate, link[1]),
+ 'format_id': '%s-%d' % (link[1], bitrate),
+ 'vbr': bitrate,
+ })
+
+ post_json = self._search_regex(
+ r'fb_post\s*=\s*(\{.*?\});', webpage, 'post details')
+ post = json.loads(post_json)
return {
'id': video_id,
- 'url': video_url,
- 'ext': 'mp4',
- 'title': self._og_search_title(webpage),
- 'description': self._og_search_description(webpage),
+ 'title': post['name'],
+ 'description': post.get('description'),
+ 'thumbnail': post.get('picture'),
+ 'formats': formats,
}
diff --git a/youtube_dl/extractor/gamekings.py b/youtube_dl/extractor/gamekings.py
index a3a5251..11fee3d 100644
--- a/youtube_dl/extractor/gamekings.py
+++ b/youtube_dl/extractor/gamekings.py
@@ -1,3 +1,5 @@
+from __future__ import unicode_literals
+
import re
from .common import InfoExtractor
@@ -6,13 +8,14 @@ from .common import InfoExtractor
class GamekingsIE(InfoExtractor):
_VALID_URL = r'http://www\.gamekings\.tv/videos/(?P<name>[0-9a-z\-]+)'
_TEST = {
- u"url": u"http://www.gamekings.tv/videos/phoenix-wright-ace-attorney-dual-destinies-review/",
- u'file': u'20130811.mp4',
+ 'url': 'http://www.gamekings.tv/videos/phoenix-wright-ace-attorney-dual-destinies-review/',
# MD5 is flaky, seems to change regularly
- #u'md5': u'2f32b1f7b80fdc5cb616efb4f387f8a3',
+ # 'md5': '2f32b1f7b80fdc5cb616efb4f387f8a3',
u'info_dict': {
- u"title": u"Phoenix Wright: Ace Attorney \u2013 Dual Destinies Review",
- u"description": u"Melle en Steven hebben voor de review een week in de rechtbank doorbracht met Phoenix Wright: Ace Attorney - Dual Destinies.",
+ 'id': '20130811',
+ 'ext': 'mp4',
+ 'title': 'Phoenix Wright: Ace Attorney \u2013 Dual Destinies Review',
+ 'description': 'md5:36fd701e57e8c15ac8682a2374c99731',
}
}
diff --git a/youtube_dl/extractor/gamespot.py b/youtube_dl/extractor/gamespot.py
index c9598ad..3d67b9d 100644
--- a/youtube_dl/extractor/gamespot.py
+++ b/youtube_dl/extractor/gamespot.py
@@ -15,11 +15,12 @@ from ..utils import (
class GameSpotIE(InfoExtractor):
_VALID_URL = r'(?:http://)?(?:www\.)?gamespot\.com/.*-(?P<page_id>\d+)/?'
_TEST = {
- "url": "http://www.gamespot.com/arma-iii/videos/arma-iii-community-guide-sitrep-i-6410818/",
- "file": "gs-2300-6410818.mp4",
- "md5": "b2a30deaa8654fcccd43713a6b6a4825",
- "info_dict": {
- "title": "Arma 3 - Community Guide: SITREP I",
+ 'url': 'http://www.gamespot.com/videos/arma-3-community-guide-sitrep-i/2300-6410818/',
+ 'md5': 'b2a30deaa8654fcccd43713a6b6a4825',
+ 'info_dict': {
+ 'id': 'gs-2300-6410818',
+ 'ext': 'mp4',
+ 'title': 'Arma 3 - Community Guide: SITREP I',
'description': 'Check out this video where some of the basics of Arma 3 is explained.',
}
}
diff --git a/youtube_dl/extractor/gdcvault.py b/youtube_dl/extractor/gdcvault.py
new file mode 100644
index 0000000..89d5994
--- /dev/null
+++ b/youtube_dl/extractor/gdcvault.py
@@ -0,0 +1,134 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ compat_urllib_parse,
+ compat_urllib_request,
+)
+
+class GDCVaultIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?gdcvault\.com/play/(?P<id>\d+)/(?P<name>(\w|-)+)'
+ _TESTS = [
+ {
+ 'url': 'http://www.gdcvault.com/play/1019721/Doki-Doki-Universe-Sweet-Simple',
+ 'md5': '7ce8388f544c88b7ac11c7ab1b593704',
+ 'info_dict': {
+ 'id': '1019721',
+ 'ext': 'mp4',
+ 'title': 'Doki-Doki Universe: Sweet, Simple and Genuine (GDC Next 10)'
+ }
+ },
+ {
+ 'url': 'http://www.gdcvault.com/play/1015683/Embracing-the-Dark-Art-of',
+ 'info_dict': {
+ 'id': '1015683',
+ 'ext': 'flv',
+ 'title': 'Embracing the Dark Art of Mathematical Modeling in AI'
+ },
+ 'params': {
+ 'skip_download': True, # Requires rtmpdump
+ }
+ },
+ ]
+
+ def _parse_mp4(self, xml_description):
+ video_formats = []
+ mp4_video = xml_description.find('./metadata/mp4video')
+ if mp4_video is None:
+ return None
+
+ mobj = re.match(r'(?P<root>https?://.*?/).*', mp4_video.text)
+ video_root = mobj.group('root')
+ formats = xml_description.findall('./metadata/MBRVideos/MBRVideo')
+ for format in formats:
+ mobj = re.match(r'mp4\:(?P<path>.*)', format.find('streamName').text)
+ url = video_root + mobj.group('path')
+ vbr = format.find('bitrate').text
+ video_formats.append({
+ 'url': url,
+ 'vbr': int(vbr),
+ })
+ return video_formats
+
+ def _parse_flv(self, xml_description):
+ video_formats = []
+ akami_url = xml_description.find('./metadata/akamaiHost').text
+ slide_video_path = xml_description.find('./metadata/slideVideo').text
+ video_formats.append({
+ 'url': 'rtmp://' + akami_url + '/' + slide_video_path,
+ 'format_note': 'slide deck video',
+ 'quality': -2,
+ 'preference': -2,
+ 'format_id': 'slides',
+ })
+ speaker_video_path = xml_description.find('./metadata/speakerVideo').text
+ video_formats.append({
+ 'url': 'rtmp://' + akami_url + '/' + speaker_video_path,
+ 'format_note': 'speaker video',
+ 'quality': -1,
+ 'preference': -1,
+ 'format_id': 'speaker',
+ })
+ return video_formats
+
+ def _login(self, webpage_url, video_id):
+ (username, password) = self._get_login_info()
+ if username is None or password is None:
+ self.report_warning('It looks like ' + webpage_url + ' requires a login. Try specifying a username and password and try again.')
+ return None
+
+ mobj = re.match(r'(?P<root_url>https?://.*?/).*', webpage_url)
+ login_url = mobj.group('root_url') + 'api/login.php'
+ logout_url = mobj.group('root_url') + 'logout'
+
+ login_form = {
+ 'email': username,
+ 'password': password,
+ }
+
+ request = compat_urllib_request.Request(login_url, compat_urllib_parse.urlencode(login_form))
+ request.add_header('Content-Type', 'application/x-www-form-urlencoded')
+ self._download_webpage(request, video_id, 'Logging in')
+ start_page = self._download_webpage(webpage_url, video_id, 'Getting authenticated video page')
+ self._download_webpage(logout_url, video_id, 'Logging out')
+
+ return start_page
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+
+ video_id = mobj.group('id')
+ webpage_url = 'http://www.gdcvault.com/play/' + video_id
+ start_page = self._download_webpage(webpage_url, video_id)
+
+ xml_root = self._html_search_regex(r'<iframe src="(?P<xml_root>.*?)player.html.*?".*?</iframe>', start_page, 'xml root', None, False)
+
+ if xml_root is None:
+ # Probably need to authenticate
+ start_page = self._login(webpage_url, video_id)
+ if start_page is None:
+ self.report_warning('Could not login.')
+ else:
+ # Grab the url from the authenticated page
+ xml_root = self._html_search_regex(r'<iframe src="(?P<xml_root>.*?)player.html.*?".*?</iframe>', start_page, 'xml root')
+
+ xml_name = self._html_search_regex(r'<iframe src=".*?\?xml=(?P<xml_file>.+?\.xml).*?".*?</iframe>', start_page, 'xml filename', None, False)
+ if xml_name is None:
+ # Fallback to the older format
+ xml_name = self._html_search_regex(r'<iframe src=".*?\?xmlURL=xml/(?P<xml_file>.+?\.xml).*?".*?</iframe>', start_page, 'xml filename')
+
+ xml_decription_url = xml_root + 'xml/' + xml_name
+ xml_description = self._download_xml(xml_decription_url, video_id)
+
+ video_title = xml_description.find('./metadata/title').text
+ video_formats = self._parse_mp4(xml_description)
+ if video_formats is None:
+ video_formats = self._parse_flv(xml_description)
+
+ return {
+ 'id': video_id,
+ 'title': video_title,
+ 'formats': video_formats,
+ }
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index 5bcc78b..38a357d 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -12,9 +12,11 @@ from ..utils import (
compat_urllib_parse,
compat_urllib_request,
compat_urlparse,
+ compat_xml_parse_error,
ExtractorError,
HEADRequest,
+ parse_xml,
smuggle_url,
unescapeHTML,
unified_strdate,
@@ -22,6 +24,8 @@ from ..utils import (
)
from .brightcove import BrightcoveIE
from .ooyala import OoyalaIE
+from .rutv import RUTVIE
+from .smotri import SmotriIE
class GenericIE(InfoExtractor):
@@ -31,9 +35,10 @@ class GenericIE(InfoExtractor):
_TESTS = [
{
'url': 'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html',
- 'file': '13601338388002.mp4',
- 'md5': '6e15c93721d7ec9e9ca3fdbf07982cfd',
+ 'md5': '85b90ccc9d73b4acd9138d3af4c27f89',
'info_dict': {
+ 'id': '13601338388002',
+ 'ext': 'mp4',
'uploader': 'www.hodiho.fr',
'title': 'R\u00e9gis plante sa Jeep',
}
@@ -42,8 +47,9 @@ class GenericIE(InfoExtractor):
{
'add_ie': ['Bandcamp'],
'url': 'http://bronyrock.com/track/the-pony-mash',
- 'file': '3235767654.mp3',
'info_dict': {
+ 'id': '3235767654',
+ 'ext': 'mp3',
'title': 'The Pony Mash',
'uploader': 'M_Pallante',
},
@@ -69,22 +75,34 @@ class GenericIE(InfoExtractor):
{
# https://github.com/rg3/youtube-dl/issues/2253
'url': 'http://bcove.me/i6nfkrc3',
- 'file': '3101154703001.mp4',
'md5': '0ba9446db037002366bab3b3eb30c88c',
'info_dict': {
+ 'id': '3101154703001',
+ 'ext': 'mp4',
'title': 'Still no power',
'uploader': 'thestar.com',
'description': 'Mississauga resident David Farmer is still out of power as a result of the ice storm a month ago. To keep the house warm, Farmer cuts wood from his property for a wood burning stove downstairs.',
},
'add_ie': ['Brightcove'],
},
+ {
+ 'url': 'http://www.championat.com/video/football/v/87/87499.html',
+ 'md5': 'fb973ecf6e4a78a67453647444222983',
+ 'info_dict': {
+ 'id': '3414141473001',
+ 'ext': 'mp4',
+ 'title': 'Видео. Удаление Дзагоева (ЦСКА)',
+ 'description': 'Онлайн-трансляция матча ЦСКА - "Волга"',
+ 'uploader': 'Championat',
+ },
+ },
# Direct link to a video
{
'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4',
- 'file': 'trailer.mp4',
'md5': '67d406c2bcb6af27fa886f31aa934bbe',
'info_dict': {
'id': 'trailer',
+ 'ext': 'mp4',
'title': 'trailer',
'upload_date': '20100513',
}
@@ -92,7 +110,6 @@ class GenericIE(InfoExtractor):
# ooyala video
{
'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219',
- 'file': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ.mp4',
'md5': '5644c6ca5d5782c1d0d350dad9bd840c',
'info_dict': {
'id': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ',
@@ -100,6 +117,150 @@ class GenericIE(InfoExtractor):
'title': '2cc213299525360.mov', # that's what we get
},
},
+ # google redirect
+ {
+ 'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE',
+ 'info_dict': {
+ 'id': 'cmQHVoWB5FY',
+ 'ext': 'mp4',
+ 'upload_date': '20130224',
+ 'uploader_id': 'TheVerge',
+ 'description': 'Chris Ziegler takes a look at the Alcatel OneTouch Fire and the ZTE Open; two of the first Firefox OS handsets to be officially announced.',
+ 'uploader': 'The Verge',
+ 'title': 'First Firefox OS phones side-by-side',
+ },
+ 'params': {
+ 'skip_download': False,
+ }
+ },
+ # embed.ly video
+ {
+ 'url': 'http://www.tested.com/science/weird/460206-tested-grinding-coffee-2000-frames-second/',
+ 'info_dict': {
+ 'id': '9ODmcdjQcHQ',
+ 'ext': 'mp4',
+ 'title': 'Tested: Grinding Coffee at 2000 Frames Per Second',
+ 'upload_date': '20140225',
+ 'description': 'md5:06a40fbf30b220468f1e0957c0f558ff',
+ 'uploader': 'Tested',
+ 'uploader_id': 'testedcom',
+ },
+ # No need to test YoutubeIE here
+ 'params': {
+ 'skip_download': True,
+ },
+ },
+ # funnyordie embed
+ {
+ 'url': 'http://www.theguardian.com/world/2014/mar/11/obama-zach-galifianakis-between-two-ferns',
+ 'md5': '7cf780be104d40fea7bae52eed4a470e',
+ 'info_dict': {
+ 'id': '18e820ec3f',
+ 'ext': 'mp4',
+ 'title': 'Between Two Ferns with Zach Galifianakis: President Barack Obama',
+ 'description': 'Episode 18: President Barack Obama sits down with Zach Galifianakis for his most memorable interview yet.',
+ },
+ },
+ # RUTV embed
+ {
+ 'url': 'http://www.rg.ru/2014/03/15/reg-dfo/anklav-anons.html',
+ 'info_dict': {
+ 'id': '776940',
+ 'ext': 'mp4',
+ 'title': 'Охотское море стало целиком российским',
+ 'description': 'md5:5ed62483b14663e2a95ebbe115eb8f43',
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ },
+ # Embedded TED video
+ {
+ 'url': 'http://en.support.wordpress.com/videos/ted-talks/',
+ 'md5': 'deeeabcc1085eb2ba205474e7235a3d5',
+ 'info_dict': {
+ 'id': '981',
+ 'ext': 'mp4',
+ 'title': 'My web playroom',
+ 'uploader': 'Ze Frank',
+ 'description': 'md5:ddb2a40ecd6b6a147e400e535874947b',
+ }
+ },
+ # Embeded Ustream video
+ {
+ 'url': 'http://www.american.edu/spa/pti/nsa-privacy-janus-2014.cfm',
+ 'md5': '27b99cdb639c9b12a79bca876a073417',
+ 'info_dict': {
+ 'id': '45734260',
+ 'ext': 'flv',
+ 'uploader': 'AU SPA: The NSA and Privacy',
+ 'title': 'NSA and Privacy Forum Debate featuring General Hayden and Barton Gellman'
+ }
+ },
+ # nowvideo embed hidden behind percent encoding
+ {
+ 'url': 'http://www.waoanime.tv/the-super-dimension-fortress-macross-episode-1/',
+ 'md5': '2baf4ddd70f697d94b1c18cf796d5107',
+ 'info_dict': {
+ 'id': '06e53103ca9aa',
+ 'ext': 'flv',
+ 'title': 'Macross Episode 001 Watch Macross Episode 001 onl',
+ 'description': 'No description',
+ },
+ },
+ # arte embed
+ {
+ 'url': 'http://www.tv-replay.fr/redirection/20-03-14/x-enius-arte-10753389.html',
+ 'md5': '7653032cbb25bf6c80d80f217055fa43',
+ 'info_dict': {
+ 'id': '048195-004_PLUS7-F',
+ 'ext': 'flv',
+ 'title': 'X:enius',
+ 'description': 'md5:d5fdf32ef6613cdbfd516ae658abf168',
+ 'upload_date': '20140320',
+ },
+ 'params': {
+ 'skip_download': 'Requires rtmpdump'
+ }
+ },
+ # smotri embed
+ {
+ 'url': 'http://rbctv.rbc.ru/archive/news/562949990879132.shtml',
+ 'md5': 'ec40048448e9284c9a1de77bb188108b',
+ 'info_dict': {
+ 'id': 'v27008541fad',
+ 'ext': 'mp4',
+ 'title': 'Крым и Севастополь вошли в состав России',
+ 'description': 'md5:fae01b61f68984c7bd2fa741e11c3175',
+ 'duration': 900,
+ 'upload_date': '20140318',
+ 'uploader': 'rbctv_2012_4',
+ 'uploader_id': 'rbctv_2012_4',
+ },
+ },
+ # Condé Nast embed
+ {
+ 'url': 'http://www.wired.com/2014/04/honda-asimo/',
+ 'md5': 'ba0dfe966fa007657bd1443ee672db0f',
+ 'info_dict': {
+ 'id': '53501be369702d3275860000',
+ 'ext': 'mp4',
+ 'title': 'Honda’s New Asimo Robot Is More Human Than Ever',
+ }
+ },
+ # Dailymotion embed
+ {
+ 'url': 'http://www.spi0n.com/zap-spi0n-com-n216/',
+ 'md5': '441aeeb82eb72c422c7f14ec533999cd',
+ 'info_dict': {
+ 'id': 'k2mm4bCdJ6CQ2i7c8o2',
+ 'ext': 'mp4',
+ 'title': 'Le Zap de Spi0n n°216 - Zapping du Web',
+ 'uploader': 'Spi0n',
+ },
+ 'add_ie': ['Dailymotion'],
+ }
]
def report_download_webpage(self, video_id):
@@ -125,9 +286,14 @@ class GenericIE(InfoExtractor):
newurl = newurl.replace(' ', '%20')
newheaders = dict((k,v) for k,v in req.headers.items()
if k.lower() not in ("content-length", "content-type"))
+ try:
+ # This function was deprecated in python 3.3 and removed in 3.4
+ origin_req_host = req.get_origin_req_host()
+ except AttributeError:
+ origin_req_host = req.origin_req_host
return HEADRequest(newurl,
headers=newheaders,
- origin_req_host=req.get_origin_req_host(),
+ origin_req_host=origin_req_host,
unverifiable=True)
else:
raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
@@ -159,23 +325,56 @@ class GenericIE(InfoExtractor):
raise ExtractorError('Invalid URL protocol')
return response
+ def _extract_rss(self, url, video_id, doc):
+ playlist_title = doc.find('./channel/title').text
+ playlist_desc_el = doc.find('./channel/description')
+ playlist_desc = None if playlist_desc_el is None else playlist_desc_el.text
+
+ entries = [{
+ '_type': 'url',
+ 'url': e.find('link').text,
+ 'title': e.find('title').text,
+ } for e in doc.findall('./channel/item')]
+
+ return {
+ '_type': 'playlist',
+ 'id': url,
+ 'title': playlist_title,
+ 'description': playlist_desc,
+ 'entries': entries,
+ }
+
def _real_extract(self, url):
+ if url.startswith('//'):
+ return {
+ '_type': 'url',
+ 'url': self.http_scheme() + url,
+ }
+
parsed_url = compat_urlparse.urlparse(url)
if not parsed_url.scheme:
default_search = self._downloader.params.get('default_search')
if default_search is None:
- default_search = 'auto'
+ default_search = 'auto_warning'
- if default_search == 'auto':
+ if default_search in ('auto', 'auto_warning'):
if '/' in url:
self._downloader.report_warning('The url doesn\'t specify the protocol, trying with http')
return self.url_result('http://' + url)
else:
+ if default_search == 'auto_warning':
+ if re.match(r'^(?:url|URL)$', url):
+ raise ExtractorError(
+ 'Invalid URL: %r . Call youtube-dl like this: youtube-dl -v "https://www.youtube.com/watch?v=BaW_jenozKc" ' % url,
+ expected=True)
+ else:
+ self._downloader.report_warning(
+ 'Falling back to youtube search for %s . Set --default-search to "auto" to suppress this warning.' % url)
return self.url_result('ytsearch:' + url)
else:
assert ':' in default_search
return self.url_result(default_search + url)
- video_id = os.path.splitext(url.split('/')[-1])[0]
+ video_id = os.path.splitext(url.rstrip('/').split('/')[-1])[0]
self.to_screen('%s: Requesting header' % video_id)
@@ -219,6 +418,19 @@ class GenericIE(InfoExtractor):
self.report_extraction(video_id)
+ # Is it an RSS feed?
+ try:
+ doc = parse_xml(webpage)
+ if doc.tag == 'rss':
+ return self._extract_rss(url, video_id, doc)
+ except compat_xml_parse_error:
+ pass
+
+ # Sometimes embedded video player is hidden behind percent encoding
+ # (e.g. https://github.com/rg3/youtube-dl/issues/2448)
+ # Unescaping the whole page allows to handle those cases in a generic way
+ webpage = compat_urllib_parse.unquote(webpage)
+
# it's tempting to parse this further, but you would
# have to take into account all the variations like
# Video Title - Site Name
@@ -252,9 +464,9 @@ class GenericIE(InfoExtractor):
# Look for embedded (iframe) Vimeo player
mobj = re.search(
- r'<iframe[^>]+?src="((?:https?:)?//player\.vimeo\.com/video/.+?)"', webpage)
+ r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.vimeo\.com/video/.+?)\1', webpage)
if mobj:
- player_url = unescapeHTML(mobj.group(1))
+ player_url = unescapeHTML(mobj.group('url'))
surl = smuggle_url(player_url, {'Referer': url})
return self.url_result(surl, 'Vimeo')
@@ -280,7 +492,7 @@ class GenericIE(InfoExtractor):
matches = re.findall(
r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/embed/video/.+?)\1', webpage)
if matches:
- urlrs = [self.url_result(unescapeHTML(tuppl[1]), 'Dailymotion')
+ urlrs = [self.url_result(unescapeHTML(tuppl[1]))
for tuppl in matches]
return self.playlist_result(
urlrs, playlist_id=video_id, playlist_title=video_title)
@@ -306,6 +518,22 @@ class GenericIE(InfoExtractor):
if mobj:
return self.url_result(mobj.group(1), 'BlipTV')
+ # Look for embedded condenast player
+ matches = re.findall(
+ r'<iframe\s+(?:[a-zA-Z-]+="[^"]+"\s+)*?src="(https?://player\.cnevids\.com/embed/[^"]+")',
+ webpage)
+ if matches:
+ return {
+ '_type': 'playlist',
+ 'entries': [{
+ '_type': 'url',
+ 'ie_key': 'CondeNast',
+ 'url': ma,
+ } for ma in matches],
+ 'title': video_title,
+ 'id': video_id,
+ }
+
# Look for Bandcamp pages with custom domain
mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage)
if mobj is not None:
@@ -320,12 +548,13 @@ class GenericIE(InfoExtractor):
return self.url_result(mobj.group('url'))
# Look for Ooyala videos
- mobj = re.search(r'player.ooyala.com/[^"?]+\?[^"]*?(?:embedCode|ec)=([^"&]+)', webpage)
+ mobj = (re.search(r'player.ooyala.com/[^"?]+\?[^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or
+ re.search(r'OO.Player.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage))
if mobj is not None:
- return OoyalaIE._build_url_result(mobj.group(1))
+ return OoyalaIE._build_url_result(mobj.group('ec'))
# Look for Aparat videos
- mobj = re.search(r'<iframe src="(http://www\.aparat\.com/video/[^"]+)"', webpage)
+ mobj = re.search(r'<iframe .*?src="(http://www\.aparat\.com/video/[^"]+)"', webpage)
if mobj is not None:
return self.url_result(mobj.group(1), 'Aparat')
@@ -334,11 +563,18 @@ class GenericIE(InfoExtractor):
if mobj is not None:
return self.url_result(mobj.group(1), 'Mpora')
- # Look for embedded Novamov player
+ # Look for embedded NovaMov-based player
mobj = re.search(
- r'<iframe[^>]+?src=(["\'])(?P<url>http://(?:(?:embed|www)\.)?novamov\.com/embed\.php.+?)\1', webpage)
+ r'''(?x)<(?:pagespeed_)?iframe[^>]+?src=(["\'])
+ (?P<url>http://(?:(?:embed|www)\.)?
+ (?:novamov\.com|
+ nowvideo\.(?:ch|sx|eu|at|ag|co)|
+ videoweed\.(?:es|com)|
+ movshare\.(?:net|sx|ag)|
+ divxstage\.(?:eu|net|ch|co|at|ag))
+ /embed\.php.+?)\1''', webpage)
if mobj is not None:
- return self.url_result(mobj.group('url'), 'Novamov')
+ return self.url_result(mobj.group('url'))
# Look for embedded Facebook player
mobj = re.search(
@@ -346,58 +582,142 @@ class GenericIE(InfoExtractor):
if mobj is not None:
return self.url_result(mobj.group('url'), 'Facebook')
+ # Look for embedded VK player
+ mobj = re.search(r'<iframe[^>]+?src=(["\'])(?P<url>https?://vk\.com/video_ext\.php.+?)\1', webpage)
+ if mobj is not None:
+ return self.url_result(mobj.group('url'), 'VK')
+
# Look for embedded Huffington Post player
mobj = re.search(
r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed\.live\.huffingtonpost\.com/.+?)\1', webpage)
if mobj is not None:
return self.url_result(mobj.group('url'), 'HuffPost')
+ # Look for embed.ly
+ mobj = re.search(r'class=["\']embedly-card["\'][^>]href=["\'](?P<url>[^"\']+)', webpage)
+ if mobj is not None:
+ return self.url_result(mobj.group('url'))
+ mobj = re.search(r'class=["\']embedly-embed["\'][^>]src=["\'][^"\']*url=(?P<url>[^&]+)', webpage)
+ if mobj is not None:
+ return self.url_result(compat_urllib_parse.unquote(mobj.group('url')))
+
+ # Look for funnyordie embed
+ matches = re.findall(r'<iframe[^>]+?src="(https?://(?:www\.)?funnyordie\.com/embed/[^"]+)"', webpage)
+ if matches:
+ urlrs = [self.url_result(unescapeHTML(eurl), 'FunnyOrDie')
+ for eurl in matches]
+ return self.playlist_result(
+ urlrs, playlist_id=video_id, playlist_title=video_title)
+
+ # Look for embedded RUTV player
+ rutv_url = RUTVIE._extract_url(webpage)
+ if rutv_url:
+ return self.url_result(rutv_url, 'RUTV')
+
+ # Look for embedded TED player
+ mobj = re.search(
+ r'<iframe[^>]+?src=(["\'])(?P<url>http://embed\.ted\.com/.+?)\1', webpage)
+ if mobj is not None:
+ return self.url_result(mobj.group('url'), 'TED')
+
+ # Look for embedded Ustream videos
+ mobj = re.search(
+ r'<iframe[^>]+?src=(["\'])(?P<url>http://www\.ustream\.tv/embed/.+?)\1', webpage)
+ if mobj is not None:
+ return self.url_result(mobj.group('url'), 'Ustream')
+
+ # Look for embedded arte.tv player
+ mobj = re.search(
+ r'<script [^>]*?src="(?P<url>http://www\.arte\.tv/playerv2/embed[^"]+)"',
+ webpage)
+ if mobj is not None:
+ return self.url_result(mobj.group('url'), 'ArteTVEmbed')
+
+ # Look for embedded smotri.com player
+ smotri_url = SmotriIE._extract_url(webpage)
+ if smotri_url:
+ return self.url_result(smotri_url, 'Smotri')
+
+ # Look for embeded soundcloud player
+ mobj = re.search(
+ r'<iframe src="(?P<url>https?://(?:w\.)?soundcloud\.com/player[^"]+)"',
+ webpage)
+ if mobj is not None:
+ url = unescapeHTML(mobj.group('url'))
+ return self.url_result(url)
+
# Start with something easy: JW Player in SWFObject
- mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
- if mobj is None:
+ found = re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
+ if not found:
# Look for gorilla-vid style embedding
- mobj = re.search(r'(?s)(?:jw_plugins|JWPlayerOptions).*?file\s*:\s*["\'](.*?)["\']', webpage)
- if mobj is None:
+ found = re.findall(r'''(?sx)
+ (?:
+ jw_plugins|
+ JWPlayerOptions|
+ jwplayer\s*\(\s*["'][^'"]+["']\s*\)\s*\.setup
+ )
+ .*?file\s*:\s*["\'](.*?)["\']''', webpage)
+ if not found:
# Broaden the search a little bit
- mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
- if mobj is None:
- # Broaden the search a little bit: JWPlayer JS loader
- mobj = re.search(r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage)
- if mobj is None:
+ found = re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
+ if not found:
+ # Broaden the findall a little bit: JWPlayer JS loader
+ found = re.findall(r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage)
+ if not found:
# Try to find twitter cards info
- mobj = re.search(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
- if mobj is None:
+ found = re.findall(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
+ if not found:
# We look for Open Graph info:
# We have to match any number spaces between elements, some sites try to align them (eg.: statigr.am)
- m_video_type = re.search(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
+ m_video_type = re.findall(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
# We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
if m_video_type is not None:
- mobj = re.search(r'<meta.*?property="og:video".*?content="(.*?)"', webpage)
- if mobj is None:
+ found = re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage)
+ if not found:
# HTML5 video
- mobj = re.search(r'<video[^<]*(?:>.*?<source.*?)? src="([^"]+)"', webpage, flags=re.DOTALL)
- if mobj is None:
+ found = re.findall(r'(?s)<video[^<]*(?:>.*?<source.*?)? src="([^"]+)"', webpage)
+ if not found:
+ found = re.search(
+ r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")'
+ r'(?:[a-z-]+="[^"]+"\s+)*?content="[0-9]{,2};url=\'([^\']+)\'"',
+ webpage)
+ if found:
+ new_url = found.group(1)
+ self.report_following_redirect(new_url)
+ return {
+ '_type': 'url',
+ 'url': new_url,
+ }
+ if not found:
raise ExtractorError('Unsupported URL: %s' % url)
- # It's possible that one of the regexes
- # matched, but returned an empty group:
- if mobj.group(1) is None:
- raise ExtractorError('Did not find a valid video URL at %s' % url)
+ entries = []
+ for video_url in found:
+ video_url = compat_urlparse.urljoin(url, video_url)
+ video_id = compat_urllib_parse.unquote(os.path.basename(video_url))
- video_url = mobj.group(1)
- video_url = compat_urlparse.urljoin(url, video_url)
- video_id = compat_urllib_parse.unquote(os.path.basename(video_url))
+ # Sometimes, jwplayer extraction will result in a YouTube URL
+ if YoutubeIE.suitable(video_url):
+ entries.append(self.url_result(video_url, 'Youtube'))
+ continue
- # Sometimes, jwplayer extraction will result in a YouTube URL
- if YoutubeIE.suitable(video_url):
- return self.url_result(video_url, 'Youtube')
+ # here's a fun little line of code for you:
+ video_id = os.path.splitext(video_id)[0]
+
+ entries.append({
+ 'id': video_id,
+ 'url': video_url,
+ 'uploader': video_uploader,
+ 'title': video_title,
+ })
- # here's a fun little line of code for you:
- video_id = os.path.splitext(video_id)[0]
+ if len(entries) == 1:
+ return entries[0]
+ else:
+ for num, e in enumerate(entries, start=1):
+ e['title'] = '%s (%d)' % (e['title'], num)
+ return {
+ '_type': 'playlist',
+ 'entries': entries,
+ }
- return {
- 'id': video_id,
- 'url': video_url,
- 'uploader': video_uploader,
- 'title': video_title,
- }
diff --git a/youtube_dl/extractor/googlesearch.py b/youtube_dl/extractor/googlesearch.py
index 5c25642..383032d 100644
--- a/youtube_dl/extractor/googlesearch.py
+++ b/youtube_dl/extractor/googlesearch.py
@@ -46,6 +46,6 @@ class GoogleSearchIE(SearchInfoExtractor):
'url': mobj.group(1)
})
- if (len(entries) >= n) or not re.search(r'class="pn" id="pnnext"', webpage):
+ if (len(entries) >= n) or not re.search(r'id="pnnext"', webpage):
res['entries'] = entries[:n]
return res
diff --git a/youtube_dl/extractor/hentaistigma.py b/youtube_dl/extractor/hentaistigma.py
new file mode 100644
index 0000000..63d87b7
--- /dev/null
+++ b/youtube_dl/extractor/hentaistigma.py
@@ -0,0 +1,42 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class HentaiStigmaIE(InfoExtractor):
+ _VALID_URL = r'^https?://hentai\.animestigma\.com/(?P<id>[^/]+)'
+ _TEST = {
+ 'url': 'http://hentai.animestigma.com/inyouchuu-etsu-bonus/',
+ 'md5': '4e3d07422a68a4cc363d8f57c8bf0d23',
+ 'info_dict': {
+ 'id': 'inyouchuu-etsu-bonus',
+ 'ext': 'mp4',
+ "title": "Inyouchuu Etsu Bonus",
+ "age_limit": 18,
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+
+ webpage = self._download_webpage(url, video_id)
+
+ title = self._html_search_regex(
+ r'<h2 class="posttitle"><a[^>]*>([^<]+)</a>',
+ webpage, 'title')
+ wrap_url = self._html_search_regex(
+ r'<iframe src="([^"]+mp4)"', webpage, 'wrapper url')
+ wrap_webpage = self._download_webpage(wrap_url, video_id)
+
+ video_url = self._html_search_regex(
+ r'clip:\s*{\s*url: "([^"]*)"', wrap_webpage, 'video url')
+
+ return {
+ 'id': video_id,
+ 'url': video_url,
+ 'title': title,
+ 'age_limit': 18,
+ }
diff --git a/youtube_dl/extractor/huffpost.py b/youtube_dl/extractor/huffpost.py
index 0d1ea68..94e7cf7 100644
--- a/youtube_dl/extractor/huffpost.py
+++ b/youtube_dl/extractor/huffpost.py
@@ -21,9 +21,10 @@ class HuffPostIE(InfoExtractor):
_TEST = {
'url': 'http://live.huffingtonpost.com/r/segment/legalese-it/52dd3e4b02a7602131000677',
- 'file': '52dd3e4b02a7602131000677.mp4',
'md5': '55f5e8981c1c80a64706a44b74833de8',
'info_dict': {
+ 'id': '52dd3e4b02a7602131000677',
+ 'ext': 'mp4',
'title': 'Legalese It! with @MikeSacksHP',
'description': 'This week on Legalese It, Mike talks to David Bosco about his new book on the ICC, "Rough Justice," he also discusses the Virginia AG\'s historic stance on gay marriage, the execution of Edgar Tamayo, the ICC\'s delay of Kenya\'s President and more. ',
'duration': 1549,
diff --git a/youtube_dl/extractor/statigram.py b/youtube_dl/extractor/iconosquare.py
index d602e81..1d5a10a 100644
--- a/youtube_dl/extractor/statigram.py
+++ b/youtube_dl/extractor/iconosquare.py
@@ -5,8 +5,8 @@ import re
from .common import InfoExtractor
-class StatigramIE(InfoExtractor):
- _VALID_URL = r'https?://(www\.)?statigr\.am/p/(?P<id>[^/]+)'
+class IconosquareIE(InfoExtractor):
+ _VALID_URL = r'https?://(www\.)?(?:iconosquare\.com|statigr\.am)/p/(?P<id>[^/]+)'
_TEST = {
'url': 'http://statigr.am/p/522207370455279102_24101272',
'md5': '6eb93b882a3ded7c378ee1d6884b1814',
@@ -15,6 +15,7 @@ class StatigramIE(InfoExtractor):
'ext': 'mp4',
'uploader_id': 'aguynamedpatrick',
'title': 'Instagram photo by @aguynamedpatrick (Patrick Janelle)',
+ 'description': 'md5:644406a9ec27457ed7aa7a9ebcd4ce3d',
},
}
@@ -25,7 +26,7 @@ class StatigramIE(InfoExtractor):
html_title = self._html_search_regex(
r'<title>(.+?)</title>',
webpage, 'title')
- title = re.sub(r'(?: *\(Videos?\))? \| Statigram$', '', html_title)
+ title = re.sub(r'(?: *\(Videos?\))? \| (?:Iconosquare|Statigram)$', '', html_title)
uploader_id = self._html_search_regex(
r'@([^ ]+)', title, 'uploader name', fatal=False)
@@ -33,6 +34,7 @@ class StatigramIE(InfoExtractor):
'id': video_id,
'url': self._og_search_video_url(webpage),
'title': title,
+ 'description': self._og_search_description(webpage),
'thumbnail': self._og_search_thumbnail(webpage),
'uploader_id': uploader_id
}
diff --git a/youtube_dl/extractor/ign.py b/youtube_dl/extractor/ign.py
index 381af91..1f42c6d 100644
--- a/youtube_dl/extractor/ign.py
+++ b/youtube_dl/extractor/ign.py
@@ -1,10 +1,8 @@
+from __future__ import unicode_literals
+
import re
-import json
from .common import InfoExtractor
-from ..utils import (
- determine_ext,
-)
class IGNIE(InfoExtractor):
@@ -14,52 +12,57 @@ class IGNIE(InfoExtractor):
"""
_VALID_URL = r'https?://.+?\.ign\.com/(?P<type>videos|show_videos|articles|(?:[^/]*/feature))(/.+)?/(?P<name_or_id>.+)'
- IE_NAME = u'ign.com'
+ IE_NAME = 'ign.com'
_CONFIG_URL_TEMPLATE = 'http://www.ign.com/videos/configs/id/%s.config'
- _DESCRIPTION_RE = [r'<span class="page-object-description">(.+?)</span>',
- r'id="my_show_video">.*?<p>(.*?)</p>',
- ]
+ _DESCRIPTION_RE = [
+ r'<span class="page-object-description">(.+?)</span>',
+ r'id="my_show_video">.*?<p>(.*?)</p>',
+ ]
_TESTS = [
{
- u'url': u'http://www.ign.com/videos/2013/06/05/the-last-of-us-review',
- u'file': u'8f862beef863986b2785559b9e1aa599.mp4',
- u'md5': u'eac8bdc1890980122c3b66f14bdd02e9',
- u'info_dict': {
- u'title': u'The Last of Us Review',
- u'description': u'md5:c8946d4260a4d43a00d5ae8ed998870c',
+ 'url': 'http://www.ign.com/videos/2013/06/05/the-last-of-us-review',
+ 'md5': 'eac8bdc1890980122c3b66f14bdd02e9',
+ 'info_dict': {
+ 'id': '8f862beef863986b2785559b9e1aa599',
+ 'ext': 'mp4',
+ 'title': 'The Last of Us Review',
+ 'description': 'md5:c8946d4260a4d43a00d5ae8ed998870c',
}
},
{
- u'url': u'http://me.ign.com/en/feature/15775/100-little-things-in-gta-5-that-will-blow-your-mind',
- u'playlist': [
+ 'url': 'http://me.ign.com/en/feature/15775/100-little-things-in-gta-5-that-will-blow-your-mind',
+ 'playlist': [
{
- u'file': u'5ebbd138523268b93c9141af17bec937.mp4',
- u'info_dict': {
- u'title': u'GTA 5 Video Review',
- u'description': u'Rockstar drops the mic on this generation of games. Watch our review of the masterly Grand Theft Auto V.',
+ 'info_dict': {
+ 'id': '5ebbd138523268b93c9141af17bec937',
+ 'ext': 'mp4',
+ 'title': 'GTA 5 Video Review',
+ 'description': 'Rockstar drops the mic on this generation of games. Watch our review of the masterly Grand Theft Auto V.',
},
},
{
- u'file': u'638672ee848ae4ff108df2a296418ee2.mp4',
- u'info_dict': {
- u'title': u'26 Twisted Moments from GTA 5 in Slow Motion',
- u'description': u'The twisted beauty of GTA 5 in stunning slow motion.',
+ 'info_dict': {
+ 'id': '638672ee848ae4ff108df2a296418ee2',
+ 'ext': 'mp4',
+ 'title': '26 Twisted Moments from GTA 5 in Slow Motion',
+ 'description': 'The twisted beauty of GTA 5 in stunning slow motion.',
},
},
],
- u'params': {
- u'skip_download': True,
+ 'params': {
+ 'skip_download': True,
},
},
]
def _find_video_id(self, webpage):
- res_id = [r'data-video-id="(.+?)"',
- r'<object id="vid_(.+?)"',
- r'<meta name="og:image" content=".*/(.+?)-(.+?)/.+.jpg"',
- ]
+ res_id = [
+ r'data-video-id="(.+?)"',
+ r'<object id="vid_(.+?)"',
+ r'<meta name="og:image" content=".*/(.+?)-(.+?)/.+.jpg"',
+ ]
return self._search_regex(res_id, webpage, 'video id')
def _real_extract(self, url):
@@ -68,7 +71,7 @@ class IGNIE(InfoExtractor):
page_type = mobj.group('type')
webpage = self._download_webpage(url, name_or_id)
if page_type == 'articles':
- video_url = self._search_regex(r'var videoUrl = "(.+?)"', webpage, u'video url')
+ video_url = self._search_regex(r'var videoUrl = "(.+?)"', webpage, 'video url')
return self.url_result(video_url, ie='IGN')
elif page_type != 'video':
multiple_urls = re.findall(
@@ -80,50 +83,42 @@ class IGNIE(InfoExtractor):
video_id = self._find_video_id(webpage)
result = self._get_video_info(video_id)
description = self._html_search_regex(self._DESCRIPTION_RE,
- webpage, 'video description',
- flags=re.DOTALL)
+ webpage, 'video description', flags=re.DOTALL)
result['description'] = description
return result
def _get_video_info(self, video_id):
config_url = self._CONFIG_URL_TEMPLATE % video_id
- config = json.loads(self._download_webpage(config_url, video_id,
- u'Downloading video info'))
+ config = self._download_json(config_url, video_id)
media = config['playlist']['media']
- video_url = media['url']
- return {'id': media['metadata']['videoId'],
- 'url': video_url,
- 'ext': determine_ext(video_url),
- 'title': media['metadata']['title'],
- 'thumbnail': media['poster'][0]['url'].replace('{size}', 'grande'),
- }
+ return {
+ 'id': media['metadata']['videoId'],
+ 'url': media['url'],
+ 'title': media['metadata']['title'],
+ 'thumbnail': media['poster'][0]['url'].replace('{size}', 'grande'),
+ }
class OneUPIE(IGNIE):
- """Extractor for 1up.com, it uses the ign videos system."""
-
_VALID_URL = r'https?://gamevideos\.1up\.com/(?P<type>video)/id/(?P<name_or_id>.+)'
IE_NAME = '1up.com'
_DESCRIPTION_RE = r'<div id="vid_summary">(.+?)</div>'
- _TEST = {
- u'url': u'http://gamevideos.1up.com/video/id/34976',
- u'file': u'34976.mp4',
- u'md5': u'68a54ce4ebc772e4b71e3123d413163d',
- u'info_dict': {
- u'title': u'Sniper Elite V2 - Trailer',
- u'description': u'md5:5d289b722f5a6d940ca3136e9dae89cf',
+ _TESTS = [{
+ 'url': 'http://gamevideos.1up.com/video/id/34976',
+ 'md5': '68a54ce4ebc772e4b71e3123d413163d',
+ 'info_dict': {
+ 'id': '34976',
+ 'ext': 'mp4',
+ 'title': 'Sniper Elite V2 - Trailer',
+ 'description': 'md5:5d289b722f5a6d940ca3136e9dae89cf',
}
- }
-
- # Override IGN tests
- _TESTS = []
+ }]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- id = mobj.group('name_or_id')
result = super(OneUPIE, self)._real_extract(url)
- result['id'] = id
+ result['id'] = mobj.group('name_or_id')
return result
diff --git a/youtube_dl/extractor/infoq.py b/youtube_dl/extractor/infoq.py
index ed32373..e76dd22 100644
--- a/youtube_dl/extractor/infoq.py
+++ b/youtube_dl/extractor/infoq.py
@@ -11,16 +11,15 @@ from ..utils import (
class InfoQIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?infoq\.com/[^/]+/(?P<id>[^/]+)$'
+
_TEST = {
- "name": "InfoQ",
- "url": "http://www.infoq.com/presentations/A-Few-of-My-Favorite-Python-Things",
- "file": "12-jan-pythonthings.mp4",
- "info_dict": {
- "description": "Mike Pirnat presents some tips and tricks, standard libraries and third party packages that make programming in Python a richer experience.",
- "title": "A Few of My Favorite [Python] Things",
- },
- "params": {
- "skip_download": True,
+ 'url': 'http://www.infoq.com/presentations/A-Few-of-My-Favorite-Python-Things',
+ 'md5': 'b5ca0e0a8c1fed93b0e65e48e462f9a2',
+ 'info_dict': {
+ 'id': '12-jan-pythonthings',
+ 'ext': 'mp4',
+ 'description': 'Mike Pirnat presents some tips and tricks, standard libraries and third party packages that make programming in Python a richer experience.',
+ 'title': 'A Few of My Favorite [Python] Things',
},
}
@@ -30,26 +29,39 @@ class InfoQIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
+ video_title = self._html_search_regex(r'<title>(.*?)</title>', webpage, 'title')
+ video_description = self._html_search_meta('description', webpage, 'description')
+
+ # The server URL is hardcoded
+ video_url = 'rtmpe://video.infoq.com/cfx/st/'
+
# Extract video URL
- encoded_id = self._search_regex(r"jsclassref ?= ?'([^']*)'", webpage, 'encoded id')
+ encoded_id = self._search_regex(
+ r"jsclassref\s*=\s*'([^']*)'", webpage, 'encoded id')
real_id = compat_urllib_parse.unquote(base64.b64decode(encoded_id.encode('ascii')).decode('utf-8'))
- video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
+ playpath = 'mp4:' + real_id
- # Extract title
- video_title = self._search_regex(r'contentTitle = "(.*?)";',
- webpage, 'title')
+ video_filename = playpath.split('/')[-1]
+ video_id, extension = video_filename.split('.')
- # Extract description
- video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
- webpage, 'description', fatal=False)
+ http_base = self._search_regex(
+ r'EXPRESSINSTALL_SWF\s*=\s*"(https?://[^/"]+/)', webpage,
+ 'HTTP base URL')
- video_filename = video_url.split('/')[-1]
- video_id, extension = video_filename.split('.')
+ formats = [{
+ 'format_id': 'rtmp',
+ 'url': video_url,
+ 'ext': extension,
+ 'play_path': playpath,
+ }, {
+ 'format_id': 'http',
+ 'url': http_base + real_id,
+ }]
+ self._sort_formats(formats)
return {
'id': video_id,
- 'url': video_url,
'title': video_title,
- 'ext': extension, # Extension is always(?) mp4, but seems to be flv
'description': video_description,
+ 'formats': formats,
}
diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py
index 63141af..b5372bf 100644
--- a/youtube_dl/extractor/instagram.py
+++ b/youtube_dl/extractor/instagram.py
@@ -3,6 +3,9 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..utils import (
+ int_or_none,
+)
class InstagramIE(InfoExtractor):
@@ -37,3 +40,68 @@ class InstagramIE(InfoExtractor):
'uploader_id': uploader_id,
'description': desc,
}
+
+
+class InstagramUserIE(InfoExtractor):
+ _VALID_URL = r'http://instagram\.com/(?P<username>[^/]{2,})/?(?:$|[?#])'
+ IE_DESC = 'Instagram user profile'
+ IE_NAME = 'instagram:user'
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ uploader_id = mobj.group('username')
+
+ entries = []
+ page_count = 0
+ media_url = 'http://instagram.com/%s/media' % uploader_id
+ while True:
+ page = self._download_json(
+ media_url, uploader_id,
+ note='Downloading page %d ' % (page_count + 1),
+ )
+ page_count += 1
+
+ for it in page['items']:
+ if it.get('type') != 'video':
+ continue
+ like_count = int_or_none(it.get('likes', {}).get('count'))
+ user = it.get('user', {})
+
+ formats = [{
+ 'format_id': k,
+ 'height': v.get('height'),
+ 'width': v.get('width'),
+ 'url': v['url'],
+ } for k, v in it['videos'].items()]
+ self._sort_formats(formats)
+
+ thumbnails_el = it.get('images', {})
+ thumbnail = thumbnails_el.get('thumbnail', {}).get('url')
+
+ title = it.get('caption', {}).get('text', it['id'])
+
+ entries.append({
+ 'id': it['id'],
+ 'title': title,
+ 'formats': formats,
+ 'thumbnail': thumbnail,
+ 'webpage_url': it.get('link'),
+ 'uploader': user.get('full_name'),
+ 'uploader_id': user.get('username'),
+ 'like_count': like_count,
+ 'timestamp': int_or_none(it.get('created_time')),
+ })
+
+ if not page['items']:
+ break
+ max_id = page['items'][-1]['id']
+ media_url = (
+ 'http://instagram.com/%s/media?max_id=%s' % (
+ uploader_id, max_id))
+
+ return {
+ '_type': 'playlist',
+ 'entries': entries,
+ 'id': uploader_id,
+ 'title': uploader_id,
+ }
diff --git a/youtube_dl/extractor/iprima.py b/youtube_dl/extractor/iprima.py
index dde4829..d1defd3 100644
--- a/youtube_dl/extractor/iprima.py
+++ b/youtube_dl/extractor/iprima.py
@@ -6,11 +6,14 @@ from random import random
from math import floor
from .common import InfoExtractor
-from ..utils import compat_urllib_request
+from ..utils import (
+ compat_urllib_request,
+ ExtractorError,
+)
class IPrimaIE(InfoExtractor):
- _VALID_URL = r'https?://play\.iprima\.cz/(?P<videogroup>.+)/(?P<videoid>.+)'
+ _VALID_URL = r'https?://play\.iprima\.cz/[^?#]+/(?P<id>[^?#]+)'
_TESTS = [{
'url': 'http://play.iprima.cz/particka/particka-92',
@@ -22,20 +25,37 @@ class IPrimaIE(InfoExtractor):
'thumbnail': 'http://play.iprima.cz/sites/default/files/image_crops/image_620x349/3/491483_particka-92_image_620x349.jpg',
},
'params': {
- 'skip_download': True,
+ 'skip_download': True, # requires rtmpdump
},
- },
- ]
+ }, {
+ 'url': 'http://play.iprima.cz/particka/tchibo-particka-jarni-moda',
+ 'info_dict': {
+ 'id': '9718337',
+ 'ext': 'flv',
+ 'title': 'Tchibo Partička - Jarní móda',
+ 'description': 'md5:589f8f59f414220621ff8882eb3ce7be',
+ 'thumbnail': 're:^http:.*\.jpg$',
+ },
+ 'params': {
+ 'skip_download': True, # requires rtmpdump
+ },
+ 'skip': 'Do not have permission to access this page',
+ }]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('videoid')
+ video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
- player_url = 'http://embed.livebox.cz/iprimaplay/player-embed-v2.js?__tok%s__=%s' % (
- floor(random()*1073741824),
- floor(random()*1073741824))
+ if re.search(r'Nemáte oprávnění přistupovat na tuto stránku\.\s*</div>', webpage):
+ raise ExtractorError(
+ '%s said: You do not have permission to access this page' % self.IE_NAME, expected=True)
+
+ player_url = (
+ 'http://embed.livebox.cz/iprimaplay/player-embed-v2.js?__tok%s__=%s' %
+ (floor(random()*1073741824), floor(random()*1073741824))
+ )
req = compat_urllib_request.Request(player_url)
req.add_header('Referer', url)
@@ -44,18 +64,20 @@ class IPrimaIE(InfoExtractor):
base_url = ''.join(re.findall(r"embed\['stream'\] = '(.+?)'.+'(\?auth=)'.+'(.+?)';", playerpage)[1])
zoneGEO = self._html_search_regex(r'"zoneGEO":(.+?),', webpage, 'zoneGEO')
-
if zoneGEO != '0':
- base_url = base_url.replace('token', 'token_'+zoneGEO)
+ base_url = base_url.replace('token', 'token_' + zoneGEO)
formats = []
for format_id in ['lq', 'hq', 'hd']:
- filename = self._html_search_regex(r'"%s_id":(.+?),' % format_id, webpage, 'filename')
+ filename = self._html_search_regex(
+ r'"%s_id":(.+?),' % format_id, webpage, 'filename')
if filename == 'null':
continue
- real_id = self._search_regex(r'Prima-[0-9]{10}-([0-9]+)_', filename, 'real video id')
+ real_id = self._search_regex(
+ r'Prima-(?:[0-9]{10}|WEB)-([0-9]+)[-_]',
+ filename, 'real video id')
if format_id == 'lq':
quality = 0
@@ -63,13 +85,13 @@ class IPrimaIE(InfoExtractor):
quality = 1
elif format_id == 'hd':
quality = 2
- filename = 'hq/'+filename
+ filename = 'hq/' + filename
formats.append({
'format_id': format_id,
'url': base_url,
'quality': quality,
- 'play_path': 'mp4:'+filename.replace('"', '')[:-4],
+ 'play_path': 'mp4:' + filename.replace('"', '')[:-4],
'rtmp_live': True,
'ext': 'flv',
})
diff --git a/youtube_dl/extractor/ivi.py b/youtube_dl/extractor/ivi.py
index 1ba4966..528be15 100644
--- a/youtube_dl/extractor/ivi.py
+++ b/youtube_dl/extractor/ivi.py
@@ -33,14 +33,14 @@ class IviIE(InfoExtractor):
},
# Serial's serie
{
- 'url': 'http://www.ivi.ru/watch/dezhurnyi_angel/74791',
- 'md5': '3e6cc9a848c1d2ebcc6476444967baa9',
+ 'url': 'http://www.ivi.ru/watch/dvoe_iz_lartsa/9549',
+ 'md5': '221f56b35e3ed815fde2df71032f4b3e',
'info_dict': {
- 'id': '74791',
+ 'id': '9549',
'ext': 'mp4',
- 'title': 'Дежурный ангел - 1 серия',
- 'duration': 2490,
- 'thumbnail': 'http://thumbs.ivi.ru/f7.vcp.digitalaccess.ru/contents/8/e/bc2f6c2b6e5d291152fdd32c059141.jpg',
+ 'title': 'Двое из ларца - Серия 1',
+ 'duration': 2655,
+ 'thumbnail': 'http://thumbs.ivi.ru/f15.vcp.digitalaccess.ru/contents/8/4/0068dc0677041f3336b7c2baad8fc0.jpg',
},
'skip': 'Only works from Russia',
}
diff --git a/youtube_dl/extractor/jukebox.py b/youtube_dl/extractor/jukebox.py
index 592c64e..9b553b9 100644
--- a/youtube_dl/extractor/jukebox.py
+++ b/youtube_dl/extractor/jukebox.py
@@ -1,56 +1,61 @@
-# coding: utf-8
+from __future__ import unicode_literals
+
import re
from .common import InfoExtractor
from ..utils import (
ExtractorError,
+ RegexNotFoundError,
unescapeHTML,
)
+
class JukeboxIE(InfoExtractor):
_VALID_URL = r'^http://www\.jukebox?\..+?\/.+[,](?P<video_id>[a-z0-9\-]+)\.html'
- _IFRAME = r'<iframe .*src="(?P<iframe>[^"]*)".*>'
- _VIDEO_URL = r'"config":{"file":"(?P<video_url>http:[^"]+[.](?P<video_ext>[^.?]+)[?]mdtk=[0-9]+)"'
- _TITLE = r'<h1 class="inline">(?P<title>[^<]+)</h1>.*<span id="infos_article_artist">(?P<artist>[^<]+)</span>'
- _IS_YOUTUBE = r'config":{"file":"(?P<youtube_url>http:[\\][/][\\][/]www[.]youtube[.]com[\\][/]watch[?]v=[^"]+)"'
+ _TEST = {
+ 'url': 'http://www.jukebox.es/kosheen/videoclip,pride,r303r.html',
+ 'md5': '1574e9b4d6438446d5b7dbcdf2786276',
+ 'info_dict': {
+ 'id': 'r303r',
+ 'ext': 'flv',
+ 'title': 'Kosheen-En Vivo Pride',
+ 'uploader': 'Kosheen',
+ },
+ }
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('video_id')
html = self._download_webpage(url, video_id)
-
- mobj = re.search(self._IFRAME, html)
- if mobj is None:
- raise ExtractorError(u'Cannot extract iframe url')
- iframe_url = unescapeHTML(mobj.group('iframe'))
+ iframe_url = unescapeHTML(self._search_regex(r'<iframe .*src="([^"]*)"', html, 'iframe url'))
iframe_html = self._download_webpage(iframe_url, video_id, 'Downloading iframe')
- mobj = re.search(r'class="jkb_waiting"', iframe_html)
- if mobj is not None:
- raise ExtractorError(u'Video is not available(in your country?)!')
+ if re.search(r'class="jkb_waiting"', iframe_html) is not None:
+ raise ExtractorError('Video is not available(in your country?)!')
self.report_extraction(video_id)
- mobj = re.search(self._VIDEO_URL, iframe_html)
- if mobj is None:
- mobj = re.search(self._IS_YOUTUBE, iframe_html)
- if mobj is None:
- raise ExtractorError(u'Cannot extract video url')
- youtube_url = unescapeHTML(mobj.group('youtube_url')).replace('\/','/')
- self.to_screen(u'Youtube video detected')
- return self.url_result(youtube_url,ie='Youtube')
- video_url = unescapeHTML(mobj.group('video_url')).replace('\/','/')
- video_ext = unescapeHTML(mobj.group('video_ext'))
-
- mobj = re.search(self._TITLE, html)
- if mobj is None:
- raise ExtractorError(u'Cannot extract title')
- title = unescapeHTML(mobj.group('title'))
- artist = unescapeHTML(mobj.group('artist'))
-
- return [{'id': video_id,
- 'url': video_url,
- 'title': artist + '-' + title,
- 'ext': video_ext
- }]
+ try:
+ video_url = self._search_regex(r'"config":{"file":"(?P<video_url>http:[^"]+\?mdtk=[0-9]+)"',
+ iframe_html, 'video url')
+ video_url = unescapeHTML(video_url).replace('\/', '/')
+ except RegexNotFoundError:
+ youtube_url = self._search_regex(
+ r'config":{"file":"(http:\\/\\/www\.youtube\.com\\/watch\?v=[^"]+)"',
+ iframe_html, 'youtube url')
+ youtube_url = unescapeHTML(youtube_url).replace('\/', '/')
+ self.to_screen('Youtube video detected')
+ return self.url_result(youtube_url, ie='Youtube')
+
+ title = self._html_search_regex(r'<h1 class="inline">([^<]+)</h1>',
+ html, 'title')
+ artist = self._html_search_regex(r'<span id="infos_article_artist">([^<]+)</span>',
+ html, 'artist')
+
+ return {
+ 'id': video_id,
+ 'url': video_url,
+ 'title': artist + '-' + title,
+ 'uploader': artist,
+ }
diff --git a/youtube_dl/extractor/justintv.py b/youtube_dl/extractor/justintv.py
index e9bde0c..7083db1 100644
--- a/youtube_dl/extractor/justintv.py
+++ b/youtube_dl/extractor/justintv.py
@@ -1,9 +1,12 @@
+from __future__ import unicode_literals
+
import json
import os
import re
from .common import InfoExtractor
from ..utils import (
+ compat_str,
ExtractorError,
formatSeconds,
)
@@ -24,34 +27,31 @@ class JustinTVIE(InfoExtractor):
/?(?:\#.*)?$
"""
_JUSTIN_PAGE_LIMIT = 100
- IE_NAME = u'justin.tv'
+ IE_NAME = 'justin.tv'
+ IE_DESC = 'justin.tv and twitch.tv'
_TEST = {
- u'url': u'http://www.twitch.tv/thegamedevhub/b/296128360',
- u'file': u'296128360.flv',
- u'md5': u'ecaa8a790c22a40770901460af191c9a',
- u'info_dict': {
- u"upload_date": u"20110927",
- u"uploader_id": 25114803,
- u"uploader": u"thegamedevhub",
- u"title": u"Beginner Series - Scripting With Python Pt.1"
+ 'url': 'http://www.twitch.tv/thegamedevhub/b/296128360',
+ 'md5': 'ecaa8a790c22a40770901460af191c9a',
+ 'info_dict': {
+ 'id': '296128360',
+ 'ext': 'flv',
+ 'upload_date': '20110927',
+ 'uploader_id': 25114803,
+ 'uploader': 'thegamedevhub',
+ 'title': 'Beginner Series - Scripting With Python Pt.1'
}
}
- def report_download_page(self, channel, offset):
- """Report attempt to download a single page of videos."""
- self.to_screen(u'%s: Downloading video information from %d to %d' %
- (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
-
# Return count of items, list of *valid* items
def _parse_page(self, url, video_id):
info_json = self._download_webpage(url, video_id,
- u'Downloading video info JSON',
- u'unable to download video info JSON')
+ 'Downloading video info JSON',
+ 'unable to download video info JSON')
response = json.loads(info_json)
if type(response) != list:
error_text = response.get('error', 'unknown error')
- raise ExtractorError(u'Justin.tv API: %s' % error_text)
+ raise ExtractorError('Justin.tv API: %s' % error_text)
info = []
for clip in response:
video_url = clip['video_file_url']
@@ -62,7 +62,7 @@ class JustinTVIE(InfoExtractor):
video_id = clip['id']
video_title = clip.get('title', video_id)
info.append({
- 'id': video_id,
+ 'id': compat_str(video_id),
'url': video_url,
'title': video_title,
'uploader': clip.get('channel_name', video_uploader_id),
@@ -74,8 +74,6 @@ class JustinTVIE(InfoExtractor):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- if mobj is None:
- raise ExtractorError(u'invalid URL: %s' % url)
api_base = 'http://api.justin.tv'
paged = False
@@ -89,40 +87,41 @@ class JustinTVIE(InfoExtractor):
webpage = self._download_webpage(url, chapter_id)
m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
if not m:
- raise ExtractorError(u'Cannot find archive of a chapter')
+ raise ExtractorError('Cannot find archive of a chapter')
archive_id = m.group(1)
api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
- doc = self._download_xml(api, chapter_id,
- note=u'Downloading chapter information',
- errnote=u'Chapter information download failed')
+ doc = self._download_xml(
+ api, chapter_id,
+ note='Downloading chapter information',
+ errnote='Chapter information download failed')
for a in doc.findall('.//archive'):
if archive_id == a.find('./id').text:
break
else:
- raise ExtractorError(u'Could not find chapter in chapter information')
+ raise ExtractorError('Could not find chapter in chapter information')
video_url = a.find('./video_file_url').text
- video_ext = video_url.rpartition('.')[2] or u'flv'
+ video_ext = video_url.rpartition('.')[2] or 'flv'
- chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
- chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
- note='Downloading chapter metadata',
- errnote='Download of chapter metadata failed')
- chapter_info = json.loads(chapter_info_json)
+ chapter_api_url = 'https://api.twitch.tv/kraken/videos/c' + chapter_id
+ chapter_info = self._download_json(
+ chapter_api_url, 'c' + chapter_id,
+ note='Downloading chapter metadata',
+ errnote='Download of chapter metadata failed')
bracket_start = int(doc.find('.//bracket_start').text)
bracket_end = int(doc.find('.//bracket_end').text)
# TODO determine start (and probably fix up file)
# youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
- #video_url += u'?start=' + TODO:start_timestamp
+ #video_url += '?start=' + TODO:start_timestamp
# bracket_start is 13290, but we want 51670615
- self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
- u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
+ self._downloader.report_warning('Chapter detected, but we can just download the whole file. '
+ 'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
info = {
- 'id': u'c' + chapter_id,
+ 'id': 'c' + chapter_id,
'url': video_url,
'ext': video_ext,
'title': chapter_info['title'],
@@ -131,14 +130,12 @@ class JustinTVIE(InfoExtractor):
'uploader': chapter_info['channel']['display_name'],
'uploader_id': chapter_info['channel']['name'],
}
- return [info]
+ return info
else:
video_id = mobj.group('videoid')
api = api_base + '/broadcast/by_archive/%s.json' % video_id
- self.report_extraction(video_id)
-
- info = []
+ entries = []
offset = 0
limit = self._JUSTIN_PAGE_LIMIT
while True:
@@ -146,8 +143,12 @@ class JustinTVIE(InfoExtractor):
self.report_download_page(video_id, offset)
page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
page_count, page_info = self._parse_page(page_url, video_id)
- info.extend(page_info)
+ entries.extend(page_info)
if not paged or page_count != limit:
break
offset += limit
- return info
+ return {
+ '_type': 'playlist',
+ 'id': video_id,
+ 'entries': entries,
+ }
diff --git a/youtube_dl/extractor/keezmovies.py b/youtube_dl/extractor/keezmovies.py
index 29658a7..75b63cf 100644
--- a/youtube_dl/extractor/keezmovies.py
+++ b/youtube_dl/extractor/keezmovies.py
@@ -1,3 +1,5 @@
+from __future__ import unicode_literals
+
import os
import re
@@ -11,22 +13,22 @@ from ..aes import (
aes_decrypt_text
)
+
class KeezMoviesIE(InfoExtractor):
- _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>keezmovies\.com/video/.+?(?P<videoid>[0-9]+))(?:[/?&]|$)'
+ _VALID_URL = r'^https?://(?:www\.)?keezmovies\.com/video/.+?(?P<videoid>[0-9]+)(?:[/?&]|$)'
_TEST = {
- u'url': u'http://www.keezmovies.com/video/petite-asian-lady-mai-playing-in-bathtub-1214711',
- u'file': u'1214711.mp4',
- u'md5': u'6e297b7e789329923fcf83abb67c9289',
- u'info_dict': {
- u"title": u"Petite Asian Lady Mai Playing In Bathtub",
- u"age_limit": 18,
+ 'url': 'http://www.keezmovies.com/video/petite-asian-lady-mai-playing-in-bathtub-1214711',
+ 'file': '1214711.mp4',
+ 'md5': '6e297b7e789329923fcf83abb67c9289',
+ 'info_dict': {
+ 'title': 'Petite Asian Lady Mai Playing In Bathtub',
+ 'age_limit': 18,
}
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('videoid')
- url = 'http://www.' + mobj.group('url')
req = compat_urllib_request.Request(url)
req.add_header('Cookie', 'age_verified=1')
@@ -38,10 +40,10 @@ class KeezMoviesIE(InfoExtractor):
embedded_url = mobj.group(1)
return self.url_result(embedded_url)
- video_title = self._html_search_regex(r'<h1 [^>]*>([^<]+)', webpage, u'title')
- video_url = compat_urllib_parse.unquote(self._html_search_regex(r'video_url=(.+?)&amp;', webpage, u'video_url'))
- if webpage.find('encrypted=true')!=-1:
- password = self._html_search_regex(r'video_title=(.+?)&amp;', webpage, u'password')
+ video_title = self._html_search_regex(r'<h1 [^>]*>([^<]+)', webpage, 'title')
+ video_url = compat_urllib_parse.unquote(self._html_search_regex(r'video_url=(.+?)&amp;', webpage, 'video_url'))
+ if 'encrypted=true' in webpage:
+ password = self._html_search_regex(r'video_title=(.+?)&amp;', webpage, 'password')
video_url = aes_decrypt_text(video_url, password, 32).decode('utf-8')
path = compat_urllib_parse_urlparse(video_url).path
extension = os.path.splitext(path)[1][1:]
diff --git a/youtube_dl/extractor/kickstarter.py b/youtube_dl/extractor/kickstarter.py
index 50bc883..961dd1a 100644
--- a/youtube_dl/extractor/kickstarter.py
+++ b/youtube_dl/extractor/kickstarter.py
@@ -1,37 +1,39 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
import re
from .common import InfoExtractor
class KickStarterIE(InfoExtractor):
- _VALID_URL = r'https?://www\.kickstarter\.com/projects/(?P<id>\d*)/.*'
+ _VALID_URL = r'https?://www\.kickstarter\.com/projects/(?P<id>[^/]*)/.*'
_TEST = {
- u"url": u"https://www.kickstarter.com/projects/1404461844/intersection-the-story-of-josh-grant?ref=home_location",
- u"file": u"1404461844.mp4",
- u"md5": u"c81addca81327ffa66c642b5d8b08cab",
- u"info_dict": {
- u"title": u"Intersection: The Story of Josh Grant by Kyle Cowling",
+ 'url': 'https://www.kickstarter.com/projects/1404461844/intersection-the-story-of-josh-grant?ref=home_location',
+ 'md5': 'c81addca81327ffa66c642b5d8b08cab',
+ 'info_dict': {
+ 'id': '1404461844',
+ 'ext': 'mp4',
+ 'title': 'Intersection: The Story of Josh Grant by Kyle Cowling',
+ 'description': 'A unique motocross documentary that examines the '
+ 'life and mind of one of sports most elite athletes: Josh Grant.',
},
}
def _real_extract(self, url):
m = re.match(self._VALID_URL, url)
video_id = m.group('id')
- webpage_src = self._download_webpage(url, video_id)
+ webpage = self._download_webpage(url, video_id)
- video_url = self._search_regex(r'data-video="(.*?)">',
- webpage_src, u'video URL')
- if 'mp4' in video_url:
- ext = 'mp4'
- else:
- ext = 'flv'
- video_title = self._html_search_regex(r"<title>(.*?)</title>",
- webpage_src, u'title').rpartition(u'\u2014 Kickstarter')[0].strip()
+ video_url = self._search_regex(r'data-video-url="(.*?)"',
+ webpage, 'video URL')
+ video_title = self._html_search_regex(r'<title>(.*?)</title>',
+ webpage, 'title').rpartition('— Kickstarter')[0].strip()
- results = [{
- 'id': video_id,
- 'url': video_url,
- 'title': video_title,
- 'ext': ext,
- }]
- return results
+ return {
+ 'id': video_id,
+ 'url': video_url,
+ 'title': video_title,
+ 'description': self._og_search_description(webpage),
+ 'thumbnail': self._og_search_thumbnail(webpage),
+ }
diff --git a/youtube_dl/extractor/kontrtube.py b/youtube_dl/extractor/kontrtube.py
index 1b45b67..5341ac7 100644
--- a/youtube_dl/extractor/kontrtube.py
+++ b/youtube_dl/extractor/kontrtube.py
@@ -4,6 +4,7 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..utils import int_or_none
class KontrTubeIE(InfoExtractor):
@@ -32,27 +33,26 @@ class KontrTubeIE(InfoExtractor):
video_url = self._html_search_regex(r"video_url: '(.+?)/?',", webpage, 'video URL')
thumbnail = self._html_search_regex(r"preview_url: '(.+?)/?',", webpage, 'video thumbnail', fatal=False)
- title = self._html_search_regex(r'<title>(.+?) - Труба зовёт - Интересный видеохостинг</title>', webpage,
- 'video title')
+ title = self._html_search_regex(
+ r'<title>(.+?) - Труба зовёт - Интересный видеохостинг</title>', webpage, 'video title')
description = self._html_search_meta('description', webpage, 'video description')
- mobj = re.search(r'<div class="col_2">Длительность: <span>(?P<minutes>\d+)м:(?P<seconds>\d+)с</span></div>',
- webpage)
+ mobj = re.search(
+ r'<div class="col_2">Длительность: <span>(?P<minutes>\d+)м:(?P<seconds>\d+)с</span></div>', webpage)
duration = int(mobj.group('minutes')) * 60 + int(mobj.group('seconds')) if mobj else None
- view_count = self._html_search_regex(r'<div class="col_2">Просмотров: <span>(\d+)</span></div>', webpage,
- 'view count', fatal=False)
- view_count = int(view_count) if view_count is not None else None
+ view_count = self._html_search_regex(
+ r'<div class="col_2">Просмотров: <span>(\d+)</span></div>', webpage, 'view count', fatal=False)
comment_count = None
- comment_str = self._html_search_regex(r'Комментарии: <span>([^<]+)</span>', webpage, 'comment count',
- fatal=False)
+ comment_str = self._html_search_regex(
+ r'Комментарии: <span>([^<]+)</span>', webpage, 'comment count', fatal=False)
if comment_str.startswith('комментариев нет'):
comment_count = 0
else:
mobj = re.search(r'\d+ из (?P<total>\d+) комментариев', comment_str)
if mobj:
- comment_count = int(mobj.group('total'))
+ comment_count = mobj.group('total')
return {
'id': video_id,
@@ -61,6 +61,6 @@ class KontrTubeIE(InfoExtractor):
'title': title,
'description': description,
'duration': duration,
- 'view_count': view_count,
- 'comment_count': comment_count,
+ 'view_count': int_or_none(view_count),
+ 'comment_count': int_or_none(comment_count),
} \ No newline at end of file
diff --git a/youtube_dl/extractor/ku6.py b/youtube_dl/extractor/ku6.py
new file mode 100644
index 0000000..484239b
--- /dev/null
+++ b/youtube_dl/extractor/ku6.py
@@ -0,0 +1,35 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class Ku6IE(InfoExtractor):
+ _VALID_URL = r'http://v\.ku6\.com/show/(?P<id>[a-zA-Z0-9\-\_]+)(?:\.)*html'
+ _TEST = {
+ 'url': 'http://v.ku6.com/show/JG-8yS14xzBr4bCn1pu0xw...html',
+ 'md5': '01203549b9efbb45f4b87d55bdea1ed1',
+ 'info_dict': {
+ 'id': 'JG-8yS14xzBr4bCn1pu0xw',
+ 'ext': 'f4v',
+ 'title': 'techniques test',
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+
+ webpage = self._download_webpage(url, video_id)
+ title = self._search_regex(r'<h1 title=.*>(.*?)</h1>', webpage, 'title')
+ dataUrl = 'http://v.ku6.com/fetchVideo4Player/%s.html' % video_id
+ jsonData = self._download_json(dataUrl, video_id)
+ downloadUrl = jsonData['data']['f']
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'url': downloadUrl
+ }
+
diff --git a/youtube_dl/extractor/lifenews.py b/youtube_dl/extractor/lifenews.py
index 7b7185f..7a431a2 100644
--- a/youtube_dl/extractor/lifenews.py
+++ b/youtube_dl/extractor/lifenews.py
@@ -6,7 +6,8 @@ import re
from .common import InfoExtractor
from ..utils import (
int_or_none,
- unified_strdate
+ unified_strdate,
+ ExtractorError,
)
@@ -32,13 +33,11 @@ class LifeNewsIE(InfoExtractor):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
- webpage = self._download_webpage('http://lifenews.ru/mobile/news/%s' % video_id, video_id, 'Downloading page')
+ webpage = self._download_webpage('http://lifenews.ru/news/%s' % video_id, video_id, 'Downloading page')
- video_url = self._html_search_regex(
- r'<video.*?src="([^"]+)".*?></video>', webpage, 'video URL')
-
- thumbnail = self._html_search_regex(
- r'<video.*?poster="([^"]+)".*?"></video>', webpage, 'video thumbnail')
+ videos = re.findall(r'<video.*?poster="(?P<poster>[^"]+)".*?src="(?P<video>[^"]+)".*?></video>', webpage)
+ if not videos:
+ raise ExtractorError('No media links available for %s' % video_id)
title = self._og_search_title(webpage)
TITLE_SUFFIX = ' - Первый по срочным новостям — LIFE | NEWS'
@@ -50,20 +49,26 @@ class LifeNewsIE(InfoExtractor):
view_count = self._html_search_regex(
r'<div class=\'views\'>(\d+)</div>', webpage, 'view count', fatal=False)
comment_count = self._html_search_regex(
- r'<div class=\'comments\'>(\d+)</div>', webpage, 'comment count', fatal=False)
+ r'<div class=\'comments\'>\s*<span class=\'counter\'>(\d+)</span>', webpage, 'comment count', fatal=False)
upload_date = self._html_search_regex(
r'<time datetime=\'([^\']+)\'>', webpage, 'upload date',fatal=False)
if upload_date is not None:
upload_date = unified_strdate(upload_date)
- return {
- 'id': video_id,
- 'url': video_url,
- 'thumbnail': thumbnail,
- 'title': title,
- 'description': description,
- 'view_count': int_or_none(view_count),
- 'comment_count': int_or_none(comment_count),
- 'upload_date': upload_date,
- } \ No newline at end of file
+ def make_entry(video_id, media, video_number=None):
+ return {
+ 'id': video_id,
+ 'url': media[1],
+ 'thumbnail': media[0],
+ 'title': title if video_number is None else '%s-video%s' % (title, video_number),
+ 'description': description,
+ 'view_count': int_or_none(view_count),
+ 'comment_count': int_or_none(comment_count),
+ 'upload_date': upload_date,
+ }
+
+ if len(videos) == 1:
+ return make_entry(video_id, videos[0])
+ else:
+ return [make_entry(video_id, media, video_number+1) for video_number, media in enumerate(videos)] \ No newline at end of file
diff --git a/youtube_dl/extractor/liveleak.py b/youtube_dl/extractor/liveleak.py
index 0a700d6..8e50e8f 100644
--- a/youtube_dl/extractor/liveleak.py
+++ b/youtube_dl/extractor/liveleak.py
@@ -4,15 +4,17 @@ import json
import re
from .common import InfoExtractor
+from ..utils import int_or_none
class LiveLeakIE(InfoExtractor):
_VALID_URL = r'^(?:http://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
_TESTS = [{
'url': 'http://www.liveleak.com/view?i=757_1364311680',
- 'file': '757_1364311680.mp4',
'md5': '0813c2430bea7a46bf13acf3406992f4',
'info_dict': {
+ 'id': '757_1364311680',
+ 'ext': 'mp4',
'description': 'extremely bad day for this guy..!',
'uploader': 'ljfriel2',
'title': 'Most unlucky car accident'
@@ -20,25 +22,62 @@ class LiveLeakIE(InfoExtractor):
},
{
'url': 'http://www.liveleak.com/view?i=f93_1390833151',
- 'file': 'f93_1390833151.mp4',
'md5': 'd3f1367d14cc3c15bf24fbfbe04b9abf',
'info_dict': {
+ 'id': 'f93_1390833151',
+ 'ext': 'mp4',
'description': 'German Television Channel NDR does an exclusive interview with Edward Snowden.\r\nUploaded on LiveLeak cause German Television thinks the rest of the world isn\'t intereseted in Edward Snowden.',
'uploader': 'ARD_Stinkt',
'title': 'German Television does first Edward Snowden Interview (ENGLISH)',
}
+ },
+ {
+ 'url': 'http://www.liveleak.com/view?i=4f7_1392687779',
+ 'md5': '42c6d97d54f1db107958760788c5f48f',
+ 'info_dict': {
+ 'id': '4f7_1392687779',
+ 'ext': 'mp4',
+ 'description': "The guy with the cigarette seems amazingly nonchalant about the whole thing... I really hope my friends' reactions would be a bit stronger.\r\n\r\nAction-go to 0:55.",
+ 'uploader': 'CapObveus',
+ 'title': 'Man is Fatally Struck by Reckless Car While Packing up a Moving Truck',
+ 'age_limit': 18,
+ }
}]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
-
video_id = mobj.group('video_id')
webpage = self._download_webpage(url, video_id)
+
+ video_title = self._og_search_title(webpage).replace('LiveLeak.com -', '').strip()
+ video_description = self._og_search_description(webpage)
+ video_uploader = self._html_search_regex(
+ r'By:.*?(\w+)</a>', webpage, 'uploader', fatal=False)
+ age_limit = int_or_none(self._search_regex(
+ r'you confirm that you are ([0-9]+) years and over.',
+ webpage, 'age limit', default=None))
+
sources_raw = self._search_regex(
r'(?s)sources:\s*(\[.*?\]),', webpage, 'video URLs', default=None)
if sources_raw is None:
- sources_raw = '[{ %s}]' % (
- self._search_regex(r'(file: ".*?"),', webpage, 'video URL'))
+ alt_source = self._search_regex(
+ r'(file: ".*?"),', webpage, 'video URL', default=None)
+ if alt_source:
+ sources_raw = '[{ %s}]' % alt_source
+ else:
+ # Maybe an embed?
+ embed_url = self._search_regex(
+ r'<iframe[^>]+src="(http://www.prochan.com/embed\?[^"]+)"',
+ webpage, 'embed URL')
+ return {
+ '_type': 'url_transparent',
+ 'url': embed_url,
+ 'id': video_id,
+ 'title': video_title,
+ 'description': video_description,
+ 'uploader': video_uploader,
+ 'age_limit': age_limit,
+ }
sources_json = re.sub(r'\s([a-z]+):\s', r'"\1": ', sources_raw)
sources = json.loads(sources_json)
@@ -49,15 +88,11 @@ class LiveLeakIE(InfoExtractor):
} for s in sources]
self._sort_formats(formats)
- video_title = self._og_search_title(webpage).replace('LiveLeak.com -', '').strip()
- video_description = self._og_search_description(webpage)
- video_uploader = self._html_search_regex(
- r'By:.*?(\w+)</a>', webpage, 'uploader', fatal=False)
-
return {
'id': video_id,
'title': video_title,
'description': video_description,
'uploader': video_uploader,
'formats': formats,
+ 'age_limit': age_limit,
}
diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py
index 6deed27..33f34f4 100644
--- a/youtube_dl/extractor/lynda.py
+++ b/youtube_dl/extractor/lynda.py
@@ -8,7 +8,9 @@ from .common import InfoExtractor
from ..utils import (
compat_urllib_parse,
compat_urllib_request,
- ExtractorError
+ ExtractorError,
+ int_or_none,
+ compat_str,
)
@@ -19,16 +21,17 @@ class LyndaIE(SubtitlesInfoExtractor):
_LOGIN_URL = 'https://www.lynda.com/login/login.aspx'
_NETRC_MACHINE = 'lynda'
- _SUCCESSFUL_LOGIN_REGEX = r'<a href="https://www.lynda.com/home/userAccount/ChangeContactInfo.aspx" data-qa="eyebrow_account_menu">My account'
+ _SUCCESSFUL_LOGIN_REGEX = r'isLoggedIn: true'
_TIMECODE_REGEX = r'\[(?P<timecode>\d+:\d+:\d+[\.,]\d+)\]'
ACCOUNT_CREDENTIALS_HINT = 'Use --username and --password options to provide lynda.com account credentials.'
_TEST = {
'url': 'http://www.lynda.com/Bootstrap-tutorials/Using-exercise-files/110885/114408-4.html',
- 'file': '114408.mp4',
'md5': 'ecfc6862da89489161fb9cd5f5a6fac1',
'info_dict': {
+ 'id': '114408',
+ 'ext': 'mp4',
'title': 'Using the exercise files',
'duration': 68
}
@@ -41,27 +44,44 @@ class LyndaIE(SubtitlesInfoExtractor):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group(1)
- page = self._download_webpage('http://www.lynda.com/ajax/player?videoId=%s&type=video' % video_id,
- video_id, 'Downloading video JSON')
+ page = self._download_webpage('http://www.lynda.com/ajax/player?videoId=%s&type=video' % video_id, video_id,
+ 'Downloading video JSON')
video_json = json.loads(page)
if 'Status' in video_json:
raise ExtractorError('lynda returned error: %s' % video_json['Message'], expected=True)
if video_json['HasAccess'] is False:
- raise ExtractorError('Video %s is only available for members. ' % video_id + self.ACCOUNT_CREDENTIALS_HINT, expected=True)
+ raise ExtractorError(
+ 'Video %s is only available for members. ' % video_id + self.ACCOUNT_CREDENTIALS_HINT, expected=True)
- video_id = video_json['ID']
+ video_id = compat_str(video_json['ID'])
duration = video_json['DurationInSeconds']
title = video_json['Title']
- formats = [{'url': fmt['Url'],
+ formats = []
+
+ fmts = video_json.get('Formats')
+ if fmts:
+ formats.extend([
+ {
+ 'url': fmt['Url'],
'ext': fmt['Extension'],
'width': fmt['Width'],
'height': fmt['Height'],
'filesize': fmt['FileSize'],
'format_id': str(fmt['Resolution'])
- } for fmt in video_json['Formats']]
+ } for fmt in fmts])
+
+ prioritized_streams = video_json.get('PrioritizedStreams')
+ if prioritized_streams:
+ formats.extend([
+ {
+ 'url': video_url,
+ 'width': int_or_none(format_id),
+ 'format_id': format_id,
+ } for format_id, video_url in prioritized_streams['0'].items()
+ ])
self._sort_formats(formats)
@@ -91,7 +111,7 @@ class LyndaIE(SubtitlesInfoExtractor):
'stayPut': 'false'
}
request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
- login_page = self._download_webpage(request, None, note='Logging in as %s' % username)
+ login_page = self._download_webpage(request, None, 'Logging in as %s' % username)
# Not (yet) logged in
m = re.search(r'loginResultJson = \'(?P<json>[^\']+)\';', login_page)
@@ -116,7 +136,7 @@ class LyndaIE(SubtitlesInfoExtractor):
'stayPut': 'false',
}
request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(confirm_form))
- login_page = self._download_webpage(request, None, note='Confirming log in and log out from another device')
+ login_page = self._download_webpage(request, None, 'Confirming log in and log out from another device')
if re.search(self._SUCCESSFUL_LOGIN_REGEX, login_page) is None:
raise ExtractorError('Unable to log in')
@@ -150,7 +170,7 @@ class LyndaIE(SubtitlesInfoExtractor):
def _get_available_subtitles(self, video_id, webpage):
url = 'http://www.lynda.com/ajax/player?videoId=%s&type=transcript' % video_id
- sub = self._download_webpage(url, None, note=False)
+ sub = self._download_webpage(url, None, False)
sub_json = json.loads(sub)
return {'en': url} if len(sub_json) > 0 else {}
@@ -179,6 +199,9 @@ class LyndaCourseIE(InfoExtractor):
videos = []
(username, _) = self._get_login_info()
+ # Might want to extract videos right here from video['Formats'] as it seems 'Formats' is not provided
+ # by single video API anymore
+
for chapter in course_json['Chapters']:
for video in chapter['Videos']:
if username is None and video['HasAccess'] is False:
diff --git a/youtube_dl/extractor/mailru.py b/youtube_dl/extractor/mailru.py
new file mode 100644
index 0000000..7460d81
--- /dev/null
+++ b/youtube_dl/extractor/mailru.py
@@ -0,0 +1,86 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class MailRuIE(InfoExtractor):
+ IE_NAME = 'mailru'
+ IE_DESC = 'Видео@Mail.Ru'
+ _VALID_URL = r'http://(?:www\.)?my\.mail\.ru/(?:video/.*#video=/?(?P<idv1>(?:[^/]+/){3}\d+)|(?:(?P<idv2prefix>(?:[^/]+/){2})video/(?P<idv2suffix>[^/]+/\d+))\.html)'
+
+ _TESTS = [
+ {
+ 'url': 'http://my.mail.ru/video/top#video=/mail/sonypicturesrus/75/76',
+ 'md5': 'dea205f03120046894db4ebb6159879a',
+ 'info_dict': {
+ 'id': '46301138',
+ 'ext': 'mp4',
+ 'title': 'Новый Человек-Паук. Высокое напряжение. Восстание Электро',
+ 'timestamp': 1393232740,
+ 'upload_date': '20140224',
+ 'uploader': 'sonypicturesrus',
+ 'uploader_id': 'sonypicturesrus@mail.ru',
+ 'duration': 184,
+ },
+ },
+ {
+ 'url': 'http://my.mail.ru/corp/hitech/video/news_hi-tech_mail_ru/1263.html',
+ 'md5': '00a91a58c3402204dcced523777b475f',
+ 'info_dict': {
+ 'id': '46843144',
+ 'ext': 'mp4',
+ 'title': 'Samsung Galaxy S5 Hammer Smash Fail Battery Explosion',
+ 'timestamp': 1397217632,
+ 'upload_date': '20140411',
+ 'uploader': 'hitech',
+ 'uploader_id': 'hitech@corp.mail.ru',
+ 'duration': 245,
+ },
+ },
+ ]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('idv1')
+
+ if not video_id:
+ video_id = mobj.group('idv2prefix') + mobj.group('idv2suffix')
+
+ video_data = self._download_json(
+ 'http://api.video.mail.ru/videos/%s.json?new=1' % video_id, video_id, 'Downloading video JSON')
+
+ author = video_data['author']
+ uploader = author['name']
+ uploader_id = author['id']
+
+ movie = video_data['movie']
+ content_id = str(movie['contentId'])
+ title = movie['title']
+ if title.endswith('.mp4'):
+ title = title[:-4]
+ thumbnail = movie['poster']
+ duration = movie['duration']
+
+ view_count = video_data['views_count']
+
+ formats = [
+ {
+ 'url': video['url'],
+ 'format_id': video['name'],
+ } for video in video_data['videos']
+ ]
+
+ return {
+ 'id': content_id,
+ 'title': title,
+ 'thumbnail': thumbnail,
+ 'timestamp': video_data['timestamp'],
+ 'uploader': uploader,
+ 'uploader_id': uploader_id,
+ 'duration': duration,
+ 'view_count': view_count,
+ 'formats': formats,
+ } \ No newline at end of file
diff --git a/youtube_dl/extractor/mdr.py b/youtube_dl/extractor/mdr.py
index 7aa0080..1b8c4a3 100644
--- a/youtube_dl/extractor/mdr.py
+++ b/youtube_dl/extractor/mdr.py
@@ -1,15 +1,18 @@
+from __future__ import unicode_literals
+
import re
from .common import InfoExtractor
-from ..utils import (
- ExtractorError,
-)
class MDRIE(InfoExtractor):
- _VALID_URL = r'^(?P<domain>(?:https?://)?(?:www\.)?mdr\.de)/mediathek/(?:.*)/(?P<type>video|audio)(?P<video_id>[^/_]+)_.*'
+ _VALID_URL = r'^(?P<domain>https?://(?:www\.)?mdr\.de)/(?:.*)/(?P<type>video|audio)(?P<video_id>[^/_]+)(?:_|\.html)'
# No tests, MDR regularily deletes its videos
+ _TEST = {
+ 'url': 'http://www.mdr.de/fakt/video189002.html',
+ 'only_matching': True,
+ }
def _real_extract(self, url):
m = re.match(self._VALID_URL, url)
@@ -19,9 +22,9 @@ class MDRIE(InfoExtractor):
# determine title and media streams from webpage
html = self._download_webpage(url, video_id)
- title = self._html_search_regex(r'<h2>(.*?)</h2>', html, u'title')
+ title = self._html_search_regex(r'<h[12]>(.*?)</h[12]>', html, 'title')
xmlurl = self._search_regex(
- r'(/mediathek/(?:.+)/(?:video|audio)[0-9]+-avCustom.xml)', html, u'XML URL')
+ r'dataURL:\'(/(?:.+)/(?:video|audio)[0-9]+-avCustom.xml)', html, 'XML URL')
doc = self._download_xml(domain + xmlurl, video_id)
formats = []
@@ -41,7 +44,7 @@ class MDRIE(InfoExtractor):
if vbr_el is None:
format.update({
'vcodec': 'none',
- 'format_id': u'%s-%d' % (media_type, abr),
+ 'format_id': '%s-%d' % (media_type, abr),
})
else:
vbr = int(vbr_el.text) // 1000
@@ -49,12 +52,9 @@ class MDRIE(InfoExtractor):
'vbr': vbr,
'width': int(a.find('frameWidth').text),
'height': int(a.find('frameHeight').text),
- 'format_id': u'%s-%d' % (media_type, vbr),
+ 'format_id': '%s-%d' % (media_type, vbr),
})
formats.append(format)
- if not formats:
- raise ExtractorError(u'Could not find any valid formats')
-
self._sort_formats(formats)
return {
diff --git a/youtube_dl/extractor/metacafe.py b/youtube_dl/extractor/metacafe.py
index 99d3c83..6436c05 100644
--- a/youtube_dl/extractor/metacafe.py
+++ b/youtube_dl/extractor/metacafe.py
@@ -1,3 +1,5 @@
+from __future__ import unicode_literals
+
import re
from .common import InfoExtractor
@@ -9,104 +11,103 @@ from ..utils import (
ExtractorError,
)
-class MetacafeIE(InfoExtractor):
- """Information Extractor for metacafe.com."""
- _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
+class MetacafeIE(InfoExtractor):
+ _VALID_URL = r'http://(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
_DISCLAIMER = 'http://www.metacafe.com/family_filter/'
_FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
- IE_NAME = u'metacafe'
+ IE_NAME = 'metacafe'
_TESTS = [
- # Youtube video
- {
- u"add_ie": ["Youtube"],
- u"url": u"http://metacafe.com/watch/yt-_aUehQsCQtM/the_electric_company_short_i_pbs_kids_go/",
- u"file": u"_aUehQsCQtM.mp4",
- u"info_dict": {
- u"upload_date": u"20090102",
- u"title": u"The Electric Company | \"Short I\" | PBS KIDS GO!",
- u"description": u"md5:2439a8ef6d5a70e380c22f5ad323e5a8",
- u"uploader": u"PBS",
- u"uploader_id": u"PBS"
- }
- },
- # Normal metacafe video
- {
- u'url': u'http://www.metacafe.com/watch/11121940/news_stuff_you_wont_do_with_your_playstation_4/',
- u'md5': u'6e0bca200eaad2552e6915ed6fd4d9ad',
- u'info_dict': {
- u'id': u'11121940',
- u'ext': u'mp4',
- u'title': u'News: Stuff You Won\'t Do with Your PlayStation 4',
- u'uploader': u'ign',
- u'description': u'Sony released a massive FAQ on the PlayStation Blog detailing the PS4\'s capabilities and limitations.',
+ # Youtube video
+ {
+ 'add_ie': ['Youtube'],
+ 'url': 'http://metacafe.com/watch/yt-_aUehQsCQtM/the_electric_company_short_i_pbs_kids_go/',
+ 'info_dict': {
+ 'id': '_aUehQsCQtM',
+ 'ext': 'mp4',
+ 'upload_date': '20090102',
+ 'title': 'The Electric Company | "Short I" | PBS KIDS GO!',
+ 'description': 'md5:2439a8ef6d5a70e380c22f5ad323e5a8',
+ 'uploader': 'PBS',
+ 'uploader_id': 'PBS'
+ }
},
- },
- # AnyClip video
- {
- u"url": u"http://www.metacafe.com/watch/an-dVVXnuY7Jh77J/the_andromeda_strain_1971_stop_the_bomb_part_3/",
- u"file": u"an-dVVXnuY7Jh77J.mp4",
- u"info_dict": {
- u"title": u"The Andromeda Strain (1971): Stop the Bomb Part 3",
- u"uploader": u"anyclip",
- u"description": u"md5:38c711dd98f5bb87acf973d573442e67",
+ # Normal metacafe video
+ {
+ 'url': 'http://www.metacafe.com/watch/11121940/news_stuff_you_wont_do_with_your_playstation_4/',
+ 'md5': '6e0bca200eaad2552e6915ed6fd4d9ad',
+ 'info_dict': {
+ 'id': '11121940',
+ 'ext': 'mp4',
+ 'title': 'News: Stuff You Won\'t Do with Your PlayStation 4',
+ 'uploader': 'ign',
+ 'description': 'Sony released a massive FAQ on the PlayStation Blog detailing the PS4\'s capabilities and limitations.',
+ },
},
- },
- # age-restricted video
- {
- u'url': u'http://www.metacafe.com/watch/5186653/bbc_internal_christmas_tape_79_uncensored_outtakes_etc/',
- u'md5': u'98dde7c1a35d02178e8ab7560fe8bd09',
- u'info_dict': {
- u'id': u'5186653',
- u'ext': u'mp4',
- u'title': u'BBC INTERNAL Christmas Tape \'79 - UNCENSORED Outtakes, Etc.',
- u'uploader': u'Dwayne Pipe',
- u'description': u'md5:950bf4c581e2c059911fa3ffbe377e4b',
- u'age_limit': 18,
+ # AnyClip video
+ {
+ 'url': 'http://www.metacafe.com/watch/an-dVVXnuY7Jh77J/the_andromeda_strain_1971_stop_the_bomb_part_3/',
+ 'info_dict': {
+ 'id': 'an-dVVXnuY7Jh77J',
+ 'ext': 'mp4',
+ 'title': 'The Andromeda Strain (1971): Stop the Bomb Part 3',
+ 'uploader': 'anyclip',
+ 'description': 'md5:38c711dd98f5bb87acf973d573442e67',
+ },
},
- },
- # cbs video
- {
- u'url': u'http://www.metacafe.com/watch/cb-0rOxMBabDXN6/samsung_galaxy_note_2_samsungs_next_generation_phablet/',
- u'info_dict': {
- u'id': u'0rOxMBabDXN6',
- u'ext': u'flv',
- u'title': u'Samsung Galaxy Note 2: Samsung\'s next-generation phablet',
- u'description': u'md5:54d49fac53d26d5a0aaeccd061ada09d',
- u'duration': 129,
+ # age-restricted video
+ {
+ 'url': 'http://www.metacafe.com/watch/5186653/bbc_internal_christmas_tape_79_uncensored_outtakes_etc/',
+ 'md5': '98dde7c1a35d02178e8ab7560fe8bd09',
+ 'info_dict': {
+ 'id': '5186653',
+ 'ext': 'mp4',
+ 'title': 'BBC INTERNAL Christmas Tape \'79 - UNCENSORED Outtakes, Etc.',
+ 'uploader': 'Dwayne Pipe',
+ 'description': 'md5:950bf4c581e2c059911fa3ffbe377e4b',
+ 'age_limit': 18,
+ },
},
- u'params': {
- # rtmp download
- u'skip_download': True,
+ # cbs video
+ {
+ 'url': 'http://www.metacafe.com/watch/cb-8VD4r_Zws8VP/open_this_is_face_the_nation_february_9/',
+ 'info_dict': {
+ 'id': '8VD4r_Zws8VP',
+ 'ext': 'flv',
+ 'title': 'Open: This is Face the Nation, February 9',
+ 'description': 'md5:8a9ceec26d1f7ed6eab610834cc1a476',
+ 'duration': 96,
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
},
- },
]
-
def report_disclaimer(self):
- """Report disclaimer retrieval."""
- self.to_screen(u'Retrieving disclaimer')
+ self.to_screen('Retrieving disclaimer')
def _real_initialize(self):
# Retrieve disclaimer
self.report_disclaimer()
- self._download_webpage(self._DISCLAIMER, None, False, u'Unable to retrieve disclaimer')
+ self._download_webpage(self._DISCLAIMER, None, False, 'Unable to retrieve disclaimer')
# Confirm age
disclaimer_form = {
'filters': '0',
'submit': "Continue - I'm over 18",
- }
+ }
request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
request.add_header('Content-Type', 'application/x-www-form-urlencoded')
self.report_age_confirmation()
- self._download_webpage(request, None, False, u'Unable to confirm age')
+ self._download_webpage(request, None, False, 'Unable to confirm age')
def _real_extract(self, url):
# Extract id and simplified title from URL
mobj = re.match(self._VALID_URL, url)
if mobj is None:
- raise ExtractorError(u'Invalid URL: %s' % url)
+ raise ExtractorError('Invalid URL: %s' % url)
video_id = mobj.group(1)
@@ -153,22 +154,24 @@ class MetacafeIE(InfoExtractor):
else:
mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
if mobj is None:
- raise ExtractorError(u'Unable to extract media URL')
+ raise ExtractorError('Unable to extract media URL')
vardict = compat_parse_qs(mobj.group(1))
if 'mediaData' not in vardict:
- raise ExtractorError(u'Unable to extract media URL')
- mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
+ raise ExtractorError('Unable to extract media URL')
+ mobj = re.search(
+ r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
if mobj is None:
- raise ExtractorError(u'Unable to extract media URL')
+ raise ExtractorError('Unable to extract media URL')
mediaURL = mobj.group('mediaURL').replace('\\/', '/')
video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))
video_ext = determine_ext(video_url)
- video_title = self._html_search_regex(r'(?im)<title>(.*) - Video</title>', webpage, u'title')
+ video_title = self._html_search_regex(r'(?im)<title>(.*) - Video</title>', webpage, 'title')
description = self._og_search_description(webpage)
+ thumbnail = self._og_search_thumbnail(webpage)
video_uploader = self._html_search_regex(
r'submitter=(.*?);|googletag\.pubads\(\)\.setTargeting\("(?:channel|submiter)","([^"]+)"\);',
- webpage, u'uploader nickname', fatal=False)
+ webpage, 'uploader nickname', fatal=False)
if re.search(r'"contentRating":"restricted"', webpage) is not None:
age_limit = 18
@@ -176,13 +179,12 @@ class MetacafeIE(InfoExtractor):
age_limit = 0
return {
- '_type': 'video',
- 'id': video_id,
- 'url': video_url,
+ 'id': video_id,
+ 'url': video_url,
'description': description,
'uploader': video_uploader,
- 'upload_date': None,
- 'title': video_title,
- 'ext': video_ext,
+ 'title': video_title,
+ 'thumbnail':thumbnail,
+ 'ext': video_ext,
'age_limit': age_limit,
}
diff --git a/youtube_dl/extractor/metacritic.py b/youtube_dl/extractor/metacritic.py
index 465ac49..07f0729 100644
--- a/youtube_dl/extractor/metacritic.py
+++ b/youtube_dl/extractor/metacritic.py
@@ -13,8 +13,9 @@ class MetacriticIE(InfoExtractor):
_TEST = {
'url': 'http://www.metacritic.com/game/playstation-4/infamous-second-son/trailers/3698222',
- 'file': '3698222.mp4',
'info_dict': {
+ 'id': '3698222',
+ 'ext': 'mp4',
'title': 'inFamous: Second Son - inSide Sucker Punch: Smoke & Mirrors',
'description': 'Take a peak behind-the-scenes to see how Sucker Punch brings smoke into the universe of inFAMOUS Second Son on the PS4.',
'duration': 221,
diff --git a/youtube_dl/extractor/mit.py b/youtube_dl/extractor/mit.py
index 76b717f..807b1dc 100644
--- a/youtube_dl/extractor/mit.py
+++ b/youtube_dl/extractor/mit.py
@@ -1,24 +1,30 @@
+from __future__ import unicode_literals
+
import re
import json
from .common import InfoExtractor
+from .youtube import YoutubeIE
from ..utils import (
+ compat_urlparse,
clean_html,
+ ExtractorError,
get_element_by_id,
)
class TechTVMITIE(InfoExtractor):
- IE_NAME = u'techtv.mit.edu'
+ IE_NAME = 'techtv.mit.edu'
_VALID_URL = r'https?://techtv\.mit\.edu/(videos|embeds)/(?P<id>\d+)'
_TEST = {
- u'url': u'http://techtv.mit.edu/videos/25418-mit-dna-learning-center-set',
- u'file': u'25418.mp4',
- u'md5': u'1f8cb3e170d41fd74add04d3c9330e5f',
- u'info_dict': {
- u'title': u'MIT DNA Learning Center Set',
- u'description': u'md5:82313335e8a8a3f243351ba55bc1b474',
+ 'url': 'http://techtv.mit.edu/videos/25418-mit-dna-learning-center-set',
+ 'md5': '1f8cb3e170d41fd74add04d3c9330e5f',
+ 'info_dict': {
+ 'id': '25418',
+ 'ext': 'mp4',
+ 'title': 'MIT DNA Learning Center Set',
+ 'description': 'md5:82313335e8a8a3f243351ba55bc1b474',
},
}
@@ -27,12 +33,12 @@ class TechTVMITIE(InfoExtractor):
video_id = mobj.group('id')
raw_page = self._download_webpage(
'http://techtv.mit.edu/videos/%s' % video_id, video_id)
- clean_page = re.compile(u'<!--.*?-->', re.S).sub(u'', raw_page)
+ clean_page = re.compile(r'<!--.*?-->', re.S).sub('', raw_page)
- base_url = self._search_regex(r'ipadUrl: \'(.+?cloudfront.net/)',
- raw_page, u'base url')
- formats_json = self._search_regex(r'bitrates: (\[.+?\])', raw_page,
- u'video formats')
+ base_url = self._search_regex(
+ r'ipadUrl: \'(.+?cloudfront.net/)', raw_page, 'base url')
+ formats_json = self._search_regex(
+ r'bitrates: (\[.+?\])', raw_page, 'video formats')
formats_mit = json.loads(formats_json)
formats = [
{
@@ -48,28 +54,31 @@ class TechTVMITIE(InfoExtractor):
title = get_element_by_id('edit-title', clean_page)
description = clean_html(get_element_by_id('edit-description', clean_page))
- thumbnail = self._search_regex(r'playlist:.*?url: \'(.+?)\'',
- raw_page, u'thumbnail', flags=re.DOTALL)
+ thumbnail = self._search_regex(
+ r'playlist:.*?url: \'(.+?)\'',
+ raw_page, 'thumbnail', flags=re.DOTALL)
- return {'id': video_id,
- 'title': title,
- 'formats': formats,
- 'description': description,
- 'thumbnail': thumbnail,
- }
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'formats': formats,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ }
class MITIE(TechTVMITIE):
- IE_NAME = u'video.mit.edu'
+ IE_NAME = 'video.mit.edu'
_VALID_URL = r'https?://video\.mit\.edu/watch/(?P<title>[^/]+)'
_TEST = {
- u'url': u'http://video.mit.edu/watch/the-government-is-profiling-you-13222/',
- u'file': u'21783.mp4',
- u'md5': u'7db01d5ccc1895fc5010e9c9e13648da',
- u'info_dict': {
- u'title': u'The Government is Profiling You',
- u'description': u'md5:ad5795fe1e1623b73620dbfd47df9afd',
+ 'url': 'http://video.mit.edu/watch/the-government-is-profiling-you-13222/',
+ 'md5': '7db01d5ccc1895fc5010e9c9e13648da',
+ 'info_dict': {
+ 'id': '21783',
+ 'ext': 'mp4',
+ 'title': 'The Government is Profiling You',
+ 'description': 'md5:ad5795fe1e1623b73620dbfd47df9afd',
},
}
@@ -77,7 +86,73 @@ class MITIE(TechTVMITIE):
mobj = re.match(self._VALID_URL, url)
page_title = mobj.group('title')
webpage = self._download_webpage(url, page_title)
- self.to_screen('%s: Extracting %s url' % (page_title, TechTVMITIE.IE_NAME))
- embed_url = self._search_regex(r'<iframe .*?src="(.+?)"', webpage,
- u'embed url')
+ embed_url = self._search_regex(
+ r'<iframe .*?src="(.+?)"', webpage, 'embed url')
return self.url_result(embed_url, ie='TechTVMIT')
+
+
+class OCWMITIE(InfoExtractor):
+ IE_NAME = 'ocw.mit.edu'
+ _VALID_URL = r'^http://ocw\.mit\.edu/courses/(?P<topic>[a-z0-9\-]+)'
+ _BASE_URL = 'http://ocw.mit.edu/'
+
+ _TESTS = [
+ {
+ 'url': 'http://ocw.mit.edu/courses/electrical-engineering-and-computer-science/6-041-probabilistic-systems-analysis-and-applied-probability-fall-2010/video-lectures/lecture-7-multiple-variables-expectations-independence/',
+ 'info_dict': {
+ 'id': 'EObHWIEKGjA',
+ 'ext': 'mp4',
+ 'title': 'Lecture 7: Multiple Discrete Random Variables: Expectations, Conditioning, Independence',
+ 'description': 'In this lecture, the professor discussed multiple random variables, expectations, and binomial distribution.',
+ #'subtitles': 'http://ocw.mit.edu/courses/electrical-engineering-and-computer-science/6-041-probabilistic-systems-analysis-and-applied-probability-fall-2010/video-lectures/lecture-7-multiple-variables-expectations-independence/MIT6_041F11_lec07_300k.mp4.srt'
+ }
+ },
+ {
+ 'url': 'http://ocw.mit.edu/courses/mathematics/18-01sc-single-variable-calculus-fall-2010/1.-differentiation/part-a-definition-and-basic-rules/session-1-introduction-to-derivatives/',
+ 'info_dict': {
+ 'id': '7K1sB05pE0A',
+ 'ext': 'mp4',
+ 'title': 'Session 1: Introduction to Derivatives',
+ 'description': 'This section contains lecture video excerpts, lecture notes, an interactive mathlet with supporting documents, and problem solving videos.',
+ #'subtitles': 'http://ocw.mit.edu//courses/mathematics/18-01sc-single-variable-calculus-fall-2010/ocw-18.01-f07-lec01_300k.SRT'
+ }
+ }
+ ]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ topic = mobj.group('topic')
+
+ webpage = self._download_webpage(url, topic)
+ title = self._html_search_meta('WT.cg_s', webpage)
+ description = self._html_search_meta('Description', webpage)
+
+ # search for call to ocw_embed_chapter_media(container_id, media_url, provider, page_url, image_url, start, stop, captions_file)
+ embed_chapter_media = re.search(r'ocw_embed_chapter_media\((.+?)\)', webpage)
+ if embed_chapter_media:
+ metadata = re.sub(r'[\'"]', '', embed_chapter_media.group(1))
+ metadata = re.split(r', ?', metadata)
+ yt = metadata[1]
+ subs = compat_urlparse.urljoin(self._BASE_URL, metadata[7])
+ else:
+ # search for call to ocw_embed_chapter_media(container_id, media_url, provider, page_url, image_url, captions_file)
+ embed_media = re.search(r'ocw_embed_media\((.+?)\)', webpage)
+ if embed_media:
+ metadata = re.sub(r'[\'"]', '', embed_media.group(1))
+ metadata = re.split(r', ?', metadata)
+ yt = metadata[1]
+ subs = compat_urlparse.urljoin(self._BASE_URL, metadata[5])
+ else:
+ raise ExtractorError('Unable to find embedded YouTube video.')
+ video_id = YoutubeIE.extract_id(yt)
+
+ return {
+ '_type': 'url_transparent',
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'url': yt,
+ 'url_transparent'
+ 'subtitles': subs,
+ 'ie_key': 'Youtube',
+ }
diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py
index f3356db..5f64e7b 100644
--- a/youtube_dl/extractor/mixcloud.py
+++ b/youtube_dl/extractor/mixcloud.py
@@ -4,24 +4,31 @@ import re
from .common import InfoExtractor
from ..utils import (
- unified_strdate,
+ compat_urllib_parse,
ExtractorError,
+ int_or_none,
+ parse_iso8601,
)
class MixcloudIE(InfoExtractor):
- _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
+ _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([^/]+)/([^/]+)'
IE_NAME = 'mixcloud'
_TEST = {
'url': 'http://www.mixcloud.com/dholbach/cryptkeeper/',
- 'file': 'dholbach-cryptkeeper.mp3',
'info_dict': {
+ 'id': 'dholbach-cryptkeeper',
+ 'ext': 'mp3',
'title': 'Cryptkeeper',
'description': 'After quite a long silence from myself, finally another Drum\'n\'Bass mix with my favourite current dance floor bangers.',
'uploader': 'Daniel Holbach',
'uploader_id': 'dholbach',
'upload_date': '20111115',
+ 'timestamp': 1321359578,
+ 'thumbnail': 're:https?://.*\.jpg',
+ 'view_count': int,
+ 'like_count': int,
},
}
@@ -45,14 +52,10 @@ class MixcloudIE(InfoExtractor):
mobj = re.match(self._VALID_URL, url)
uploader = mobj.group(1)
cloudcast_name = mobj.group(2)
- track_id = '-'.join((uploader, cloudcast_name))
+ track_id = compat_urllib_parse.unquote('-'.join((uploader, cloudcast_name)))
webpage = self._download_webpage(url, track_id)
- api_url = 'http://api.mixcloud.com/%s/%s/' % (uploader, cloudcast_name)
- info = self._download_json(
- api_url, track_id, 'Downloading cloudcast info')
-
preview_url = self._search_regex(
r'\s(?:data-preview-url|m-preview)="(.+?)"', webpage, 'preview url')
song_url = preview_url.replace('/previews/', '/c/originals/')
@@ -63,16 +66,41 @@ class MixcloudIE(InfoExtractor):
template_url = template_url.replace('.mp3', '.m4a').replace('originals/', 'm4a/64/')
final_song_url = self._get_url(template_url)
if final_song_url is None:
- raise ExtractorError(u'Unable to extract track url')
+ raise ExtractorError('Unable to extract track url')
+
+ PREFIX = (
+ r'<div class="cloudcast-play-button-container"'
+ r'(?:\s+[a-zA-Z0-9-]+(?:="[^"]+")?)*?\s+')
+ title = self._html_search_regex(
+ PREFIX + r'm-title="([^"]+)"', webpage, 'title')
+ thumbnail = self._proto_relative_url(self._html_search_regex(
+ PREFIX + r'm-thumbnail-url="([^"]+)"', webpage, 'thumbnail',
+ fatal=False))
+ uploader = self._html_search_regex(
+ PREFIX + r'm-owner-name="([^"]+)"',
+ webpage, 'uploader', fatal=False)
+ uploader_id = self._search_regex(
+ r'\s+"profile": "([^"]+)",', webpage, 'uploader id', fatal=False)
+ description = self._og_search_description(webpage)
+ like_count = int_or_none(self._search_regex(
+ r'<meta itemprop="interactionCount" content="UserLikes:([0-9]+)"',
+ webpage, 'like count', fatal=False))
+ view_count = int_or_none(self._search_regex(
+ r'<meta itemprop="interactionCount" content="UserPlays:([0-9]+)"',
+ webpage, 'play count', fatal=False))
+ timestamp = parse_iso8601(self._search_regex(
+ r'<time itemprop="dateCreated" datetime="([^"]+)">',
+ webpage, 'upload date'))
return {
'id': track_id,
- 'title': info['name'],
+ 'title': title,
'url': final_song_url,
- 'description': info.get('description'),
- 'thumbnail': info['pictures'].get('extra_large'),
- 'uploader': info['user']['name'],
- 'uploader_id': info['user']['username'],
- 'upload_date': unified_strdate(info['created_time']),
- 'view_count': info['play_count'],
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'uploader': uploader,
+ 'uploader_id': uploader_id,
+ 'timestamp': timestamp,
+ 'view_count': view_count,
+ 'like_count': like_count,
}
diff --git a/youtube_dl/extractor/mooshare.py b/youtube_dl/extractor/mooshare.py
index f1875ad..7d21ea1 100644
--- a/youtube_dl/extractor/mooshare.py
+++ b/youtube_dl/extractor/mooshare.py
@@ -14,7 +14,7 @@ from ..utils import (
class MooshareIE(InfoExtractor):
IE_NAME = 'mooshare'
IE_DESC = 'Mooshare.biz'
- _VALID_URL = r'http://mooshare\.biz/(?P<id>[\da-z]{12})'
+ _VALID_URL = r'http://(?:www\.)?mooshare\.biz/(?P<id>[\da-z]{12})'
_TESTS = [
{
diff --git a/youtube_dl/extractor/morningstar.py b/youtube_dl/extractor/morningstar.py
new file mode 100644
index 0000000..320d27b
--- /dev/null
+++ b/youtube_dl/extractor/morningstar.py
@@ -0,0 +1,47 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class MorningstarIE(InfoExtractor):
+ IE_DESC = 'morningstar.com'
+ _VALID_URL = r'https?://(?:www\.)?morningstar\.com/[cC]over/video[cC]enter\.aspx\?id=(?P<id>[0-9]+)'
+ _TEST = {
+ 'url': 'http://www.morningstar.com/cover/videocenter.aspx?id=615869',
+ 'md5': '6c0acface7a787aadc8391e4bbf7b0f5',
+ 'info_dict': {
+ 'id': '615869',
+ 'ext': 'mp4',
+ 'title': 'Get Ahead of the Curve on 2013 Taxes',
+ 'description': "Vanguard's Joel Dickson on managing higher tax rates for high-income earners and fund capital-gain distributions in 2013.",
+ 'thumbnail': r're:^https?://.*m(?:orning)?star\.com/.+thumb\.jpg$'
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+
+ webpage = self._download_webpage(url, video_id)
+ title = self._html_search_regex(
+ r'<h1 id="titleLink">(.*?)</h1>', webpage, 'title')
+ video_url = self._html_search_regex(
+ r'<input type="hidden" id="hidVideoUrl" value="([^"]+)"',
+ webpage, 'video URL')
+ thumbnail = self._html_search_regex(
+ r'<input type="hidden" id="hidSnapshot" value="([^"]+)"',
+ webpage, 'thumbnail', fatal=False)
+ description = self._html_search_regex(
+ r'<div id="mstarDeck".*?>(.*?)</div>',
+ webpage, 'description', fatal=False)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'url': video_url,
+ 'thumbnail': thumbnail,
+ 'description': description,
+ }
diff --git a/youtube_dl/extractor/motorsport.py b/youtube_dl/extractor/motorsport.py
new file mode 100644
index 0000000..7c0ec6a
--- /dev/null
+++ b/youtube_dl/extractor/motorsport.py
@@ -0,0 +1,63 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import hashlib
+import json
+import re
+import time
+
+from .common import InfoExtractor
+from ..utils import (
+ compat_parse_qs,
+ compat_str,
+ int_or_none,
+)
+
+
+class MotorsportIE(InfoExtractor):
+ IE_DESC = 'motorsport.com'
+ _VALID_URL = r'http://www\.motorsport\.com/[^/?#]+/video/(?:[^/?#]+/)(?P<id>[^/]+)/(?:$|[?#])'
+ _TEST = {
+ 'url': 'http://www.motorsport.com/f1/video/main-gallery/red-bull-racing-2014-rules-explained/',
+ 'md5': '5592cb7c5005d9b2c163df5ac3dc04e4',
+ 'info_dict': {
+ 'id': '7063',
+ 'ext': 'mp4',
+ 'title': 'Red Bull Racing: 2014 Rules Explained',
+ 'duration': 207,
+ 'description': 'A new clip from Red Bull sees Daniel Ricciardo and Sebastian Vettel explain the 2014 Formula One regulations – which are arguably the most complex the sport has ever seen.',
+ 'uploader': 'rainiere',
+ 'thumbnail': r're:^http://.*motorsport\.com/.+\.jpg$'
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ display_id = mobj.group('id')
+
+ webpage = self._download_webpage(url, display_id)
+ flashvars_code = self._html_search_regex(
+ r'<embed id="player".*?flashvars="([^"]+)"', webpage, 'flashvars')
+ flashvars = compat_parse_qs(flashvars_code)
+ params = json.loads(flashvars['parameters'][0])
+
+ e = compat_str(int(time.time()) + 24 * 60 * 60)
+ base_video_url = params['location'] + '?e=' + e
+ s = 'h3hg713fh32'
+ h = hashlib.md5((s + base_video_url).encode('utf-8')).hexdigest()
+ video_url = base_video_url + '&h=' + h
+
+ uploader = self._html_search_regex(
+ r'(?s)<span class="label">Video by: </span>(.*?)</a>', webpage,
+ 'uploader', fatal=False)
+
+ return {
+ 'id': params['video_id'],
+ 'display_id': display_id,
+ 'title': params['title'],
+ 'url': video_url,
+ 'description': params.get('description'),
+ 'thumbnail': params.get('main_thumb'),
+ 'duration': int_or_none(params.get('duration')),
+ 'uploader': uploader,
+ }
diff --git a/youtube_dl/extractor/moviezine.py b/youtube_dl/extractor/moviezine.py
new file mode 100644
index 0000000..4314618
--- /dev/null
+++ b/youtube_dl/extractor/moviezine.py
@@ -0,0 +1,45 @@
+# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class MoviezineIE(InfoExtractor):
+ _VALID_URL = r'https?://www\.moviezine\.se/video/(?P<id>[^?#]+)'
+
+ _TEST = {
+ 'url': 'http://www.moviezine.se/video/205866',
+ 'info_dict': {
+ 'id': '205866',
+ 'ext': 'mp4',
+ 'title': 'Oculus - Trailer 1',
+ 'description': 'md5:40cc6790fc81d931850ca9249b40e8a4',
+ 'thumbnail': 're:http://.*\.jpg',
+ },
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+
+ webpage = self._download_webpage(url, video_id)
+ jsplayer = self._download_webpage('http://www.moviezine.se/api/player.js?video=%s' % video_id, video_id, 'Downloading js api player')
+
+ formats =[{
+ 'format_id': 'sd',
+ 'url': self._html_search_regex(r'file: "(.+?)",', jsplayer, 'file'),
+ 'quality': 0,
+ 'ext': 'mp4',
+ }]
+
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': self._search_regex(r'title: "(.+?)",', jsplayer, 'title'),
+ 'thumbnail': self._search_regex(r'image: "(.+?)",', jsplayer, 'image'),
+ 'formats': formats,
+ 'description': self._og_search_description(webpage),
+ }
diff --git a/youtube_dl/extractor/movshare.py b/youtube_dl/extractor/movshare.py
new file mode 100644
index 0000000..4191cf7
--- /dev/null
+++ b/youtube_dl/extractor/movshare.py
@@ -0,0 +1,27 @@
+from __future__ import unicode_literals
+
+from .novamov import NovaMovIE
+
+
+class MovShareIE(NovaMovIE):
+ IE_NAME = 'movshare'
+ IE_DESC = 'MovShare'
+
+ _VALID_URL = NovaMovIE._VALID_URL_TEMPLATE % {'host': 'movshare\.(?:net|sx|ag)'}
+
+ _HOST = 'www.movshare.net'
+
+ _FILE_DELETED_REGEX = r'>This file no longer exists on our servers.<'
+ _TITLE_REGEX = r'<strong>Title:</strong> ([^<]+)</p>'
+ _DESCRIPTION_REGEX = r'<strong>Description:</strong> ([^<]+)</p>'
+
+ _TEST = {
+ 'url': 'http://www.movshare.net/video/559e28be54d96',
+ 'md5': 'abd31a2132947262c50429e1d16c1bfd',
+ 'info_dict': {
+ 'id': '559e28be54d96',
+ 'ext': 'flv',
+ 'title': 'dissapeared image',
+ 'description': 'optical illusion dissapeared image magic illusion',
+ }
+ } \ No newline at end of file
diff --git a/youtube_dl/extractor/mpora.py b/youtube_dl/extractor/mpora.py
index 6a8e2cc..39d6feb 100644
--- a/youtube_dl/extractor/mpora.py
+++ b/youtube_dl/extractor/mpora.py
@@ -4,9 +4,7 @@ import json
import re
from .common import InfoExtractor
-from ..utils import (
- int_or_none,
-)
+from ..utils import int_or_none
class MporaIE(InfoExtractor):
@@ -20,7 +18,7 @@ class MporaIE(InfoExtractor):
'info_dict': {
'title': 'Katy Curd - Winter in the Forest',
'duration': 416,
- 'uploader': 'petenewman',
+ 'uploader': 'Peter Newman Media',
},
}
diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py
index 5447b6c..e5ca41b 100644
--- a/youtube_dl/extractor/mtv.py
+++ b/youtube_dl/extractor/mtv.py
@@ -5,9 +5,12 @@ import re
from .common import InfoExtractor
from ..utils import (
compat_urllib_parse,
+ compat_urllib_request,
ExtractorError,
find_xpath_attr,
fix_xml_ampersands,
+ HEADRequest,
+ unescapeHTML,
url_basename,
RegexNotFoundError,
)
@@ -18,6 +21,7 @@ def _media_xml_tag(tag):
class MTVServicesInfoExtractor(InfoExtractor):
+ _MOBILE_TEMPLATE = None
@staticmethod
def _id_from_uri(uri):
return uri.split(':')[-1]
@@ -39,9 +43,29 @@ class MTVServicesInfoExtractor(InfoExtractor):
else:
return thumb_node.attrib['url']
- def _extract_video_formats(self, mdoc):
- if re.match(r'.*/error_country_block\.swf$', mdoc.find('.//src').text) is not None:
- raise ExtractorError('This video is not available from your country.', expected=True)
+ def _extract_mobile_video_formats(self, mtvn_id):
+ webpage_url = self._MOBILE_TEMPLATE % mtvn_id
+ req = compat_urllib_request.Request(webpage_url)
+ # Otherwise we get a webpage that would execute some javascript
+ req.add_header('Youtubedl-user-agent', 'curl/7')
+ webpage = self._download_webpage(req, mtvn_id,
+ 'Downloading mobile page')
+ metrics_url = unescapeHTML(self._search_regex(r'<a href="(http://metrics.+?)"', webpage, 'url'))
+ req = HEADRequest(metrics_url)
+ response = self._request_webpage(req, mtvn_id, 'Resolving url')
+ url = response.geturl()
+ # Transform the url to get the best quality:
+ url = re.sub(r'.+pxE=mp4', 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=0+_pxK=18639+_pxE=mp4', url, 1)
+ return [{'url': url,'ext': 'mp4'}]
+
+ def _extract_video_formats(self, mdoc, mtvn_id):
+ if re.match(r'.*/(error_country_block\.swf|geoblock\.mp4)$', mdoc.find('.//src').text) is not None:
+ if mtvn_id is not None and self._MOBILE_TEMPLATE is not None:
+ self.to_screen('The normal version is not available from your '
+ 'country, trying with the mobile version')
+ return self._extract_mobile_video_formats(mtvn_id)
+ raise ExtractorError('This video is not available from your country.',
+ expected=True)
formats = []
for rendition in mdoc.findall('.//rendition'):
@@ -56,6 +80,7 @@ class MTVServicesInfoExtractor(InfoExtractor):
})
except (KeyError, TypeError):
raise ExtractorError('Invalid rendition field.')
+ self._sort_formats(formats)
return formats
def _get_video_info(self, itemdoc):
@@ -94,9 +119,16 @@ class MTVServicesInfoExtractor(InfoExtractor):
raise ExtractorError('Could not find video title')
title = title.strip()
+ # This a short id that's used in the webpage urls
+ mtvn_id = None
+ mtvn_id_node = find_xpath_attr(itemdoc, './/{http://search.yahoo.com/mrss/}category',
+ 'scheme', 'urn:mtvn:id')
+ if mtvn_id_node is not None:
+ mtvn_id = mtvn_id_node.text
+
return {
'title': title,
- 'formats': self._extract_video_formats(mediagen_doc),
+ 'formats': self._extract_video_formats(mediagen_doc, mtvn_id),
'id': video_id,
'thumbnail': self._get_thumbnail_url(uri, itemdoc),
'description': description,
diff --git a/youtube_dl/extractor/musicplayon.py b/youtube_dl/extractor/musicplayon.py
new file mode 100644
index 0000000..42d7a82
--- /dev/null
+++ b/youtube_dl/extractor/musicplayon.py
@@ -0,0 +1,75 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import int_or_none
+
+
+class MusicPlayOnIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:.+?\.)?musicplayon\.com/play(?:-touch)?\?(?:v|pl=100&play)=(?P<id>\d+)'
+
+ _TEST = {
+ 'url': 'http://en.musicplayon.com/play?v=433377',
+ 'info_dict': {
+ 'id': '433377',
+ 'ext': 'mp4',
+ 'title': 'Rick Ross - Interview On Chelsea Lately (2014)',
+ 'description': 'Rick Ross Interview On Chelsea Lately',
+ 'duration': 342,
+ 'uploader': 'ultrafish',
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+
+ page = self._download_webpage(url, video_id)
+
+ title = self._og_search_title(page)
+ description = self._og_search_description(page)
+ thumbnail = self._og_search_thumbnail(page)
+ duration = self._html_search_meta('video:duration', page, 'duration', fatal=False)
+ view_count = self._og_search_property('count', page, fatal=False)
+ uploader = self._html_search_regex(
+ r'<div>by&nbsp;<a href="[^"]+" class="purple">([^<]+)</a></div>', page, 'uploader', fatal=False)
+
+ formats = [
+ {
+ 'url': 'http://media0-eu-nl.musicplayon.com/stream-mobile?id=%s&type=.mp4' % video_id,
+ 'ext': 'mp4',
+ }
+ ]
+
+ manifest = self._download_webpage(
+ 'http://en.musicplayon.com/manifest.m3u8?v=%s' % video_id, video_id, 'Downloading manifest')
+
+ for entry in manifest.split('#')[1:]:
+ if entry.startswith('EXT-X-STREAM-INF:'):
+ meta, url, _ = entry.split('\n')
+ params = dict(param.split('=') for param in meta.split(',')[1:])
+ formats.append({
+ 'url': url,
+ 'ext': 'mp4',
+ 'tbr': int(params['BANDWIDTH']),
+ 'width': int(params['RESOLUTION'].split('x')[1]),
+ 'height': int(params['RESOLUTION'].split('x')[-1]),
+ 'format_note': params['NAME'].replace('"', '').strip(),
+ })
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'uploader': uploader,
+ 'duration': int_or_none(duration),
+ 'view_count': int_or_none(view_count),
+ 'formats': formats,
+ } \ No newline at end of file
diff --git a/youtube_dl/extractor/myvideo.py b/youtube_dl/extractor/myvideo.py
index 6d35c78..ccb5959 100644
--- a/youtube_dl/extractor/myvideo.py
+++ b/youtube_dl/extractor/myvideo.py
@@ -1,3 +1,5 @@
+from __future__ import unicode_literals
+
import binascii
import base64
import hashlib
@@ -14,18 +16,16 @@ from ..utils import (
)
-
class MyVideoIE(InfoExtractor):
- """Information Extractor for myvideo.de."""
-
- _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/(?:[^/]+/)?watch/([0-9]+)/([^?/]+).*'
- IE_NAME = u'myvideo'
+ _VALID_URL = r'http://(?:www\.)?myvideo\.de/(?:[^/]+/)?watch/(?P<id>[0-9]+)/[^?/]+.*'
+ IE_NAME = 'myvideo'
_TEST = {
- u'url': u'http://www.myvideo.de/watch/8229274/bowling_fail_or_win',
- u'file': u'8229274.flv',
- u'md5': u'2d2753e8130479ba2cb7e0a37002053e',
- u'info_dict': {
- u"title": u"bowling-fail-or-win"
+ 'url': 'http://www.myvideo.de/watch/8229274/bowling_fail_or_win',
+ 'md5': '2d2753e8130479ba2cb7e0a37002053e',
+ 'info_dict': {
+ 'id': '8229274',
+ 'ext': 'flv',
+ 'title': 'bowling-fail-or-win',
}
}
@@ -53,10 +53,7 @@ class MyVideoIE(InfoExtractor):
def _real_extract(self,url):
mobj = re.match(self._VALID_URL, url)
- if mobj is None:
- raise ExtractorError(u'invalid URL: %s' % url)
-
- video_id = mobj.group(1)
+ video_id = mobj.group('id')
GK = (
b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
@@ -74,37 +71,33 @@ class MyVideoIE(InfoExtractor):
video_url = mobj.group(1) + '.flv'
video_title = self._html_search_regex('<title>([^<]+)</title>',
- webpage, u'title')
-
- video_ext = self._search_regex('[.](.+?)$', video_url, u'extension')
+ webpage, 'title')
- return [{
- 'id': video_id,
- 'url': video_url,
- 'uploader': None,
- 'upload_date': None,
- 'title': video_title,
- 'ext': video_ext,
- }]
+ return {
+ 'id': video_id,
+ 'url': video_url,
+ 'title': video_title,
+ }
mobj = re.search(r'data-video-service="/service/data/video/%s/config' % video_id, webpage)
if mobj is not None:
request = compat_urllib_request.Request('http://www.myvideo.de/service/data/video/%s/config' % video_id, '')
response = self._download_webpage(request, video_id,
- u'Downloading video info')
+ 'Downloading video info')
info = json.loads(base64.b64decode(response).decode('utf-8'))
- return {'id': video_id,
- 'title': info['title'],
- 'url': info['streaming_url'].replace('rtmpe', 'rtmpt'),
- 'play_path': info['filename'],
- 'ext': 'flv',
- 'thumbnail': info['thumbnail'][0]['url'],
- }
+ return {
+ 'id': video_id,
+ 'title': info['title'],
+ 'url': info['streaming_url'].replace('rtmpe', 'rtmpt'),
+ 'play_path': info['filename'],
+ 'ext': 'flv',
+ 'thumbnail': info['thumbnail'][0]['url'],
+ }
# try encxml
mobj = re.search('var flashvars={(.+?)}', webpage)
if mobj is None:
- raise ExtractorError(u'Unable to extract video')
+ raise ExtractorError('Unable to extract video')
params = {}
encxml = ''
@@ -118,7 +111,7 @@ class MyVideoIE(InfoExtractor):
params['domain'] = 'www.myvideo.de'
xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
if 'flash_playertype=MTV' in xmldata_url:
- self._downloader.report_warning(u'avoiding MTV player')
+ self._downloader.report_warning('avoiding MTV player')
xmldata_url = (
'http://www.myvideo.de/dynamic/get_player_video_xml.php'
'?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'
@@ -144,7 +137,7 @@ class MyVideoIE(InfoExtractor):
video_url = compat_urllib_parse.unquote(mobj.group(1))
if 'myvideo2flash' in video_url:
self.report_warning(
- u'Rewriting URL to use unencrypted rtmp:// ...',
+ 'Rewriting URL to use unencrypted rtmp:// ...',
video_id)
video_url = video_url.replace('rtmpe://', 'rtmp://')
@@ -152,39 +145,31 @@ class MyVideoIE(InfoExtractor):
# extract non rtmp videos
mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
if mobj is None:
- raise ExtractorError(u'unable to extract url')
+ raise ExtractorError('unable to extract url')
video_url = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))
- video_file = self._search_regex('source=\'(.*?)\'', dec_data, u'video file')
+ video_file = self._search_regex('source=\'(.*?)\'', dec_data, 'video file')
video_file = compat_urllib_parse.unquote(video_file)
if not video_file.endswith('f4m'):
ppath, prefix = video_file.split('.')
video_playpath = '%s:%s' % (prefix, ppath)
- video_hls_playlist = ''
else:
video_playpath = ''
- video_hls_playlist = (
- video_file
- ).replace('.f4m', '.m3u8')
- video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj')
+ video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, 'swfobj')
video_swfobj = compat_urllib_parse.unquote(video_swfobj)
video_title = self._html_search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
- webpage, u'title')
-
- return [{
- 'id': video_id,
- 'url': video_url,
- 'tc_url': video_url,
- 'uploader': None,
- 'upload_date': None,
- 'title': video_title,
- 'ext': u'flv',
- 'play_path': video_playpath,
- 'video_file': video_file,
- 'video_hls_playlist': video_hls_playlist,
- 'player_url': video_swfobj,
- }]
+ webpage, 'title')
+
+ return {
+ 'id': video_id,
+ 'url': video_url,
+ 'tc_url': video_url,
+ 'title': video_title,
+ 'ext': 'flv',
+ 'play_path': video_playpath,
+ 'player_url': video_swfobj,
+ }
diff --git a/youtube_dl/extractor/naver.py b/youtube_dl/extractor/naver.py
index 4cab306..c0231c1 100644
--- a/youtube_dl/extractor/naver.py
+++ b/youtube_dl/extractor/naver.py
@@ -1,4 +1,6 @@
# encoding: utf-8
+from __future__ import unicode_literals
+
import re
from .common import InfoExtractor
@@ -12,12 +14,13 @@ class NaverIE(InfoExtractor):
_VALID_URL = r'https?://(?:m\.)?tvcast\.naver\.com/v/(?P<id>\d+)'
_TEST = {
- u'url': u'http://tvcast.naver.com/v/81652',
- u'file': u'81652.mp4',
- u'info_dict': {
- u'title': u'[9월 모의고사 해설강의][수학_김상희] 수학 A형 16~20번',
- u'description': u'합격불변의 법칙 메가스터디 | 메가스터디 수학 김상희 선생님이 9월 모의고사 수학A형 16번에서 20번까지 해설강의를 공개합니다.',
- u'upload_date': u'20130903',
+ 'url': 'http://tvcast.naver.com/v/81652',
+ 'info_dict': {
+ 'id': '81652',
+ 'ext': 'mp4',
+ 'title': '[9월 모의고사 해설강의][수학_김상희] 수학 A형 16~20번',
+ 'description': '합격불변의 법칙 메가스터디 | 메가스터디 수학 김상희 선생님이 9월 모의고사 수학A형 16번에서 20번까지 해설강의를 공개합니다.',
+ 'upload_date': '20130903',
},
}
@@ -28,7 +31,7 @@ class NaverIE(InfoExtractor):
m_id = re.search(r'var rmcPlayer = new nhn.rmcnmv.RMCVideoPlayer\("(.+?)", "(.+?)"',
webpage)
if m_id is None:
- raise ExtractorError(u'couldn\'t extract vid and key')
+ raise ExtractorError('couldn\'t extract vid and key')
vid = m_id.group(1)
key = m_id.group(2)
query = compat_urllib_parse.urlencode({'vid': vid, 'inKey': key,})
@@ -39,22 +42,27 @@ class NaverIE(InfoExtractor):
})
info = self._download_xml(
'http://serviceapi.rmcnmv.naver.com/flash/videoInfo.nhn?' + query,
- video_id, u'Downloading video info')
+ video_id, 'Downloading video info')
urls = self._download_xml(
'http://serviceapi.rmcnmv.naver.com/flash/playableEncodingOption.nhn?' + query_urls,
- video_id, u'Downloading video formats info')
+ video_id, 'Downloading video formats info')
formats = []
for format_el in urls.findall('EncodingOptions/EncodingOption'):
domain = format_el.find('Domain').text
- if domain.startswith('rtmp'):
- continue
- formats.append({
+ f = {
'url': domain + format_el.find('uri').text,
'ext': 'mp4',
'width': int(format_el.find('width').text),
'height': int(format_el.find('height').text),
- })
+ }
+ if domain.startswith('rtmp'):
+ f.update({
+ 'ext': 'flv',
+ 'rtmp_protocol': '1', # rtmpt
+ })
+ formats.append(f)
+ self._sort_formats(formats)
return {
'id': video_id,
diff --git a/youtube_dl/extractor/nba.py b/youtube_dl/extractor/nba.py
index 7e42161..633b42f 100644
--- a/youtube_dl/extractor/nba.py
+++ b/youtube_dl/extractor/nba.py
@@ -6,12 +6,13 @@ from .common import InfoExtractor
class NBAIE(InfoExtractor):
- _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
+ _VALID_URL = r'https?://(?:watch\.|www\.)?nba\.com/(?:nba/)?video(?P<id>/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
_TEST = {
'url': 'http://www.nba.com/video/games/nets/2012/12/04/0021200253-okc-bkn-recap.nba/index.html',
- 'file': u'0021200253-okc-bkn-recap.nba.mp4',
'md5': u'c0edcfc37607344e2ff8f13c378c88a4',
'info_dict': {
+ 'id': '0021200253-okc-bkn-recap.nba',
+ 'ext': 'mp4',
'description': 'Kevin Durant scores 32 points and dishes out six assists as the Thunder beat the Nets in Brooklyn.',
'title': 'Thunder vs. Nets',
},
@@ -19,7 +20,7 @@ class NBAIE(InfoExtractor):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group(1)
+ video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
@@ -33,7 +34,6 @@ class NBAIE(InfoExtractor):
return {
'id': shortened_video_id,
'url': video_url,
- 'ext': 'mp4',
'title': title,
'description': description,
}
diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py
index e8bbfff..aa34665 100644
--- a/youtube_dl/extractor/nbc.py
+++ b/youtube_dl/extractor/nbc.py
@@ -1,32 +1,99 @@
+from __future__ import unicode_literals
+
import re
+import json
from .common import InfoExtractor
from ..utils import find_xpath_attr, compat_str
-class NBCNewsIE(InfoExtractor):
- _VALID_URL = r'https?://www\.nbcnews\.com/video/.+?/(?P<id>\d+)'
+class NBCIE(InfoExtractor):
+ _VALID_URL = r'http://www\.nbc\.com/[^/]+/video/[^/]+/(?P<id>n?\d+)'
_TEST = {
- u'url': u'http://www.nbcnews.com/video/nbc-news/52753292',
- u'file': u'52753292.flv',
- u'md5': u'47abaac93c6eaf9ad37ee6c4463a5179',
- u'info_dict': {
- u'title': u'Crew emerges after four-month Mars food study',
- u'description': u'md5:24e632ffac72b35f8b67a12d1b6ddfc1',
+ 'url': 'http://www.nbc.com/chicago-fire/video/i-am-a-firefighter/2734188',
+ 'md5': '54d0fbc33e0b853a65d7b4de5c06d64e',
+ 'info_dict': {
+ 'id': 'u1RInQZRN7QJ',
+ 'ext': 'flv',
+ 'title': 'I Am a Firefighter',
+ 'description': 'An emergency puts Dawson\'sf irefighter skills to the ultimate test in this four-part digital series.',
},
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
- all_info = self._download_xml('http://www.nbcnews.com/id/%s/displaymode/1219' % video_id, video_id)
- info = all_info.find('video')
+ webpage = self._download_webpage(url, video_id)
+ theplatform_url = self._search_regex('class="video-player video-player-full" data-mpx-url="(.*?)"', webpage, 'theplatform url')
+ if theplatform_url.startswith('//'):
+ theplatform_url = 'http:' + theplatform_url
+ return self.url_result(theplatform_url)
+
+
+class NBCNewsIE(InfoExtractor):
+ _VALID_URL = r'''(?x)https?://www\.nbcnews\.com/
+ ((video/.+?/(?P<id>\d+))|
+ (feature/[^/]+/(?P<title>.+)))
+ '''
- return {'id': video_id,
+ _TESTS = [
+ {
+ 'url': 'http://www.nbcnews.com/video/nbc-news/52753292',
+ 'md5': '47abaac93c6eaf9ad37ee6c4463a5179',
+ 'info_dict': {
+ 'id': '52753292',
+ 'ext': 'flv',
+ 'title': 'Crew emerges after four-month Mars food study',
+ 'description': 'md5:24e632ffac72b35f8b67a12d1b6ddfc1',
+ },
+ },
+ {
+ 'url': 'http://www.nbcnews.com/feature/edward-snowden-interview/how-twitter-reacted-snowden-interview-n117236',
+ 'md5': 'b2421750c9f260783721d898f4c42063',
+ 'info_dict': {
+ 'id': 'I1wpAI_zmhsQ',
+ 'ext': 'flv',
+ 'title': 'How Twitter Reacted To The Snowden Interview',
+ 'description': 'md5:65a0bd5d76fe114f3c2727aa3a81fe64',
+ },
+ 'add_ie': ['ThePlatform'],
+ },
+ ]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+ if video_id is not None:
+ all_info = self._download_xml('http://www.nbcnews.com/id/%s/displaymode/1219' % video_id, video_id)
+ info = all_info.find('video')
+
+ return {
+ 'id': video_id,
'title': info.find('headline').text,
'ext': 'flv',
'url': find_xpath_attr(info, 'media', 'type', 'flashVideo').text,
'description': compat_str(info.find('caption').text),
'thumbnail': find_xpath_attr(info, 'media', 'type', 'thumbnail').text,
- }
+ }
+ else:
+ # "feature" pages use theplatform.com
+ title = mobj.group('title')
+ webpage = self._download_webpage(url, title)
+ bootstrap_json = self._search_regex(
+ r'var bootstrapJson = ({.+})\s*$', webpage, 'bootstrap json',
+ flags=re.MULTILINE)
+ bootstrap = json.loads(bootstrap_json)
+ info = bootstrap['results'][0]['video']
+ playlist_url = info['fallbackPlaylistUrl'] + '?form=MPXNBCNewsAPI'
+ mpxid = info['mpxId']
+ all_videos = self._download_json(playlist_url, title)['videos']
+ # The response contains additional videos
+ info = next(v for v in all_videos if v['mpxId'] == mpxid)
+
+ return {
+ '_type': 'url',
+ # We get the best quality video
+ 'url': info['videoAssets'][-1]['publicUrl'],
+ 'ie_key': 'ThePlatform',
+ }
diff --git a/youtube_dl/extractor/ndr.py b/youtube_dl/extractor/ndr.py
index 0650f95..3d6096e 100644
--- a/youtube_dl/extractor/ndr.py
+++ b/youtube_dl/extractor/ndr.py
@@ -4,7 +4,11 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import ExtractorError
+from ..utils import (
+ ExtractorError,
+ int_or_none,
+ qualities,
+)
class NDRIE(InfoExtractor):
@@ -45,17 +49,16 @@ class NDRIE(InfoExtractor):
page = self._download_webpage(url, video_id, 'Downloading page')
- title = self._og_search_title(page)
+ title = self._og_search_title(page).strip()
description = self._og_search_description(page)
+ if description:
+ description = description.strip()
- mobj = re.search(
- r'<div class="duration"><span class="min">(?P<minutes>\d+)</span>:<span class="sec">(?P<seconds>\d+)</span></div>',
- page)
- duration = int(mobj.group('minutes')) * 60 + int(mobj.group('seconds')) if mobj else None
+ duration = int_or_none(self._html_search_regex(r'duration: (\d+),\n', page, 'duration', fatal=False))
formats = []
- mp3_url = re.search(r'''{src:'(?P<audio>[^']+)', type:"audio/mp3"},''', page)
+ mp3_url = re.search(r'''\{src:'(?P<audio>[^']+)', type:"audio/mp3"},''', page)
if mp3_url:
formats.append({
'url': mp3_url.group('audio'),
@@ -64,13 +67,15 @@ class NDRIE(InfoExtractor):
thumbnail = None
- video_url = re.search(r'''3: {src:'(?P<video>.+?)\.hi\.mp4', type:"video/mp4"},''', page)
+ video_url = re.search(r'''3: \{src:'(?P<video>.+?)\.hi\.mp4', type:"video/mp4"},''', page)
if video_url:
- thumbnail = self._html_search_regex(r'(?m)title: "NDR PLAYER",\s*poster: "([^"]+)",',
- page, 'thumbnail', fatal=False)
- if thumbnail:
- thumbnail = 'http://www.ndr.de' + thumbnail
- for format_id in ['lo', 'hi', 'hq']:
+ thumbnails = re.findall(r'''\d+: \{src: "([^"]+)"(?: \|\| '[^']+')?, quality: '([^']+)'}''', page)
+ if thumbnails:
+ quality_key = qualities(['xs', 's', 'm', 'l', 'xl'])
+ largest = max(thumbnails, key=lambda thumb: quality_key(thumb[1]))
+ thumbnail = 'http://www.ndr.de' + largest[0]
+
+ for format_id in 'lo', 'hi', 'hq':
formats.append({
'url': '%s.%s.mp4' % (video_url.group('video'), format_id),
'format_id': format_id,
diff --git a/youtube_dl/extractor/newstube.py b/youtube_dl/extractor/newstube.py
new file mode 100644
index 0000000..2fd5b8f
--- /dev/null
+++ b/youtube_dl/extractor/newstube.py
@@ -0,0 +1,87 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class NewstubeIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?newstube\.ru/media/(?P<id>.+)'
+ _TEST = {
+ 'url': 'http://newstube.ru/media/na-korable-progress-prodolzhaetsya-testirovanie-sistemy-kurs',
+ 'info_dict': {
+ 'id': 'd156a237-a6e9-4111-a682-039995f721f1',
+ 'ext': 'flv',
+ 'title': 'На корабле «Прогресс» продолжается тестирование системы «Курс»',
+ 'description': 'md5:d0cbe7b4a6f600552617e48548d5dc77',
+ 'duration': 20.04,
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+
+ page = self._download_webpage(url, video_id, 'Downloading page')
+
+ video_guid = self._html_search_regex(
+ r'<meta property="og:video" content="https?://(?:www\.)?newstube\.ru/freshplayer\.swf\?guid=(?P<guid>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})',
+ page, 'video GUID')
+
+ player = self._download_xml(
+ 'http://p.newstube.ru/v2/player.asmx/GetAutoPlayInfo6?state=&url=%s&sessionId=&id=%s&placement=profile&location=n2' % (url, video_guid),
+ video_guid, 'Downloading player XML')
+
+ def ns(s):
+ return s.replace('/', '/%(ns)s') % {'ns': '{http://app1.newstube.ru/N2SiteWS/player.asmx}'}
+
+ session_id = player.find(ns('./SessionId')).text
+ media_info = player.find(ns('./Medias/MediaInfo'))
+ title = media_info.find(ns('./Name')).text
+ description = self._og_search_description(page)
+ thumbnail = media_info.find(ns('./KeyFrame')).text
+ duration = int(media_info.find(ns('./Duration')).text) / 1000.0
+
+ formats = []
+
+ for stream_info in media_info.findall(ns('./Streams/StreamInfo')):
+ media_location = stream_info.find(ns('./MediaLocation'))
+ if media_location is None:
+ continue
+
+ server = media_location.find(ns('./Server')).text
+ app = media_location.find(ns('./App')).text
+ media_id = stream_info.find(ns('./Id')).text
+ quality_id = stream_info.find(ns('./QualityId')).text
+ name = stream_info.find(ns('./Name')).text
+ width = int(stream_info.find(ns('./Width')).text)
+ height = int(stream_info.find(ns('./Height')).text)
+
+ formats.append({
+ 'url': 'rtmp://%s/%s' % (server, app),
+ 'app': app,
+ 'play_path': '01/%s' % video_guid.upper(),
+ 'rtmp_conn': ['S:%s' % session_id, 'S:%s' % media_id, 'S:n2'],
+ 'page_url': url,
+ 'ext': 'flv',
+ 'format_id': quality_id,
+ 'format_note': name,
+ 'width': width,
+ 'height': height,
+ })
+
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_guid,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'duration': duration,
+ 'formats': formats,
+ } \ No newline at end of file
diff --git a/youtube_dl/extractor/nfb.py b/youtube_dl/extractor/nfb.py
index e88566c..ba7b77a 100644
--- a/youtube_dl/extractor/nfb.py
+++ b/youtube_dl/extractor/nfb.py
@@ -73,14 +73,16 @@ class NFBIE(InfoExtractor):
title = media.find('title').text
description = media.find('description').text
# It seems assets always go from lower to better quality, so no need to sort
- formats = [{
- 'url': x.find('default/streamerURI').text,
- 'app': x.find('default/streamerURI').text.split('/', 3)[3],
- 'play_path': x.find('default/url').text,
- 'rtmp_live': False,
- 'ext': 'mp4',
- 'format_id': x.get('quality'),
- } for x in media.findall('assets/asset')]
+ for asset in media.findall('assets/asset'):
+ for x in asset:
+ formats.append({
+ 'url': x.find('streamerURI').text,
+ 'app': x.find('streamerURI').text.split('/', 3)[3],
+ 'play_path': x.find('url').text,
+ 'rtmp_live': False,
+ 'ext': 'mp4',
+ 'format_id': '%s-%s' % (x.tag, asset.get('quality')),
+ })
return {
'id': video_id,
diff --git a/youtube_dl/extractor/niconico.py b/youtube_dl/extractor/niconico.py
index 4677431..517a725 100644
--- a/youtube_dl/extractor/niconico.py
+++ b/youtube_dl/extractor/niconico.py
@@ -1,12 +1,10 @@
# encoding: utf-8
+from __future__ import unicode_literals
import re
-import socket
from .common import InfoExtractor
from ..utils import (
- compat_http_client,
- compat_urllib_error,
compat_urllib_parse,
compat_urllib_request,
compat_urlparse,
@@ -18,57 +16,54 @@ from ..utils import (
class NiconicoIE(InfoExtractor):
- IE_NAME = u'niconico'
- IE_DESC = u'ニコニコ動画'
+ IE_NAME = 'niconico'
+ IE_DESC = 'ニコニコ動画'
_TEST = {
- u'url': u'http://www.nicovideo.jp/watch/sm22312215',
- u'file': u'sm22312215.mp4',
- u'md5': u'd1a75c0823e2f629128c43e1212760f9',
- u'info_dict': {
- u'title': u'Big Buck Bunny',
- u'uploader': u'takuya0301',
- u'uploader_id': u'2698420',
- u'upload_date': u'20131123',
- u'description': u'(c) copyright 2008, Blender Foundation / www.bigbuckbunny.org',
+ 'url': 'http://www.nicovideo.jp/watch/sm22312215',
+ 'md5': 'd1a75c0823e2f629128c43e1212760f9',
+ 'info_dict': {
+ 'id': 'sm22312215',
+ 'ext': 'mp4',
+ 'title': 'Big Buck Bunny',
+ 'uploader': 'takuya0301',
+ 'uploader_id': '2698420',
+ 'upload_date': '20131123',
+ 'description': '(c) copyright 2008, Blender Foundation / www.bigbuckbunny.org',
},
- u'params': {
- u'username': u'ydl.niconico@gmail.com',
- u'password': u'youtube-dl',
+ 'params': {
+ 'username': 'ydl.niconico@gmail.com',
+ 'password': 'youtube-dl',
},
}
_VALID_URL = r'^https?://(?:www\.|secure\.)?nicovideo\.jp/watch/([a-z][a-z][0-9]+)(?:.*)$'
_NETRC_MACHINE = 'niconico'
- # If True it will raise an error if no login info is provided
- _LOGIN_REQUIRED = True
def _real_initialize(self):
self._login()
def _login(self):
(username, password) = self._get_login_info()
- # No authentication to be performed
if username is None:
- if self._LOGIN_REQUIRED:
- raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
- return False
+ # Login is required
+ raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
# Log in
login_form_strs = {
- u'mail': username,
- u'password': password,
+ 'mail': username,
+ 'password': password,
}
# Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
# chokes on unicode
- login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
+ login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
login_data = compat_urllib_parse.urlencode(login_form).encode('utf-8')
request = compat_urllib_request.Request(
- u'https://secure.nicovideo.jp/secure/login', login_data)
+ 'https://secure.nicovideo.jp/secure/login', login_data)
login_results = self._download_webpage(
- request, u'', note=u'Logging in', errnote=u'Unable to log in')
+ request, None, note='Logging in', errnote='Unable to log in')
if re.search(r'(?i)<h1 class="mb8p4">Log in error</h1>', login_results) is not None:
- self._downloader.report_warning(u'unable to log in: bad username or password')
+ self._downloader.report_warning('unable to log in: bad username or password')
return False
return True
@@ -82,12 +77,12 @@ class NiconicoIE(InfoExtractor):
video_info = self._download_xml(
'http://ext.nicovideo.jp/api/getthumbinfo/' + video_id, video_id,
- note=u'Downloading video info page')
+ note='Downloading video info page')
# Get flv info
flv_info_webpage = self._download_webpage(
- u'http://flapi.nicovideo.jp/api/getflv?v=' + video_id,
- video_id, u'Downloading flv info')
+ 'http://flapi.nicovideo.jp/api/getflv?v=' + video_id,
+ video_id, 'Downloading flv info')
video_real_url = compat_urlparse.parse_qs(flv_info_webpage)['url'][0]
# Start extracting information
@@ -106,22 +101,22 @@ class NiconicoIE(InfoExtractor):
url = 'http://seiga.nicovideo.jp/api/user/info?id=' + video_uploader_id
try:
user_info = self._download_xml(
- url, video_id, note=u'Downloading user information')
+ url, video_id, note='Downloading user information')
video_uploader = user_info.find('.//nickname').text
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- self._downloader.report_warning(u'Unable to download user info webpage: %s' % compat_str(err))
+ except ExtractorError as err:
+ self._downloader.report_warning('Unable to download user info webpage: %s' % compat_str(err))
return {
- 'id': video_id,
- 'url': video_real_url,
- 'title': video_title,
- 'ext': video_extension,
- 'format': video_format,
- 'thumbnail': video_thumbnail,
+ 'id': video_id,
+ 'url': video_real_url,
+ 'title': video_title,
+ 'ext': video_extension,
+ 'format': video_format,
+ 'thumbnail': video_thumbnail,
'description': video_description,
- 'uploader': video_uploader,
+ 'uploader': video_uploader,
'upload_date': video_upload_date,
'uploader_id': video_uploader_id,
- 'view_count': video_view_count,
+ 'view_count': video_view_count,
'webpage_url': video_webpage_url,
}
diff --git a/youtube_dl/extractor/ninegag.py b/youtube_dl/extractor/ninegag.py
index 2b7236b..c2e7b67 100644
--- a/youtube_dl/extractor/ninegag.py
+++ b/youtube_dl/extractor/ninegag.py
@@ -1,45 +1,68 @@
from __future__ import unicode_literals
-import json
import re
+import json
from .common import InfoExtractor
+from ..utils import str_to_int
class NineGagIE(InfoExtractor):
IE_NAME = '9gag'
- _VALID_URL = r'^https?://(?:www\.)?9gag\.tv/v/(?P<id>[0-9]+)'
+ _VALID_URL = r'''(?x)^https?://(?:www\.)?9gag\.tv/
+ (?:
+ v/(?P<numid>[0-9]+)|
+ p/(?P<id>[a-zA-Z0-9]+)/(?P<display_id>[^?#/]+)
+ )
+ '''
- _TEST = {
+ _TESTS = [{
"url": "http://9gag.tv/v/1912",
- "file": "1912.mp4",
"info_dict": {
+ "id": "1912",
+ "ext": "mp4",
"description": "This 3-minute video will make you smile and then make you feel untalented and insignificant. Anyway, you should share this awesomeness. (Thanks, Dino!)",
- "title": "\"People Are Awesome 2013\" Is Absolutely Awesome"
+ "title": "\"People Are Awesome 2013\" Is Absolutely Awesome",
+ "view_count": int,
+ "thumbnail": "re:^https?://",
},
'add_ie': ['Youtube']
- }
+ },
+ {
+ 'url': 'http://9gag.tv/p/KklwM/alternate-banned-opening-scene-of-gravity?ref=fsidebar',
+ 'info_dict': {
+ 'id': 'KklwM',
+ 'ext': 'mp4',
+ 'display_id': 'alternate-banned-opening-scene-of-gravity',
+ "description": "While Gravity was a pretty awesome movie already, YouTuber Krishna Shenoi came up with a way to improve upon it, introducing a much better solution to Sandra Bullock's seemingly endless tumble in space. The ending is priceless.",
+ 'title': "Banned Opening Scene Of \"Gravity\" That Changes The Whole Movie",
+ },
+ }]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = mobj.group('numid') or mobj.group('id')
+ display_id = mobj.group('display_id') or video_id
+
+ webpage = self._download_webpage(url, display_id)
- webpage = self._download_webpage(url, video_id)
- data_json = self._html_search_regex(r'''(?x)
- <div\s*id="tv-video"\s*data-video-source="youtube"\s*
- data-video-meta="([^"]+)"''', webpage, 'video metadata')
+ post_view = json.loads(self._html_search_regex(
+ r'var postView = new app\.PostView\({\s*post:\s*({.+?}),', webpage, 'post view'))
- data = json.loads(data_json)
+ youtube_id = post_view['videoExternalId']
+ title = post_view['title']
+ description = post_view['description']
+ view_count = str_to_int(post_view['externalView'])
+ thumbnail = post_view.get('thumbnail_700w') or post_view.get('ogImageUrl') or post_view.get('thumbnail_300w')
return {
'_type': 'url_transparent',
- 'url': data['youtubeVideoId'],
+ 'url': youtube_id,
'ie_key': 'Youtube',
'id': video_id,
- 'title': data['title'],
- 'description': data['description'],
- 'view_count': int(data['view_count']),
- 'like_count': int(data['statistic']['like']),
- 'dislike_count': int(data['statistic']['dislike']),
- 'thumbnail': data['thumbnail_url'],
+ 'display_id': display_id,
+ 'title': title,
+ 'description': description,
+ 'view_count': view_count,
+ 'thumbnail': thumbnail,
}
diff --git a/youtube_dl/extractor/noco.py b/youtube_dl/extractor/noco.py
new file mode 100644
index 0000000..d451cd1
--- /dev/null
+++ b/youtube_dl/extractor/noco.py
@@ -0,0 +1,106 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ unified_strdate,
+ compat_str,
+)
+
+
+class NocoIE(InfoExtractor):
+ _VALID_URL = r'http://(?:(?:www\.)?noco\.tv/emission/|player\.noco\.tv/\?idvideo=)(?P<id>\d+)'
+
+ _TEST = {
+ 'url': 'http://noco.tv/emission/11538/nolife/ami-ami-idol-hello-france/',
+ 'md5': '0a993f0058ddbcd902630b2047ef710e',
+ 'info_dict': {
+ 'id': '11538',
+ 'ext': 'mp4',
+ 'title': 'Ami Ami Idol - Hello! France',
+ 'description': 'md5:4eaab46ab68fa4197a317a88a53d3b86',
+ 'upload_date': '20140412',
+ 'uploader': 'Nolife',
+ 'uploader_id': 'NOL',
+ 'duration': 2851.2,
+ },
+ 'skip': 'Requires noco account',
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+
+ medias = self._download_json(
+ 'http://api.noco.tv/1.0/video/medias/%s' % video_id, video_id, 'Downloading video JSON')
+
+ formats = []
+
+ for fmt in medias['fr']['video_list']['default']['quality_list']:
+ format_id = fmt['quality_key']
+
+ file = self._download_json(
+ 'http://api.noco.tv/1.0/video/file/%s/fr/%s' % (format_id.lower(), video_id),
+ video_id, 'Downloading %s video JSON' % format_id)
+
+ file_url = file['file']
+ if not file_url:
+ continue
+
+ if file_url == 'forbidden':
+ raise ExtractorError(
+ '%s returned error: %s - %s' % (
+ self.IE_NAME, file['popmessage']['title'], file['popmessage']['message']),
+ expected=True)
+
+ formats.append({
+ 'url': file_url,
+ 'format_id': format_id,
+ 'width': fmt['res_width'],
+ 'height': fmt['res_lines'],
+ 'abr': fmt['audiobitrate'],
+ 'vbr': fmt['videobitrate'],
+ 'filesize': fmt['filesize'],
+ 'format_note': fmt['quality_name'],
+ 'preference': fmt['priority'],
+ })
+
+ self._sort_formats(formats)
+
+ show = self._download_json(
+ 'http://api.noco.tv/1.0/shows/show/%s' % video_id, video_id, 'Downloading show JSON')[0]
+
+ upload_date = unified_strdate(show['indexed'])
+ uploader = show['partner_name']
+ uploader_id = show['partner_key']
+ duration = show['duration_ms'] / 1000.0
+ thumbnail = show['screenshot']
+
+ episode = show.get('show_TT') or show.get('show_OT')
+ family = show.get('family_TT') or show.get('family_OT')
+ episode_number = show.get('episode_number')
+
+ title = ''
+ if family:
+ title += family
+ if episode_number:
+ title += ' #' + compat_str(episode_number)
+ if episode:
+ title += ' - ' + episode
+
+ description = show.get('show_resume') or show.get('family_resume')
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'upload_date': upload_date,
+ 'uploader': uploader,
+ 'uploader_id': uploader_id,
+ 'duration': duration,
+ 'formats': formats,
+ } \ No newline at end of file
diff --git a/youtube_dl/extractor/normalboots.py b/youtube_dl/extractor/normalboots.py
index 81b7855..25e71a5 100644
--- a/youtube_dl/extractor/normalboots.py
+++ b/youtube_dl/extractor/normalboots.py
@@ -1,61 +1,51 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
import re
from .common import InfoExtractor
from ..utils import (
- ExtractorError,
unified_strdate,
)
+
class NormalbootsIE(InfoExtractor):
- _VALID_URL = r'(?:http://)?(?:www\.)?normalboots\.com/video/(?P<videoid>[0-9a-z-]*)/?$'
+ _VALID_URL = r'http://(?:www\.)?normalboots\.com/video/(?P<videoid>[0-9a-z-]*)/?$'
_TEST = {
- u'url': u'http://normalboots.com/video/home-alone-games-jontron/',
- u'file': u'home-alone-games-jontron.mp4',
- u'md5': u'8bf6de238915dd501105b44ef5f1e0f6',
- u'info_dict': {
- u'title': u'Home Alone Games - JonTron - NormalBoots',
- u'description': u'Jon is late for Christmas. Typical. Thanks to: Paul Ritchey for Co-Writing/Filming: http://www.youtube.com/user/ContinueShow Michael Azzi for Christmas Intro Animation: http://michafrar.tumblr.com/ Jerrod Waters for Christmas Intro Music: http://www.youtube.com/user/xXJerryTerryXx Casey Ormond for \u2018Tense Battle Theme\u2019:\xa0http://www.youtube.com/Kiamet/',
- u'uploader': u'JonTron',
- u'upload_date': u'20140125',
+ 'url': 'http://normalboots.com/video/home-alone-games-jontron/',
+ 'md5': '8bf6de238915dd501105b44ef5f1e0f6',
+ 'info_dict': {
+ 'id': 'home-alone-games-jontron',
+ 'ext': 'mp4',
+ 'title': 'Home Alone Games - JonTron - NormalBoots',
+ 'description': 'Jon is late for Christmas. Typical. Thanks to: Paul Ritchey for Co-Writing/Filming: http://www.youtube.com/user/ContinueShow Michael Azzi for Christmas Intro Animation: http://michafrar.tumblr.com/ Jerrod Waters for Christmas Intro Music: http://www.youtube.com/user/xXJerryTerryXx Casey Ormond for ‘Tense Battle Theme’:\xa0http://www.youtube.com/Kiamet/',
+ 'uploader': 'JonTron',
+ 'upload_date': '20140125',
}
}
-
+
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- if mobj is None:
- raise ExtractorError(u'Invalid URL: %s' % url)
video_id = mobj.group('videoid')
-
- info = {
- 'id': video_id,
- 'uploader': None,
- 'upload_date': None,
- }
-
- if url[:4] != 'http':
- url = 'http://' + url
-
+
webpage = self._download_webpage(url, video_id)
- video_title = self._og_search_title(webpage)
- video_description = self._og_search_description(webpage)
- video_thumbnail = self._og_search_thumbnail(webpage)
video_uploader = self._html_search_regex(r'Posted\sby\s<a\shref="[A-Za-z0-9/]*">(?P<uploader>[A-Za-z]*)\s</a>',
webpage, 'uploader')
- raw_upload_date = self._html_search_regex('<span style="text-transform:uppercase; font-size:inherit;">[A-Za-z]+, (?P<date>.*)</span>',
+ raw_upload_date = self._html_search_regex('<span style="text-transform:uppercase; font-size:inherit;">[A-Za-z]+, (?P<date>.*)</span>',
webpage, 'date')
video_upload_date = unified_strdate(raw_upload_date)
- video_upload_date = unified_strdate(raw_upload_date)
-
+
player_url = self._html_search_regex(r'<iframe\swidth="[0-9]+"\sheight="[0-9]+"\ssrc="(?P<url>[\S]+)"', webpage, 'url')
player_page = self._download_webpage(player_url, video_id)
- video_url = u'http://player.screenwavemedia.com/' + self._html_search_regex(r"'file':\s'(?P<file>[0-9A-Za-z-_\.]+)'", player_page, 'file')
-
- info['url'] = video_url
- info['title'] = video_title
- info['description'] = video_description
- info['thumbnail'] = video_thumbnail
- info['uploader'] = video_uploader
- info['upload_date'] = video_upload_date
-
- return info
+ video_url = self._html_search_regex(r"file:\s'(?P<file>[^']+\.mp4)'", player_page, 'file')
+
+ return {
+ 'id': video_id,
+ 'url': video_url,
+ 'title': self._og_search_title(webpage),
+ 'description': self._og_search_description(webpage),
+ 'thumbnail': self._og_search_thumbnail(webpage),
+ 'uploader': video_uploader,
+ 'upload_date': video_upload_date,
+ }
diff --git a/youtube_dl/extractor/novamov.py b/youtube_dl/extractor/novamov.py
index 6af8d93..2e7ab1e 100644
--- a/youtube_dl/extractor/novamov.py
+++ b/youtube_dl/extractor/novamov.py
@@ -9,14 +9,26 @@ from ..utils import (
)
-class NovamovIE(InfoExtractor):
- _VALID_URL = r'http://(?:(?:www\.)?novamov\.com/video/|(?:(?:embed|www)\.)novamov\.com/embed\.php\?v=)(?P<videoid>[a-z\d]{13})'
+class NovaMovIE(InfoExtractor):
+ IE_NAME = 'novamov'
+ IE_DESC = 'NovaMov'
+
+ _VALID_URL_TEMPLATE = r'http://(?:(?:www\.)?%(host)s/(?:file|video)/|(?:(?:embed|www)\.)%(host)s/embed\.php\?(?:.*?&)?v=)(?P<id>[a-z\d]{13})'
+ _VALID_URL = _VALID_URL_TEMPLATE % {'host': 'novamov\.com'}
+
+ _HOST = 'www.novamov.com'
+
+ _FILE_DELETED_REGEX = r'This file no longer exists on our servers!</h2>'
+ _FILEKEY_REGEX = r'flashvars\.filekey="(?P<filekey>[^"]+)";'
+ _TITLE_REGEX = r'(?s)<div class="v_tab blockborder rounded5" id="v_tab1">\s*<h3>([^<]+)</h3>'
+ _DESCRIPTION_REGEX = r'(?s)<div class="v_tab blockborder rounded5" id="v_tab1">\s*<h3>[^<]+</h3><p>([^<]+)</p>'
_TEST = {
'url': 'http://www.novamov.com/video/4rurhn9x446jj',
- 'file': '4rurhn9x446jj.flv',
'md5': '7205f346a52bbeba427603ba10d4b935',
'info_dict': {
+ 'id': '4rurhn9x446jj',
+ 'ext': 'flv',
'title': 'search engine optimization',
'description': 'search engine optimization is used to rank the web page in the google search engine'
},
@@ -25,33 +37,27 @@ class NovamovIE(InfoExtractor):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('videoid')
-
- page = self._download_webpage('http://www.novamov.com/video/%s' % video_id,
- video_id, 'Downloading video page')
+ video_id = mobj.group('id')
- if re.search(r'This file no longer exists on our servers!</h2>', page) is not None:
- raise ExtractorError(u'Video %s does not exist' % video_id, expected=True)
+ page = self._download_webpage(
+ 'http://%s/video/%s' % (self._HOST, video_id), video_id, 'Downloading video page')
- filekey = self._search_regex(
- r'flashvars\.filekey="(?P<filekey>[^"]+)";', page, 'filekey')
+ if re.search(self._FILE_DELETED_REGEX, page) is not None:
+ raise ExtractorError('Video %s does not exist' % video_id, expected=True)
- title = self._html_search_regex(
- r'(?s)<div class="v_tab blockborder rounded5" id="v_tab1">\s*<h3>([^<]+)</h3>',
- page, 'title', fatal=False)
+ filekey = self._search_regex(self._FILEKEY_REGEX, page, 'filekey')
- description = self._html_search_regex(
- r'(?s)<div class="v_tab blockborder rounded5" id="v_tab1">\s*<h3>[^<]+</h3><p>([^<]+)</p>',
- page, 'description', fatal=False)
+ title = self._html_search_regex(self._TITLE_REGEX, page, 'title', fatal=False)
+ description = self._html_search_regex(self._DESCRIPTION_REGEX, page, 'description', default='', fatal=False)
api_response = self._download_webpage(
- 'http://www.novamov.com/api/player.api.php?key=%s&file=%s' % (filekey, video_id),
- video_id, 'Downloading video api response')
+ 'http://%s/api/player.api.php?key=%s&file=%s' % (self._HOST, filekey, video_id), video_id,
+ 'Downloading video api response')
response = compat_urlparse.parse_qs(api_response)
if 'error_msg' in response:
- raise ExtractorError('novamov returned error: %s' % response['error_msg'][0], expected=True)
+ raise ExtractorError('%s returned error: %s' % (self.IE_NAME, response['error_msg'][0]), expected=True)
video_url = response['url'][0]
@@ -60,4 +66,4 @@ class NovamovIE(InfoExtractor):
'url': video_url,
'title': title,
'description': description
- }
+ } \ No newline at end of file
diff --git a/youtube_dl/extractor/nowness.py b/youtube_dl/extractor/nowness.py
index b1bcb7e..1c5e940 100644
--- a/youtube_dl/extractor/nowness.py
+++ b/youtube_dl/extractor/nowness.py
@@ -4,9 +4,7 @@ import re
from .brightcove import BrightcoveIE
from .common import InfoExtractor
-from ..utils import (
- ExtractorError,
-)
+from ..utils import ExtractorError
class NownessIE(InfoExtractor):
@@ -14,9 +12,10 @@ class NownessIE(InfoExtractor):
_TEST = {
'url': 'http://www.nowness.com/day/2013/6/27/3131/candor--the-art-of-gesticulation',
- 'file': '2520295746001.mp4',
- 'md5': '0ece2f70a7bd252c7b00f3070182d418',
+ 'md5': '068bc0202558c2e391924cb8cc470676',
'info_dict': {
+ 'id': '2520295746001',
+ 'ext': 'mp4',
'description': 'Candor: The Art of Gesticulation',
'uploader': 'Nowness',
'title': 'Candor: The Art of Gesticulation',
diff --git a/youtube_dl/extractor/nowvideo.py b/youtube_dl/extractor/nowvideo.py
index 168ca8b..bfba184 100644
--- a/youtube_dl/extractor/nowvideo.py
+++ b/youtube_dl/extractor/nowvideo.py
@@ -1,46 +1,28 @@
-import re
+from __future__ import unicode_literals
-from .common import InfoExtractor
-from ..utils import compat_urlparse
+from .novamov import NovaMovIE
-class NowVideoIE(InfoExtractor):
- _VALID_URL = r'(?:https?://)?(?:www\.)?nowvideo\.(?:ch|sx)/video/(?P<id>\w+)'
- _TEST = {
- u'url': u'http://www.nowvideo.ch/video/0mw0yow7b6dxa',
- u'file': u'0mw0yow7b6dxa.flv',
- u'md5': u'f8fbbc8add72bd95b7850c6a02fc8817',
- u'info_dict': {
- u"title": u"youtubedl test video _BaW_jenozKc.mp4"
- }
- }
-
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
-
- video_id = mobj.group('id')
- webpage_url = 'http://www.nowvideo.ch/video/' + video_id
- embed_url = 'http://embed.nowvideo.ch/embed.php?v=' + video_id
- webpage = self._download_webpage(webpage_url, video_id)
- embed_page = self._download_webpage(embed_url, video_id,
- u'Downloading embed page')
+class NowVideoIE(NovaMovIE):
+ IE_NAME = 'nowvideo'
+ IE_DESC = 'NowVideo'
- self.report_extraction(video_id)
+ _VALID_URL = NovaMovIE._VALID_URL_TEMPLATE % {'host': 'nowvideo\.(?:ch|sx|eu|at|ag|co)'}
- video_title = self._html_search_regex(r'<h4>(.*)</h4>',
- webpage, u'video title')
+ _HOST = 'www.nowvideo.ch'
- video_key = self._search_regex(r'var fkzd="(.*)";',
- embed_page, u'video key')
+ _FILE_DELETED_REGEX = r'>This file no longer exists on our servers.<'
+ _FILEKEY_REGEX = r'var fkzd="([^"]+)";'
+ _TITLE_REGEX = r'<h4>([^<]+)</h4>'
+ _DESCRIPTION_REGEX = r'</h4>\s*<p>([^<]+)</p>'
- api_call = "http://www.nowvideo.ch/api/player.api.php?file={0}&numOfErrors=0&cid=1&key={1}".format(video_id, video_key)
- api_response = self._download_webpage(api_call, video_id,
- u'Downloading API page')
- video_url = compat_urlparse.parse_qs(api_response)[u'url'][0]
-
- return [{
- 'id': video_id,
- 'url': video_url,
- 'ext': 'flv',
- 'title': video_title,
- }]
+ _TEST = {
+ 'url': 'http://www.nowvideo.ch/video/0mw0yow7b6dxa',
+ 'md5': 'f8fbbc8add72bd95b7850c6a02fc8817',
+ 'info_dict': {
+ 'id': '0mw0yow7b6dxa',
+ 'ext': 'flv',
+ 'title': 'youtubedl test video _BaW_jenozKc.mp4',
+ 'description': 'Description',
+ }
+ } \ No newline at end of file
diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py
new file mode 100644
index 0000000..3a6a788
--- /dev/null
+++ b/youtube_dl/extractor/nrk.py
@@ -0,0 +1,145 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ int_or_none,
+ unified_strdate,
+)
+
+
+class NRKIE(InfoExtractor):
+ _VALID_URL = r'http://(?:www\.)?nrk\.no/(?:video|lyd)/[^/]+/(?P<id>[\dA-F]{16})'
+
+ _TESTS = [
+ {
+ 'url': 'http://www.nrk.no/video/dompap_og_andre_fugler_i_piip_show/D0FA54B5C8B6CE59/emne/piipshow/',
+ 'md5': 'a6eac35052f3b242bb6bb7f43aed5886',
+ 'info_dict': {
+ 'id': '150533',
+ 'ext': 'flv',
+ 'title': 'Dompap og andre fugler i Piip-Show',
+ 'description': 'md5:d9261ba34c43b61c812cb6b0269a5c8f'
+ }
+ },
+ {
+ 'url': 'http://www.nrk.no/lyd/lyd_av_oppleser_for_blinde/AEFDDD5473BA0198/',
+ 'md5': '3471f2a51718195164e88f46bf427668',
+ 'info_dict': {
+ 'id': '154915',
+ 'ext': 'flv',
+ 'title': 'Slik høres internett ut når du er blind',
+ 'description': 'md5:a621f5cc1bd75c8d5104cb048c6b8568',
+ }
+ },
+ ]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+
+ page = self._download_webpage(url, video_id)
+
+ video_id = self._html_search_regex(r'<div class="nrk-video" data-nrk-id="(\d+)">', page, 'video id')
+
+ data = self._download_json(
+ 'http://v7.psapi.nrk.no/mediaelement/%s' % video_id, video_id, 'Downloading media JSON')
+
+ if data['usageRights']['isGeoBlocked']:
+ raise ExtractorError('NRK har ikke rettig-heter til å vise dette programmet utenfor Norge', expected=True)
+
+ video_url = data['mediaUrl'] + '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124'
+
+ images = data.get('images')
+ if images:
+ thumbnails = images['webImages']
+ thumbnails.sort(key=lambda image: image['pixelWidth'])
+ thumbnail = thumbnails[-1]['imageUrl']
+ else:
+ thumbnail = None
+
+ return {
+ 'id': video_id,
+ 'url': video_url,
+ 'ext': 'flv',
+ 'title': data['title'],
+ 'description': data['description'],
+ 'thumbnail': thumbnail,
+ }
+
+
+class NRKTVIE(InfoExtractor):
+ _VALID_URL = r'http://tv\.nrk(?:super)?\.no/(?:serie/[^/]+|program)/(?P<id>[a-z]{4}\d{8})'
+
+ _TESTS = [
+ {
+ 'url': 'http://tv.nrk.no/serie/20-spoersmaal-tv/muhh48000314/23-05-2014',
+ 'md5': '7b96112fbae1faf09a6f9ae1aff6cb84',
+ 'info_dict': {
+ 'id': 'muhh48000314',
+ 'ext': 'flv',
+ 'title': '20 spørsmål',
+ 'description': 'md5:bdea103bc35494c143c6a9acdd84887a',
+ 'upload_date': '20140523',
+ 'duration': 1741.52,
+ }
+ },
+ {
+ 'url': 'http://tv.nrk.no/program/mdfp15000514',
+ 'md5': '383650ece2b25ecec996ad7b5bb2a384',
+ 'info_dict': {
+ 'id': 'mdfp15000514',
+ 'ext': 'flv',
+ 'title': 'Kunnskapskanalen: Grunnlovsjubiléet - Stor ståhei for ingenting',
+ 'description': 'md5:654c12511f035aed1e42bdf5db3b206a',
+ 'upload_date': '20140524',
+ 'duration': 4605.0,
+ }
+ },
+ ]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+
+ page = self._download_webpage(url, video_id)
+
+ title = self._html_search_meta('title', page, 'title')
+ description = self._html_search_meta('description', page, 'description')
+ thumbnail = self._html_search_regex(r'data-posterimage="([^"]+)"', page, 'thumbnail', fatal=False)
+ upload_date = unified_strdate(self._html_search_meta('rightsfrom', page, 'upload date', fatal=False))
+ duration = self._html_search_regex(r'data-duration="([^"]+)"', page, 'duration', fatal=False)
+ if duration:
+ duration = float(duration)
+
+ formats = []
+
+ f4m_url = re.search(r'data-media="([^"]+)"', page)
+ if f4m_url:
+ formats.append({
+ 'url': f4m_url.group(1) + '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124',
+ 'format_id': 'f4m',
+ 'ext': 'flv',
+ })
+
+ m3u8_url = re.search(r'data-hls-media="([^"]+)"', page)
+ if m3u8_url:
+ formats.append({
+ 'url': m3u8_url.group(1),
+ 'format_id': 'm3u8',
+ })
+
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'upload_date': upload_date,
+ 'duration': duration,
+ 'formats': formats,
+ } \ No newline at end of file
diff --git a/youtube_dl/extractor/ntv.py b/youtube_dl/extractor/ntv.py
new file mode 100644
index 0000000..733ed6c
--- /dev/null
+++ b/youtube_dl/extractor/ntv.py
@@ -0,0 +1,149 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ unescapeHTML
+)
+
+
+class NTVIE(InfoExtractor):
+ _VALID_URL = r'http://(?:www\.)?ntv\.ru/(?P<id>.+)'
+
+ _TESTS = [
+ {
+ 'url': 'http://www.ntv.ru/novosti/863142/',
+ 'info_dict': {
+ 'id': '746000',
+ 'ext': 'flv',
+ 'title': 'Командующий Черноморским флотом провел переговоры в штабе ВМС Украины',
+ 'description': 'Командующий Черноморским флотом провел переговоры в штабе ВМС Украины',
+ 'duration': 136,
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ },
+ {
+ 'url': 'http://www.ntv.ru/video/novosti/750370/',
+ 'info_dict': {
+ 'id': '750370',
+ 'ext': 'flv',
+ 'title': 'Родные пассажиров пропавшего Boeing не верят в трагический исход',
+ 'description': 'Родные пассажиров пропавшего Boeing не верят в трагический исход',
+ 'duration': 172,
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ },
+ {
+ 'url': 'http://www.ntv.ru/peredacha/segodnya/m23700/o232416',
+ 'info_dict': {
+ 'id': '747480',
+ 'ext': 'flv',
+ 'title': '«Сегодня». 21 марта 2014 года. 16:00 ',
+ 'description': '«Сегодня». 21 марта 2014 года. 16:00 ',
+ 'duration': 1496,
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ },
+ {
+ 'url': 'http://www.ntv.ru/kino/Koma_film',
+ 'info_dict': {
+ 'id': '758100',
+ 'ext': 'flv',
+ 'title': 'Остросюжетный фильм «Кома»',
+ 'description': 'Остросюжетный фильм «Кома»',
+ 'duration': 5592,
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ },
+ {
+ 'url': 'http://www.ntv.ru/serial/Delo_vrachey/m31760/o233916/',
+ 'info_dict': {
+ 'id': '751482',
+ 'ext': 'flv',
+ 'title': '«Дело врачей»: «Деревце жизни»',
+ 'description': '«Дело врачей»: «Деревце жизни»',
+ 'duration': 2590,
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ },
+ ]
+
+ _VIDEO_ID_REGEXES = [
+ r'<meta property="og:url" content="http://www\.ntv\.ru/video/(\d+)',
+ r'<video embed=[^>]+><id>(\d+)</id>',
+ r'<video restriction[^>]+><key>(\d+)</key>',
+ ]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+
+ page = self._download_webpage(url, video_id)
+
+ video_id = self._html_search_regex(self._VIDEO_ID_REGEXES, page, 'video id')
+
+ player = self._download_xml('http://www.ntv.ru/vi%s/' % video_id, video_id, 'Downloading video XML')
+ title = unescapeHTML(player.find('./data/title').text)
+ description = unescapeHTML(player.find('./data/description').text)
+
+ video = player.find('./data/video')
+ video_id = video.find('./id').text
+ thumbnail = video.find('./splash').text
+ duration = int(video.find('./totaltime').text)
+ view_count = int(video.find('./views').text)
+ puid22 = video.find('./puid22').text
+
+ apps = {
+ '4': 'video1',
+ '7': 'video2',
+ }
+
+ app = apps.get(puid22, apps['4'])
+
+ formats = []
+ for format_id in ['', 'hi', 'webm']:
+ file = video.find('./%sfile' % format_id)
+ if file is None:
+ continue
+ size = video.find('./%ssize' % format_id)
+ formats.append({
+ 'url': 'rtmp://media.ntv.ru/%s' % app,
+ 'app': app,
+ 'play_path': file.text,
+ 'rtmp_conn': 'B:1',
+ 'player_url': 'http://www.ntv.ru/swf/vps1.swf?update=20131128',
+ 'page_url': 'http://www.ntv.ru',
+ 'flash_ver': 'LNX 11,2,202,341',
+ 'rtmp_live': True,
+ 'ext': 'flv',
+ 'filesize': int(size.text),
+ })
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'duration': duration,
+ 'view_count': view_count,
+ 'formats': formats,
+ } \ No newline at end of file
diff --git a/youtube_dl/extractor/nuvid.py b/youtube_dl/extractor/nuvid.py
new file mode 100644
index 0000000..e3db9fe
--- /dev/null
+++ b/youtube_dl/extractor/nuvid.py
@@ -0,0 +1,48 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class NuvidIE(InfoExtractor):
+ _VALID_URL = r'^https?://(?:www|m)\.nuvid\.com/video/(?P<id>[0-9]+)'
+ _TEST = {
+ 'url': 'http://m.nuvid.com/video/1310741/',
+ 'md5': 'eab207b7ac4fccfb4e23c86201f11277',
+ 'info_dict': {
+ 'id': '1310741',
+ 'ext': 'mp4',
+ "title": "Horny babes show their awesome bodeis and",
+ "age_limit": 18,
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+
+ murl = url.replace('://www.', '://m.')
+ webpage = self._download_webpage(murl, video_id)
+
+ title = self._html_search_regex(
+ r'<div class="title">\s+<h2[^>]*>([^<]+)</h2>',
+ webpage, 'title').strip()
+
+ url_end = self._html_search_regex(
+ r'href="(/[^"]+)"[^>]*data-link_type="mp4"',
+ webpage, 'video_url')
+ video_url = 'http://m.nuvid.com' + url_end
+
+ thumbnail = self._html_search_regex(
+ r'href="(/thumbs/[^"]+)"[^>]*data-link_type="thumbs"',
+ webpage, 'thumbnail URL', fatal=False)
+
+ return {
+ 'id': video_id,
+ 'url': video_url,
+ 'ext': 'mp4',
+ 'title': title,
+ 'thumbnail': thumbnail,
+ 'age_limit': 18,
+ }
diff --git a/youtube_dl/extractor/nytimes.py b/youtube_dl/extractor/nytimes.py
new file mode 100644
index 0000000..7bf105d
--- /dev/null
+++ b/youtube_dl/extractor/nytimes.py
@@ -0,0 +1,77 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import parse_iso8601
+
+
+class NYTimesIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?nytimes\.com/video/(?:[^/]+/)+(?P<id>\d+)'
+
+ _TEST = {
+ 'url': 'http://www.nytimes.com/video/opinion/100000002847155/verbatim-what-is-a-photocopier.html?playlistId=100000001150263',
+ 'md5': '18a525a510f942ada2720db5f31644c0',
+ 'info_dict': {
+ 'id': '100000002847155',
+ 'ext': 'mov',
+ 'title': 'Verbatim: What Is a Photocopier?',
+ 'description': 'md5:93603dada88ddbda9395632fdc5da260',
+ 'timestamp': 1398631707,
+ 'upload_date': '20140427',
+ 'uploader': 'Brett Weiner',
+ 'duration': 419,
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+
+ video_data = self._download_json(
+ 'http://www.nytimes.com/svc/video/api/v2/video/%s' % video_id, video_id, 'Downloading video JSON')
+
+ title = video_data['headline']
+ description = video_data['summary']
+ duration = video_data['duration'] / 1000.0
+
+ uploader = video_data['byline']
+ timestamp = parse_iso8601(video_data['publication_date'][:-8])
+
+ def get_file_size(file_size):
+ if isinstance(file_size, int):
+ return file_size
+ elif isinstance(file_size, dict):
+ return int(file_size.get('value', 0))
+ else:
+ return 0
+
+ formats = [
+ {
+ 'url': video['url'],
+ 'format_id': video['type'],
+ 'vcodec': video['video_codec'],
+ 'width': video['width'],
+ 'height': video['height'],
+ 'filesize': get_file_size(video['fileSize']),
+ } for video in video_data['renditions']
+ ]
+ self._sort_formats(formats)
+
+ thumbnails = [
+ {
+ 'url': 'http://www.nytimes.com/%s' % image['url'],
+ 'resolution': '%dx%d' % (image['width'], image['height']),
+ } for image in video_data['images']
+ ]
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'timestamp': timestamp,
+ 'uploader': uploader,
+ 'duration': duration,
+ 'formats': formats,
+ 'thumbnails': thumbnails,
+ } \ No newline at end of file
diff --git a/youtube_dl/extractor/oe1.py b/youtube_dl/extractor/oe1.py
new file mode 100644
index 0000000..38971ab
--- /dev/null
+++ b/youtube_dl/extractor/oe1.py
@@ -0,0 +1,40 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import calendar
+import datetime
+import re
+
+from .common import InfoExtractor
+
+# audios on oe1.orf.at are only available for 7 days, so we can't
+# add tests.
+
+
+class OE1IE(InfoExtractor):
+ IE_DESC = 'oe1.orf.at'
+ _VALID_URL = r'http://oe1\.orf\.at/programm/(?P<id>[0-9]+)'
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ show_id = mobj.group('id')
+
+ data = self._download_json(
+ 'http://oe1.orf.at/programm/%s/konsole' % show_id,
+ show_id
+ )
+
+ timestamp = datetime.datetime.strptime('%s %s' % (
+ data['item']['day_label'],
+ data['item']['time']
+ ), '%d.%m.%Y %H:%M')
+ unix_timestamp = calendar.timegm(timestamp.utctimetuple())
+
+ return {
+ 'id': show_id,
+ 'title': data['item']['title'],
+ 'url': data['item']['url_stream'],
+ 'ext': 'mp3',
+ 'description': data['item'].get('info'),
+ 'timestamp': unix_timestamp
+ }
diff --git a/youtube_dl/extractor/ooyala.py b/youtube_dl/extractor/ooyala.py
index 44312ba..13f1282 100644
--- a/youtube_dl/extractor/ooyala.py
+++ b/youtube_dl/extractor/ooyala.py
@@ -1,20 +1,23 @@
+from __future__ import unicode_literals
import re
import json
from .common import InfoExtractor
from ..utils import unescapeHTML
+
class OoyalaIE(InfoExtractor):
- _VALID_URL = r'https?://.+?\.ooyala\.com/.*?(?:embedCode|ec)=(?P<id>.+?)(&|$)'
+ _VALID_URL = r'(?:ooyala:|https?://.+?\.ooyala\.com/.*?(?:embedCode|ec)=)(?P<id>.+?)(&|$)'
_TEST = {
# From http://it.slashdot.org/story/13/04/25/178216/recovering-data-from-broken-hard-drives-and-ssds-video
- u'url': u'http://player.ooyala.com/player.js?embedCode=pxczE2YjpfHfn1f3M-ykG_AmJRRn0PD8',
- u'file': u'pxczE2YjpfHfn1f3M-ykG_AmJRRn0PD8.mp4',
- u'md5': u'3f5cceb3a7bf461d6c29dc466cf8033c',
- u'info_dict': {
- u'title': u'Explaining Data Recovery from Hard Drives and SSDs',
- u'description': u'How badly damaged does a drive have to be to defeat Russell and his crew? Apparently, smashed to bits.',
+ 'url': 'http://player.ooyala.com/player.js?embedCode=pxczE2YjpfHfn1f3M-ykG_AmJRRn0PD8',
+ 'md5': '3f5cceb3a7bf461d6c29dc466cf8033c',
+ 'info_dict': {
+ 'id': 'pxczE2YjpfHfn1f3M-ykG_AmJRRn0PD8',
+ 'ext': 'mp4',
+ 'title': 'Explaining Data Recovery from Hard Drives and SSDs',
+ 'description': 'How badly damaged does a drive have to be to defeat Russell and his crew? Apparently, smashed to bits.',
},
}
@@ -28,13 +31,14 @@ class OoyalaIE(InfoExtractor):
ie=cls.ie_key())
def _extract_result(self, info, more_info):
- return {'id': info['embedCode'],
- 'ext': 'mp4',
- 'title': unescapeHTML(info['title']),
- 'url': info.get('ipad_url') or info['url'],
- 'description': unescapeHTML(more_info['description']),
- 'thumbnail': more_info['promo'],
- }
+ return {
+ 'id': info['embedCode'],
+ 'ext': 'mp4',
+ 'title': unescapeHTML(info['title']),
+ 'url': info.get('ipad_url') or info['url'],
+ 'description': unescapeHTML(more_info['description']),
+ 'thumbnail': more_info['promo'],
+ }
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
@@ -42,22 +46,23 @@ class OoyalaIE(InfoExtractor):
player_url = 'http://player.ooyala.com/player.js?embedCode=%s' % embedCode
player = self._download_webpage(player_url, embedCode)
mobile_url = self._search_regex(r'mobile_player_url="(.+?)&device="',
- player, u'mobile player url')
+ player, 'mobile player url')
mobile_player = self._download_webpage(mobile_url, embedCode)
videos_info = self._search_regex(
r'var streams=window.oo_testEnv\?\[\]:eval\("\((\[{.*?}\])\)"\);',
- mobile_player, u'info').replace('\\"','"')
- videos_more_info = self._search_regex(r'eval\("\(({.*?\\"promo\\".*?})\)"', mobile_player, u'more info').replace('\\"','"')
+ mobile_player, 'info').replace('\\"','"')
+ videos_more_info = self._search_regex(r'eval\("\(({.*?\\"promo\\".*?})\)"', mobile_player, 'more info').replace('\\"','"')
videos_info = json.loads(videos_info)
videos_more_info =json.loads(videos_more_info)
if videos_more_info.get('lineup'):
videos = [self._extract_result(info, more_info) for (info, more_info) in zip(videos_info, videos_more_info['lineup'])]
- return {'_type': 'playlist',
- 'id': embedCode,
- 'title': unescapeHTML(videos_more_info['title']),
- 'entries': videos,
- }
+ return {
+ '_type': 'playlist',
+ 'id': embedCode,
+ 'title': unescapeHTML(videos_more_info['title']),
+ 'entries': videos,
+ }
else:
return self._extract_result(videos_info[0], videos_more_info)
diff --git a/youtube_dl/extractor/orf.py b/youtube_dl/extractor/orf.py
index 5f56943..03421d1 100644
--- a/youtube_dl/extractor/orf.py
+++ b/youtube_dl/extractor/orf.py
@@ -8,6 +8,7 @@ from .common import InfoExtractor
from ..utils import (
HEADRequest,
unified_strdate,
+ ExtractorError,
)
@@ -35,7 +36,15 @@ class ORFIE(InfoExtractor):
data_json = self._search_regex(
r'initializeAdworx\((.+?)\);\n', webpage, 'video info')
all_data = json.loads(data_json)
- sdata = all_data[0]['values']['segments']
+
+ def get_segments(all_data):
+ for data in all_data:
+ if data['name'] == 'Tracker::EPISODE_DETAIL_PAGE_OVER_PROGRAM':
+ return data['values']['segments']
+
+ sdata = get_segments(all_data)
+ if not sdata:
+ raise ExtractorError('Unable to extract segments')
def quality_to_int(s):
m = re.search('([0-9]+)', s)
diff --git a/youtube_dl/extractor/parliamentliveuk.py b/youtube_dl/extractor/parliamentliveuk.py
new file mode 100644
index 0000000..0a423a0
--- /dev/null
+++ b/youtube_dl/extractor/parliamentliveuk.py
@@ -0,0 +1,53 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class ParliamentLiveUKIE(InfoExtractor):
+ IE_NAME = 'parliamentlive.tv'
+ IE_DESC = 'UK parliament videos'
+ _VALID_URL = r'https?://www\.parliamentlive\.tv/Main/Player\.aspx\?(?:[^&]+&)*?meetingId=(?P<id>[0-9]+)'
+
+ _TEST = {
+ 'url': 'http://www.parliamentlive.tv/Main/Player.aspx?meetingId=15121&player=windowsmedia',
+ 'info_dict': {
+ 'id': '15121',
+ 'ext': 'asf',
+ 'title': 'hoc home affairs committee, 18 mar 2014.pm',
+ 'description': 'md5:033b3acdf83304cd43946b2d5e5798d1',
+ },
+ 'params': {
+ 'skip_download': True, # Requires mplayer (mms)
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+ webpage = self._download_webpage(url, video_id)
+
+ asx_url = self._html_search_regex(
+ r'embed.*?src="([^"]+)" name="MediaPlayer"', webpage,
+ 'metadata URL')
+ asx = self._download_xml(asx_url, video_id, 'Downloading ASX metadata')
+ video_url = asx.find('.//REF').attrib['HREF']
+
+ title = self._search_regex(
+ r'''(?x)player\.setClipDetails\(
+ (?:(?:[0-9]+|"[^"]+"),\s*){2}
+ "([^"]+",\s*"[^"]+)"
+ ''',
+ webpage, 'title').replace('", "', ', ')
+ description = self._html_search_regex(
+ r'(?s)<span id="MainContentPlaceHolder_CaptionsBlock_WitnessInfo">(.*?)</span>',
+ webpage, 'description')
+
+ return {
+ 'id': video_id,
+ 'ext': 'asf',
+ 'url': video_url,
+ 'title': title,
+ 'description': description,
+ }
diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py
index e7e0042..64cded7 100644
--- a/youtube_dl/extractor/pbs.py
+++ b/youtube_dl/extractor/pbs.py
@@ -3,6 +3,9 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..utils import (
+ US_RATINGS,
+)
class PBSIE(InfoExtractor):
@@ -13,7 +16,7 @@ class PBSIE(InfoExtractor):
# Article with embedded player
(?:www\.)?pbs\.org/(?:[^/]+/){2,5}(?P<presumptive_id>[^/]+)/?(?:$|[?\#]) |
# Player
- video\.pbs\.org/partnerplayer/(?P<player_id>[^/]+)/
+ video\.pbs\.org/(?:widget/)?partnerplayer/(?P<player_id>[^/]+)/
)
'''
@@ -57,6 +60,11 @@ class PBSIE(InfoExtractor):
info_url = 'http://video.pbs.org/videoInfo/%s?format=json' % video_id
info = self._download_json(info_url, display_id)
+ rating_str = info.get('rating')
+ if rating_str is not None:
+ rating_str = rating_str.rpartition('-')[2]
+ age_limit = US_RATINGS.get(rating_str)
+
return {
'id': video_id,
'title': info['title'],
@@ -65,4 +73,5 @@ class PBSIE(InfoExtractor):
'description': info['program'].get('description'),
'thumbnail': info.get('image_url'),
'duration': info.get('duration'),
+ 'age_limit': age_limit,
}
diff --git a/youtube_dl/extractor/photobucket.py b/youtube_dl/extractor/photobucket.py
index 305b797..8aa69c4 100644
--- a/youtube_dl/extractor/photobucket.py
+++ b/youtube_dl/extractor/photobucket.py
@@ -1,76 +1,45 @@
-import datetime
+from __future__ import unicode_literals
+
import json
import re
from .common import InfoExtractor
+from ..utils import compat_urllib_parse
-from ..utils import (
- ExtractorError,
-)
class PhotobucketIE(InfoExtractor):
- """Information extractor for photobucket.com."""
-
- # TODO: the original _VALID_URL was:
- # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
- # Check if it's necessary to keep the old extracion process
- _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
- IE_NAME = u'photobucket'
+ _VALID_URL = r'http://(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
_TEST = {
- u'url': u'http://media.photobucket.com/user/rachaneronas/media/TiredofLinkBuildingTryBacklinkMyDomaincom_zpsc0c3b9fa.mp4.html?filters[term]=search&filters[primary]=videos&filters[secondary]=images&sort=1&o=0',
- u'file': u'zpsc0c3b9fa.mp4',
- u'md5': u'7dabfb92b0a31f6c16cebc0f8e60ff99',
- u'info_dict': {
- u"upload_date": u"20130504",
- u"uploader": u"rachaneronas",
- u"title": u"Tired of Link Building? Try BacklinkMyDomain.com!"
+ 'url': 'http://media.photobucket.com/user/rachaneronas/media/TiredofLinkBuildingTryBacklinkMyDomaincom_zpsc0c3b9fa.mp4.html?filters[term]=search&filters[primary]=videos&filters[secondary]=images&sort=1&o=0',
+ 'file': 'zpsc0c3b9fa.mp4',
+ 'md5': '7dabfb92b0a31f6c16cebc0f8e60ff99',
+ 'info_dict': {
+ 'timestamp': 1367669341,
+ 'upload_date': '20130504',
+ 'uploader': 'rachaneronas',
+ 'title': 'Tired of Link Building? Try BacklinkMyDomain.com!',
}
}
def _real_extract(self, url):
- # Extract id from URL
mobj = re.match(self._VALID_URL, url)
- if mobj is None:
- raise ExtractorError(u'Invalid URL: %s' % url)
-
video_id = mobj.group('id')
-
video_extension = mobj.group('ext')
- # Retrieve video webpage to extract further information
webpage = self._download_webpage(url, video_id)
# Extract URL, uploader, and title from webpage
self.report_extraction(video_id)
- # We try first by looking the javascript code:
- mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage)
- if mobj is not None:
- info = json.loads(mobj.group('json'))
- return [{
- 'id': video_id,
- 'url': info[u'downloadUrl'],
- 'uploader': info[u'username'],
- 'upload_date': datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'),
- 'title': info[u'title'],
- 'ext': video_extension,
- 'thumbnail': info[u'thumbUrl'],
- }]
-
- # We try looking in other parts of the webpage
- video_url = self._search_regex(r'<link rel="video_src" href=".*\?file=([^"]+)" />',
- webpage, u'video URL')
-
- mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
- if mobj is None:
- raise ExtractorError(u'Unable to extract title')
- video_title = mobj.group(1).decode('utf-8')
- video_uploader = mobj.group(2).decode('utf-8')
-
- return [{
- 'id': video_id.decode('utf-8'),
- 'url': video_url.decode('utf-8'),
- 'uploader': video_uploader,
- 'upload_date': None,
- 'title': video_title,
- 'ext': video_extension.decode('utf-8'),
- }]
+ info_json = self._search_regex(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (.*?)\);',
+ webpage, 'info json')
+ info = json.loads(info_json)
+ url = compat_urllib_parse.unquote(self._html_search_regex(r'file=(.+\.mp4)', info['linkcodes']['html'], 'url'))
+ return {
+ 'id': video_id,
+ 'url': url,
+ 'uploader': info['username'],
+ 'timestamp': info['creationDate'],
+ 'title': info['title'],
+ 'ext': video_extension,
+ 'thumbnail': info['thumbUrl'],
+ }
diff --git a/youtube_dl/extractor/playvid.py b/youtube_dl/extractor/playvid.py
new file mode 100644
index 0000000..b1322f1
--- /dev/null
+++ b/youtube_dl/extractor/playvid.py
@@ -0,0 +1,80 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ compat_urllib_parse,
+)
+
+
+class PlayvidIE(InfoExtractor):
+ _VALID_URL = r'^https?://www\.playvid\.com/watch(\?v=|/)(?P<id>.+?)(?:#|$)'
+ _TEST = {
+ 'url': 'http://www.playvid.com/watch/agbDDi7WZTV',
+ 'md5': '44930f8afa616efdf9482daf4fe53e1e',
+ 'info_dict': {
+ 'id': 'agbDDi7WZTV',
+ 'ext': 'mp4',
+ 'title': 'Michelle Lewin in Miami Beach',
+ 'duration': 240,
+ 'age_limit': 18,
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+
+ webpage = self._download_webpage(url, video_id)
+
+ video_title = None
+ duration = None
+ video_thumbnail = None
+ formats = []
+
+ # most of the information is stored in the flashvars
+ flashvars = self._html_search_regex(
+ r'flashvars="(.+?)"', webpage, 'flashvars')
+
+ infos = compat_urllib_parse.unquote(flashvars).split(r'&')
+ for info in infos:
+ videovars_match = re.match(r'^video_vars\[(.+?)\]=(.+?)$', info)
+ if videovars_match:
+ key = videovars_match.group(1)
+ val = videovars_match.group(2)
+
+ if key == 'title':
+ video_title = compat_urllib_parse.unquote_plus(val)
+ if key == 'duration':
+ try:
+ duration = int(val)
+ except ValueError:
+ pass
+ if key == 'big_thumb':
+ video_thumbnail = val
+
+ videourl_match = re.match(
+ r'^video_urls\]\[(?P<resolution>[0-9]+)p', key)
+ if videourl_match:
+ height = int(videourl_match.group('resolution'))
+ formats.append({
+ 'height': height,
+ 'url': val,
+ })
+ self._sort_formats(formats)
+
+ # Extract title - should be in the flashvars; if not, look elsewhere
+ if video_title is None:
+ video_title = self._html_search_regex(
+ r'<title>(.*?)</title', webpage, 'title')
+
+ return {
+ 'id': video_id,
+ 'formats': formats,
+ 'title': video_title,
+ 'thumbnail': video_thumbnail,
+ 'duration': duration,
+ 'description': None,
+ 'age_limit': 18
+ }
diff --git a/youtube_dl/extractor/podomatic.py b/youtube_dl/extractor/podomatic.py
index 5820097..ffafd23 100644
--- a/youtube_dl/extractor/podomatic.py
+++ b/youtube_dl/extractor/podomatic.py
@@ -1,24 +1,41 @@
+from __future__ import unicode_literals
+
import json
import re
from .common import InfoExtractor
-
+from ..utils import int_or_none
class PodomaticIE(InfoExtractor):
IE_NAME = 'podomatic'
_VALID_URL = r'^(?P<proto>https?)://(?P<channel>[^.]+)\.podomatic\.com/entry/(?P<id>[^?]+)'
- _TEST = {
- u"url": u"http://scienceteachingtips.podomatic.com/entry/2009-01-02T16_03_35-08_00",
- u"file": u"2009-01-02T16_03_35-08_00.mp3",
- u"md5": u"84bb855fcf3429e6bf72460e1eed782d",
- u"info_dict": {
- u"uploader": u"Science Teaching Tips",
- u"uploader_id": u"scienceteachingtips",
- u"title": u"64. When the Moon Hits Your Eye",
- u"duration": 446,
- }
- }
+ _TESTS = [
+ {
+ 'url': 'http://scienceteachingtips.podomatic.com/entry/2009-01-02T16_03_35-08_00',
+ 'md5': '84bb855fcf3429e6bf72460e1eed782d',
+ 'info_dict': {
+ 'id': '2009-01-02T16_03_35-08_00',
+ 'ext': 'mp3',
+ 'uploader': 'Science Teaching Tips',
+ 'uploader_id': 'scienceteachingtips',
+ 'title': '64. When the Moon Hits Your Eye',
+ 'duration': 446,
+ }
+ },
+ {
+ 'url': 'http://ostbahnhof.podomatic.com/entry/2013-11-15T16_31_21-08_00',
+ 'md5': 'd2cf443931b6148e27638650e2638297',
+ 'info_dict': {
+ 'id': '2013-11-15T16_31_21-08_00',
+ 'ext': 'mp3',
+ 'uploader': 'Ostbahnhof / Techno Mix',
+ 'uploader_id': 'ostbahnhof',
+ 'title': 'Einunddreizig',
+ 'duration': 3799,
+ }
+ },
+ ]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
@@ -29,14 +46,16 @@ class PodomaticIE(InfoExtractor):
'?permalink=true&rtmp=0') %
(mobj.group('proto'), channel, video_id))
data_json = self._download_webpage(
- json_url, video_id, note=u'Downloading video info')
+ json_url, video_id, 'Downloading video info')
data = json.loads(data_json)
video_url = data['downloadLink']
+ if not video_url:
+ video_url = '%s/%s' % (data['streamer'].replace('rtmp', 'http'), data['mediaLocation'])
uploader = data['podcast']
title = data['title']
thumbnail = data['imageLocation']
- duration = int(data['length'] / 1000.0)
+ duration = int_or_none(data.get('length'), 1000)
return {
'id': video_id,
diff --git a/youtube_dl/extractor/pornhd.py b/youtube_dl/extractor/pornhd.py
index 58f9c69..718fe9a 100644
--- a/youtube_dl/extractor/pornhd.py
+++ b/youtube_dl/extractor/pornhd.py
@@ -1,44 +1,81 @@
from __future__ import unicode_literals
import re
+import json
from .common import InfoExtractor
-from ..utils import compat_urllib_parse
+from ..utils import int_or_none
class PornHdIE(InfoExtractor):
- _VALID_URL = r'(?:http://)?(?:www\.)?pornhd\.com/(?:[a-z]{2,4}/)?videos/(?P<video_id>[0-9]+)/(?P<video_title>.+)'
+ _VALID_URL = r'http://(?:www\.)?pornhd\.com/(?:[a-z]{2,4}/)?videos/(?P<id>\d+)'
_TEST = {
'url': 'http://www.pornhd.com/videos/1962/sierra-day-gets-his-cum-all-over-herself-hd-porn-video',
- 'file': '1962.flv',
- 'md5': '35272469887dca97abd30abecc6cdf75',
+ 'md5': '956b8ca569f7f4d8ec563e2c41598441',
'info_dict': {
- "title": "sierra-day-gets-his-cum-all-over-herself-hd-porn-video",
- "age_limit": 18,
+ 'id': '1962',
+ 'ext': 'mp4',
+ 'title': 'Sierra loves doing laundry',
+ 'description': 'md5:8ff0523848ac2b8f9b065ba781ccf294',
+ 'age_limit': 18,
}
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
-
- video_id = mobj.group('video_id')
- video_title = mobj.group('video_title')
+ video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
- next_url = self._html_search_regex(
- r'&hd=(http.+?)&', webpage, 'video URL')
- next_url = compat_urllib_parse.unquote(next_url)
+ title = self._og_search_title(webpage)
+ TITLE_SUFFIX = ' porn HD Video | PornHD.com '
+ if title.endswith(TITLE_SUFFIX):
+ title = title[:-len(TITLE_SUFFIX)]
+
+ description = self._html_search_regex(
+ r'<div class="description">([^<]+)</div>', webpage, 'description', fatal=False)
+ view_count = int_or_none(self._html_search_regex(
+ r'(\d+) views </span>', webpage, 'view count', fatal=False))
+
+ formats = [
+ {
+ 'url': format_url,
+ 'ext': format.lower(),
+ 'format_id': '%s-%s' % (format.lower(), quality.lower()),
+ 'quality': 1 if quality.lower() == 'high' else 0,
+ } for format, quality, format_url in re.findall(
+ r'var __video([\da-zA-Z]+?)(Low|High)StreamUrl = \'(http://.+?)\?noProxy=1\'', webpage)
+ ]
+
+ mobj = re.search(r'flashVars = (?P<flashvars>{.+?});', webpage)
+ if mobj:
+ flashvars = json.loads(mobj.group('flashvars'))
+ formats.extend([
+ {
+ 'url': flashvars['hashlink'].replace('?noProxy=1', ''),
+ 'ext': 'flv',
+ 'format_id': 'flv-low',
+ 'quality': 0,
+ },
+ {
+ 'url': flashvars['hd'].replace('?noProxy=1', ''),
+ 'ext': 'flv',
+ 'format_id': 'flv-high',
+ 'quality': 1,
+ }
+ ])
+ thumbnail = flashvars['urlWallpaper']
+ else:
+ thumbnail = self._og_search_thumbnail(webpage)
- video_url = self._download_webpage(
- next_url, video_id, note='Retrieving video URL',
- errnote='Could not retrieve video URL')
- age_limit = 18
+ self._sort_formats(formats)
return {
'id': video_id,
- 'url': video_url,
- 'ext': 'flv',
- 'title': video_title,
- 'age_limit': age_limit,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'view_count': view_count,
+ 'formats': formats,
+ 'age_limit': 18,
}
diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py
index fdda69f..4118ee9 100644
--- a/youtube_dl/extractor/pornhub.py
+++ b/youtube_dl/extractor/pornhub.py
@@ -8,6 +8,7 @@ from ..utils import (
compat_urllib_parse_urlparse,
compat_urllib_request,
compat_urllib_parse,
+ str_to_int,
)
from ..aes import (
aes_decrypt_text
@@ -27,6 +28,12 @@ class PornHubIE(InfoExtractor):
}
}
+ def _extract_count(self, pattern, webpage, name):
+ count = self._html_search_regex(pattern, webpage, '%s count' % name, fatal=False)
+ if count:
+ count = str_to_int(count)
+ return count
+
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('videoid')
@@ -37,14 +44,22 @@ class PornHubIE(InfoExtractor):
webpage = self._download_webpage(req, video_id)
video_title = self._html_search_regex(r'<h1 [^>]+>([^<]+)', webpage, 'title')
- video_uploader = self._html_search_regex(r'<b>From: </b>(?:\s|<[^>]*>)*(.+?)<', webpage, 'uploader', fatal=False)
+ video_uploader = self._html_search_regex(
+ r'(?s)From:&nbsp;.+?<(?:a href="/users/|<span class="username)[^>]+>(.+?)<',
+ webpage, 'uploader', fatal=False)
thumbnail = self._html_search_regex(r'"image_url":"([^"]+)', webpage, 'thumbnail', fatal=False)
if thumbnail:
thumbnail = compat_urllib_parse.unquote(thumbnail)
+ view_count = self._extract_count(r'<span class="count">([\d,\.]+)</span> views', webpage, 'view')
+ like_count = self._extract_count(r'<span class="votesUp">([\d,\.]+)</span>', webpage, 'like')
+ dislike_count = self._extract_count(r'<span class="votesDown">([\d,\.]+)</span>', webpage, 'dislike')
+ comment_count = self._extract_count(
+ r'All comments \(<var class="videoCommentCount">([\d,\.]+)</var>', webpage, 'comment')
+
video_urls = list(map(compat_urllib_parse.unquote , re.findall(r'"quality_[0-9]{3}p":"([^"]+)', webpage)))
if webpage.find('"encrypted":true') != -1:
- password = self._html_search_regex(r'"video_title":"([^"]+)', webpage, 'password').replace('+', ' ')
+ password = compat_urllib_parse.unquote_plus(self._html_search_regex(r'"video_title":"([^"]+)', webpage, 'password'))
video_urls = list(map(lambda s: aes_decrypt_text(s, password, 32).decode('utf-8'), video_urls))
formats = []
@@ -77,6 +92,10 @@ class PornHubIE(InfoExtractor):
'uploader': video_uploader,
'title': video_title,
'thumbnail': thumbnail,
+ 'view_count': view_count,
+ 'like_count': like_count,
+ 'dislike_count': dislike_count,
+ 'comment_count': comment_count,
'formats': formats,
'age_limit': 18,
}
diff --git a/youtube_dl/extractor/prosiebensat1.py b/youtube_dl/extractor/prosiebensat1.py
new file mode 100644
index 0000000..e4c4ad7
--- /dev/null
+++ b/youtube_dl/extractor/prosiebensat1.py
@@ -0,0 +1,286 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from hashlib import sha1
+from .common import InfoExtractor
+from ..utils import (
+ compat_urllib_parse,
+ unified_strdate,
+)
+
+
+class ProSiebenSat1IE(InfoExtractor):
+ IE_NAME = 'prosiebensat1'
+ IE_DESC = 'ProSiebenSat.1 Digital'
+ _VALID_URL = r'https?://(?:www\.)?(?:(?:prosieben|prosiebenmaxx|sixx|sat1|kabeleins|ran|the-voice-of-germany)\.de|fem\.com)/(?P<id>.+)'
+
+ _TESTS = [
+ {
+ 'url': 'http://www.prosieben.de/tv/circus-halligalli/videos/218-staffel-2-episode-18-jahresrueckblick-ganze-folge',
+ 'info_dict': {
+ 'id': '2104602',
+ 'ext': 'mp4',
+ 'title': 'Staffel 2, Episode 18 - Jahresrückblick',
+ 'description': 'md5:8733c81b702ea472e069bc48bb658fc1',
+ 'upload_date': '20131231',
+ 'duration': 5845.04,
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ },
+ {
+ 'url': 'http://www.prosieben.de/videokatalog/Gesellschaft/Leben/Trends/video-Lady-Umstyling-f%C3%BCr-Audrina-Rebekka-Audrina-Fergen-billig-aussehen-Battal-Modica-700544.html',
+ 'info_dict': {
+ 'id': '2570327',
+ 'ext': 'mp4',
+ 'title': 'Lady-Umstyling für Audrina',
+ 'description': 'md5:4c16d0c17a3461a0d43ea4084e96319d',
+ 'upload_date': '20131014',
+ 'duration': 606.76,
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ 'skip': 'Seems to be broken',
+ },
+ {
+ 'url': 'http://www.prosiebenmaxx.de/tv/experience/video/144-countdown-fuer-die-autowerkstatt-ganze-folge',
+ 'info_dict': {
+ 'id': '2429369',
+ 'ext': 'mp4',
+ 'title': 'Countdown für die Autowerkstatt',
+ 'description': 'md5:809fc051a457b5d8666013bc40698817',
+ 'upload_date': '20140223',
+ 'duration': 2595.04,
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ },
+ {
+ 'url': 'http://www.sixx.de/stars-style/video/sexy-laufen-in-ugg-boots-clip',
+ 'info_dict': {
+ 'id': '2904997',
+ 'ext': 'mp4',
+ 'title': 'Sexy laufen in Ugg Boots',
+ 'description': 'md5:edf42b8bd5bc4e5da4db4222c5acb7d6',
+ 'upload_date': '20140122',
+ 'duration': 245.32,
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ },
+ {
+ 'url': 'http://www.sat1.de/film/der-ruecktritt/video/im-interview-kai-wiesinger-clip',
+ 'info_dict': {
+ 'id': '2906572',
+ 'ext': 'mp4',
+ 'title': 'Im Interview: Kai Wiesinger',
+ 'description': 'md5:e4e5370652ec63b95023e914190b4eb9',
+ 'upload_date': '20140225',
+ 'duration': 522.56,
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ },
+ {
+ 'url': 'http://www.kabeleins.de/tv/rosins-restaurants/videos/jagd-auf-fertigkost-im-elsthal-teil-2-ganze-folge',
+ 'info_dict': {
+ 'id': '2992323',
+ 'ext': 'mp4',
+ 'title': 'Jagd auf Fertigkost im Elsthal - Teil 2',
+ 'description': 'md5:2669cde3febe9bce13904f701e774eb6',
+ 'upload_date': '20140225',
+ 'duration': 2410.44,
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ },
+ {
+ 'url': 'http://www.ran.de/fussball/bundesliga/video/schalke-toennies-moechte-raul-zurueck-ganze-folge',
+ 'info_dict': {
+ 'id': '3004256',
+ 'ext': 'mp4',
+ 'title': 'Schalke: Tönnies möchte Raul zurück',
+ 'description': 'md5:4b5b271d9bcde223b54390754c8ece3f',
+ 'upload_date': '20140226',
+ 'duration': 228.96,
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ },
+ {
+ 'url': 'http://www.the-voice-of-germany.de/video/31-andreas-kuemmert-rocket-man-clip',
+ 'info_dict': {
+ 'id': '2572814',
+ 'ext': 'mp4',
+ 'title': 'Andreas Kümmert: Rocket Man',
+ 'description': 'md5:6ddb02b0781c6adf778afea606652e38',
+ 'upload_date': '20131017',
+ 'duration': 469.88,
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ },
+ {
+ 'url': 'http://www.fem.com/wellness/videos/wellness-video-clip-kurztripps-zum-valentinstag.html',
+ 'info_dict': {
+ 'id': '2156342',
+ 'ext': 'mp4',
+ 'title': 'Kurztrips zum Valentinstag',
+ 'description': 'md5:8ba6301e70351ae0bedf8da00f7ba528',
+ 'upload_date': '20130206',
+ 'duration': 307.24,
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ },
+ ]
+
+ _CLIPID_REGEXES = [
+ r'"clip_id"\s*:\s+"(\d+)"',
+ r'clipid: "(\d+)"',
+ r'clipId=(\d+)',
+ ]
+ _TITLE_REGEXES = [
+ r'<h2 class="subtitle" itemprop="name">\s*(.+?)</h2>',
+ r'<header class="clearfix">\s*<h3>(.+?)</h3>',
+ r'<!-- start video -->\s*<h1>(.+?)</h1>',
+ r'<div class="ep-femvideos-pi4-video-txt">\s*<h2>(.+?)</h2>',
+ ]
+ _DESCRIPTION_REGEXES = [
+ r'<p itemprop="description">\s*(.+?)</p>',
+ r'<div class="videoDecription">\s*<p><strong>Beschreibung</strong>: (.+?)</p>',
+ r'<div class="g-plusone" data-size="medium"></div>\s*</div>\s*</header>\s*(.+?)\s*<footer>',
+ r'<p>(.+?)</p>\s*<div class="ep-femvideos-pi4-video-footer">',
+ ]
+ _UPLOAD_DATE_REGEXES = [
+ r'<meta property="og:published_time" content="(.+?)">',
+ r'<span>\s*(\d{2}\.\d{2}\.\d{4} \d{2}:\d{2}) \|\s*<span itemprop="duration"',
+ r'<footer>\s*(\d{2}\.\d{2}\.\d{4}) \d{2}:\d{2} Uhr',
+ r'<span style="padding-left: 4px;line-height:20px; color:#404040">(\d{2}\.\d{2}\.\d{4})</span>',
+ r'(\d{2}\.\d{2}\.\d{4}) \| \d{2}:\d{2} Min<br/>',
+ ]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+
+ page = self._download_webpage(url, video_id, 'Downloading page')
+
+ clip_id = self._html_search_regex(self._CLIPID_REGEXES, page, 'clip id')
+
+ access_token = 'testclient'
+ client_name = 'kolibri-1.2.5'
+ client_location = url
+
+ videos_api_url = 'http://vas.sim-technik.de/vas/live/v2/videos?%s' % compat_urllib_parse.urlencode({
+ 'access_token': access_token,
+ 'client_location': client_location,
+ 'client_name': client_name,
+ 'ids': clip_id,
+ })
+
+ videos = self._download_json(videos_api_url, clip_id, 'Downloading videos JSON')
+
+ duration = float(videos[0]['duration'])
+ source_ids = [source['id'] for source in videos[0]['sources']]
+ source_ids_str = ','.join(map(str, source_ids))
+
+ g = '01!8d8F_)r9]4s[qeuXfP%'
+
+ client_id = g[:2] + sha1(''.join([clip_id, g, access_token, client_location, g, client_name])
+ .encode('utf-8')).hexdigest()
+
+ sources_api_url = 'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources?%s' % (clip_id, compat_urllib_parse.urlencode({
+ 'access_token': access_token,
+ 'client_id': client_id,
+ 'client_location': client_location,
+ 'client_name': client_name,
+ }))
+
+ sources = self._download_json(sources_api_url, clip_id, 'Downloading sources JSON')
+ server_id = sources['server_id']
+
+ client_id = g[:2] + sha1(''.join([g, clip_id, access_token, server_id,
+ client_location, source_ids_str, g, client_name])
+ .encode('utf-8')).hexdigest()
+
+ url_api_url = 'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources/url?%s' % (clip_id, compat_urllib_parse.urlencode({
+ 'access_token': access_token,
+ 'client_id': client_id,
+ 'client_location': client_location,
+ 'client_name': client_name,
+ 'server_id': server_id,
+ 'source_ids': source_ids_str,
+ }))
+
+ urls = self._download_json(url_api_url, clip_id, 'Downloading urls JSON')
+
+ title = self._html_search_regex(self._TITLE_REGEXES, page, 'title')
+ description = self._html_search_regex(self._DESCRIPTION_REGEXES, page, 'description', fatal=False)
+ thumbnail = self._og_search_thumbnail(page)
+
+ upload_date = unified_strdate(self._html_search_regex(
+ self._UPLOAD_DATE_REGEXES, page, 'upload date', fatal=False))
+
+ formats = []
+
+ urls_sources = urls['sources']
+ if isinstance(urls_sources, dict):
+ urls_sources = urls_sources.values()
+
+ def fix_bitrate(bitrate):
+ return bitrate / 1000 if bitrate % 1000 == 0 else bitrate
+
+ for source in urls_sources:
+ protocol = source['protocol']
+ if protocol == 'rtmp' or protocol == 'rtmpe':
+ mobj = re.search(r'^(?P<url>rtmpe?://[^/]+/(?P<app>[^/]+))/(?P<playpath>.+)$', source['url'])
+ if not mobj:
+ continue
+ formats.append({
+ 'url': mobj.group('url'),
+ 'app': mobj.group('app'),
+ 'play_path': mobj.group('playpath'),
+ 'player_url': 'http://livepassdl.conviva.com/hf/ver/2.79.0.17083/LivePassModuleMain.swf',
+ 'page_url': 'http://www.prosieben.de',
+ 'vbr': fix_bitrate(source['bitrate']),
+ 'ext': 'mp4',
+ 'format_id': '%s_%s' % (source['cdn'], source['bitrate']),
+ })
+ else:
+ formats.append({
+ 'url': source['url'],
+ 'vbr': fix_bitrate(source['bitrate']),
+ })
+
+ self._sort_formats(formats)
+
+ return {
+ 'id': clip_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'upload_date': upload_date,
+ 'duration': duration,
+ 'formats': formats,
+ } \ No newline at end of file
diff --git a/youtube_dl/extractor/pyvideo.py b/youtube_dl/extractor/pyvideo.py
index 3305459..0bc0859 100644
--- a/youtube_dl/extractor/pyvideo.py
+++ b/youtube_dl/extractor/pyvideo.py
@@ -1,3 +1,5 @@
+from __future__ import unicode_literals
+
import re
import os
@@ -5,45 +7,51 @@ from .common import InfoExtractor
class PyvideoIE(InfoExtractor):
- _VALID_URL = r'(?:http://)?(?:www\.)?pyvideo\.org/video/(?P<id>\d+)/(.*)'
- _TESTS = [{
- u'url': u'http://pyvideo.org/video/1737/become-a-logging-expert-in-30-minutes',
- u'file': u'24_4WWkSmNo.mp4',
- u'md5': u'de317418c8bc76b1fd8633e4f32acbc6',
- u'info_dict': {
- u"title": u"Become a logging expert in 30 minutes",
- u"description": u"md5:9665350d466c67fb5b1598de379021f7",
- u"upload_date": u"20130320",
- u"uploader": u"NextDayVideo",
- u"uploader_id": u"NextDayVideo",
+ _VALID_URL = r'http://(?:www\.)?pyvideo\.org/video/(?P<id>\d+)/(.*)'
+
+ _TESTS = [
+ {
+ 'url': 'http://pyvideo.org/video/1737/become-a-logging-expert-in-30-minutes',
+ 'md5': 'de317418c8bc76b1fd8633e4f32acbc6',
+ 'info_dict': {
+ 'id': '24_4WWkSmNo',
+ 'ext': 'mp4',
+ 'title': 'Become a logging expert in 30 minutes',
+ 'description': 'md5:9665350d466c67fb5b1598de379021f7',
+ 'upload_date': '20130320',
+ 'uploader': 'NextDayVideo',
+ 'uploader_id': 'NextDayVideo',
+ },
+ 'add_ie': ['Youtube'],
},
- u'add_ie': ['Youtube'],
- },
- {
- u'url': u'http://pyvideo.org/video/2542/gloriajw-spotifywitherikbernhardsson182m4v',
- u'md5': u'5fe1c7e0a8aa5570330784c847ff6d12',
- u'info_dict': {
- u'id': u'2542',
- u'ext': u'm4v',
- u'title': u'Gloriajw-SpotifyWithErikBernhardsson182',
+ {
+ 'url': 'http://pyvideo.org/video/2542/gloriajw-spotifywitherikbernhardsson182m4v',
+ 'md5': '5fe1c7e0a8aa5570330784c847ff6d12',
+ 'info_dict': {
+ 'id': '2542',
+ 'ext': 'm4v',
+ 'title': 'Gloriajw-SpotifyWithErikBernhardsson182',
+ },
},
- },
]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
+
webpage = self._download_webpage(url, video_id)
- m_youtube = re.search(r'(https?://www\.youtube\.com/watch\?v=.*)', webpage)
+ m_youtube = re.search(r'(https?://www\.youtube\.com/watch\?v=.*)', webpage)
if m_youtube is not None:
return self.url_result(m_youtube.group(1), 'Youtube')
- title = self._html_search_regex(r'<div class="section">.*?<h3>([^>]+?)</h3>',
- webpage, u'title', flags=re.DOTALL)
- video_url = self._search_regex([r'<source src="(.*?)"',
- r'<dt>Download</dt>.*?<a href="(.+?)"'],
- webpage, u'video url', flags=re.DOTALL)
+ title = self._html_search_regex(
+ r'<div class="section">.*?<h3(?:\s+class="[^"]*")?>([^>]+?)</h3>',
+ webpage, 'title', flags=re.DOTALL)
+ video_url = self._search_regex(
+ [r'<source src="(.*?)"', r'<dt>Download</dt>.*?<a href="(.+?)"'],
+ webpage, 'video url', flags=re.DOTALL)
+
return {
'id': video_id,
'title': os.path.splitext(title)[0],
diff --git a/youtube_dl/extractor/radiofrance.py b/youtube_dl/extractor/radiofrance.py
index 34652f6..09352ed 100644
--- a/youtube_dl/extractor/radiofrance.py
+++ b/youtube_dl/extractor/radiofrance.py
@@ -1,4 +1,6 @@
# coding: utf-8
+from __future__ import unicode_literals
+
import re
from .common import InfoExtractor
@@ -6,16 +8,17 @@ from .common import InfoExtractor
class RadioFranceIE(InfoExtractor):
_VALID_URL = r'^https?://maison\.radiofrance\.fr/radiovisions/(?P<id>[^?#]+)'
- IE_NAME = u'radiofrance'
+ IE_NAME = 'radiofrance'
_TEST = {
- u'url': u'http://maison.radiofrance.fr/radiovisions/one-one',
- u'file': u'one-one.ogg',
- u'md5': u'bdbb28ace95ed0e04faab32ba3160daf',
- u'info_dict': {
- u"title": u"One to one",
- u"description": u"Plutôt que d'imaginer la radio de demain comme technologie ou comme création de contenu, je veux montrer que quelles que soient ses évolutions, j'ai l'intime conviction que la radio continuera d'être un grand média de proximité pour les auditeurs.",
- u"uploader": u"Thomas Hercouët",
+ 'url': 'http://maison.radiofrance.fr/radiovisions/one-one',
+ 'md5': 'bdbb28ace95ed0e04faab32ba3160daf',
+ 'info_dict': {
+ 'id': 'one-one',
+ 'ext': 'ogg',
+ "title": "One to one",
+ "description": "Plutôt que d'imaginer la radio de demain comme technologie ou comme création de contenu, je veux montrer que quelles que soient ses évolutions, j'ai l'intime conviction que la radio continuera d'être un grand média de proximité pour les auditeurs.",
+ "uploader": "Thomas Hercouët",
},
}
@@ -24,27 +27,28 @@ class RadioFranceIE(InfoExtractor):
video_id = m.group('id')
webpage = self._download_webpage(url, video_id)
- title = self._html_search_regex(r'<h1>(.*?)</h1>', webpage, u'title')
+ title = self._html_search_regex(r'<h1>(.*?)</h1>', webpage, 'title')
description = self._html_search_regex(
r'<div class="bloc_page_wrapper"><div class="text">(.*?)</div>',
- webpage, u'description', fatal=False)
+ webpage, 'description', fatal=False)
uploader = self._html_search_regex(
r'<div class="credit">&nbsp;&nbsp;&copy;&nbsp;(.*?)</div>',
- webpage, u'uploader', fatal=False)
+ webpage, 'uploader', fatal=False)
formats_str = self._html_search_regex(
r'class="jp-jplayer[^"]*" data-source="([^"]+)">',
- webpage, u'audio URLs')
+ webpage, 'audio URLs')
formats = [
{
'format_id': fm[0],
'url': fm[1],
'vcodec': 'none',
+ 'preference': i,
}
- for fm in
- re.findall(r"([a-z0-9]+)\s*:\s*'([^']+)'", formats_str)
+ for i, fm in
+ enumerate(re.findall(r"([a-z0-9]+)\s*:\s*'([^']+)'", formats_str))
]
- # No sorting, we don't know any more about these formats
+ self._sort_formats(formats)
return {
'id': video_id,
diff --git a/youtube_dl/extractor/ro220.py b/youtube_dl/extractor/ro220.py
index 4678f62..a6ad594 100644
--- a/youtube_dl/extractor/ro220.py
+++ b/youtube_dl/extractor/ro220.py
@@ -18,7 +18,7 @@ class Ro220IE(InfoExtractor):
'md5': '03af18b73a07b4088753930db7a34add',
'info_dict': {
"title": "Luati-le Banii sez 4 ep 1",
- "description": "Iata-ne reveniti dupa o binemeritata vacanta. Va astept si pe Facebook cu pareri si comentarii.",
+ "description": "re:^Iata-ne reveniti dupa o binemeritata vacanta\. +Va astept si pe Facebook cu pareri si comentarii.$",
}
}
diff --git a/youtube_dl/extractor/roxwel.py b/youtube_dl/extractor/roxwel.py
index d339e6c..41638c1 100644
--- a/youtube_dl/extractor/roxwel.py
+++ b/youtube_dl/extractor/roxwel.py
@@ -1,5 +1,6 @@
+from __future__ import unicode_literals
+
import re
-import json
from .common import InfoExtractor
from ..utils import unified_strdate, determine_ext
@@ -9,41 +10,44 @@ class RoxwelIE(InfoExtractor):
_VALID_URL = r'https?://www\.roxwel\.com/player/(?P<filename>.+?)(\.|\?|$)'
_TEST = {
- u'url': u'http://www.roxwel.com/player/passionpittakeawalklive.html',
- u'file': u'passionpittakeawalklive.flv',
- u'md5': u'd9dea8360a1e7d485d2206db7fe13035',
- u'info_dict': {
- u'title': u'Take A Walk (live)',
- u'uploader': u'Passion Pit',
- u'description': u'Passion Pit performs "Take A Walk\" live at The Backyard in Austin, Texas. ',
+ 'url': 'http://www.roxwel.com/player/passionpittakeawalklive.html',
+ 'info_dict': {
+ 'id': 'passionpittakeawalklive',
+ 'ext': 'flv',
+ 'title': 'Take A Walk (live)',
+ 'uploader': 'Passion Pit',
+ 'uploader_id': 'passionpit',
+ 'upload_date': '20120928',
+ 'description': 'Passion Pit performs "Take A Walk\" live at The Backyard in Austin, Texas. ',
},
- u'skip': u'Requires rtmpdump',
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ }
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
filename = mobj.group('filename')
info_url = 'http://www.roxwel.com/api/videos/%s' % filename
- info_page = self._download_webpage(info_url, filename,
- u'Downloading video info')
+ info = self._download_json(info_url, filename)
- self.report_extraction(filename)
- info = json.loads(info_page)
rtmp_rates = sorted([int(r.replace('flv_', '')) for r in info['media_rates'] if r.startswith('flv_')])
best_rate = rtmp_rates[-1]
url_page_url = 'http://roxwel.com/pl_one_time.php?filename=%s&quality=%s' % (filename, best_rate)
- rtmp_url = self._download_webpage(url_page_url, filename, u'Downloading video url')
+ rtmp_url = self._download_webpage(url_page_url, filename, 'Downloading video url')
ext = determine_ext(rtmp_url)
if ext == 'f4v':
rtmp_url = rtmp_url.replace(filename, 'mp4:%s' % filename)
- return {'id': filename,
- 'title': info['title'],
- 'url': rtmp_url,
- 'ext': 'flv',
- 'description': info['description'],
- 'thumbnail': info.get('player_image_url') or info.get('image_url_large'),
- 'uploader': info['artist'],
- 'uploader_id': info['artistname'],
- 'upload_date': unified_strdate(info['dbdate']),
- }
+ return {
+ 'id': filename,
+ 'title': info['title'],
+ 'url': rtmp_url,
+ 'ext': 'flv',
+ 'description': info['description'],
+ 'thumbnail': info.get('player_image_url') or info.get('image_url_large'),
+ 'uploader': info['artist'],
+ 'uploader_id': info['artistname'],
+ 'upload_date': unified_strdate(info['dbdate']),
+ }
diff --git a/youtube_dl/extractor/rtbf.py b/youtube_dl/extractor/rtbf.py
new file mode 100644
index 0000000..205f8a1
--- /dev/null
+++ b/youtube_dl/extractor/rtbf.py
@@ -0,0 +1,49 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+import json
+
+from .common import InfoExtractor
+
+
+class RTBFIE(InfoExtractor):
+ _VALID_URL = r'https?://www.rtbf.be/video/[^\?]+\?id=(?P<id>\d+)'
+ _TEST = {
+ 'url': 'https://www.rtbf.be/video/detail_les-diables-au-coeur-episode-2?id=1921274',
+ 'md5': '799f334ddf2c0a582ba80c44655be570',
+ 'info_dict': {
+ 'id': '1921274',
+ 'ext': 'mp4',
+ 'title': 'Les Diables au coeur (épisode 2)',
+ 'description': 'Football - Diables Rouges',
+ 'duration': 3099,
+ 'timestamp': 1398456336,
+ 'upload_date': '20140425',
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+
+ page = self._download_webpage('https://www.rtbf.be/video/embed?id=%s' % video_id, video_id)
+
+ data = json.loads(self._html_search_regex(
+ r'<div class="js-player-embed" data-video="([^"]+)"', page, 'data video'))['data']
+
+ video_url = data.get('downloadUrl') or data.get('url')
+
+ if data['provider'].lower() == 'youtube':
+ return self.url_result(video_url, 'Youtube')
+
+ return {
+ 'id': video_id,
+ 'url': video_url,
+ 'title': data['title'],
+ 'description': data.get('description') or data.get('subtitle'),
+ 'thumbnail': data['thumbnail']['large'],
+ 'duration': data.get('duration') or data.get('realDuration'),
+ 'timestamp': data['created'],
+ 'view_count': data['viewCount'],
+ }
diff --git a/youtube_dl/extractor/rtlnow.py b/youtube_dl/extractor/rtlnow.py
index cd50f70..4835ec5 100644
--- a/youtube_dl/extractor/rtlnow.py
+++ b/youtube_dl/extractor/rtlnow.py
@@ -1,148 +1,165 @@
# encoding: utf-8
-
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
- clean_html,
ExtractorError,
+ clean_html,
+ unified_strdate,
+ int_or_none,
)
class RTLnowIE(InfoExtractor):
"""Information Extractor for RTL NOW, RTL2 NOW, RTL NITRO, SUPER RTL NOW, VOX NOW and n-tv NOW"""
- _VALID_URL = r'(?:http://)?(?P<url>(?P<domain>rtl-now\.rtl\.de|rtl2now\.rtl2\.de|(?:www\.)?voxnow\.de|(?:www\.)?rtlnitronow\.de|(?:www\.)?superrtlnow\.de|(?:www\.)?n-tvnow\.de)/+[a-zA-Z0-9-]+/[a-zA-Z0-9-]+\.php\?(?:container_id|film_id)=(?P<video_id>[0-9]+)&player=1(?:&season=[0-9]+)?(?:&.*)?)'
- _TESTS = [{
- 'url': 'http://rtl-now.rtl.de/ahornallee/folge-1.php?film_id=90419&player=1&season=1',
- 'file': '90419.flv',
- 'info_dict': {
- 'upload_date': '20070416',
- 'title': 'Ahornallee - Folge 1 - Der Einzug',
- 'description': 'Folge 1 - Der Einzug',
- },
- 'params': {
- 'skip_download': True,
- },
- 'skip': 'Only works from Germany',
- },
- {
- 'url': 'http://rtl2now.rtl2.de/aerger-im-revier/episode-15-teil-1.php?film_id=69756&player=1&season=2&index=5',
- 'file': '69756.flv',
- 'info_dict': {
- 'upload_date': '20120519',
- 'title': 'Ärger im Revier - Ein junger Ladendieb, ein handfester Streit...',
- 'description': 'Ärger im Revier - Ein junger Ladendieb, ein handfester Streit u.a.',
- 'thumbnail': 'http://autoimg.static-fra.de/rtl2now/219850/1500x1500/image2.jpg',
- },
- 'params': {
- 'skip_download': True,
- },
- 'skip': 'Only works from Germany',
- },
- {
- 'url': 'http://www.voxnow.de/voxtours/suedafrika-reporter-ii.php?film_id=13883&player=1&season=17',
- 'file': '13883.flv',
- 'info_dict': {
- 'upload_date': '20090627',
- 'title': 'Voxtours - Südafrika-Reporter II',
- 'description': 'Südafrika-Reporter II',
- },
- 'params': {
- 'skip_download': True,
+ _VALID_URL = r'''(?x)
+ (?:https?://)?
+ (?P<url>
+ (?P<domain>
+ rtl-now\.rtl\.de|
+ rtl2now\.rtl2\.de|
+ (?:www\.)?voxnow\.de|
+ (?:www\.)?rtlnitronow\.de|
+ (?:www\.)?superrtlnow\.de|
+ (?:www\.)?n-tvnow\.de)
+ /+[a-zA-Z0-9-]+/[a-zA-Z0-9-]+\.php\?
+ (?:container_id|film_id)=(?P<video_id>[0-9]+)&
+ player=1(?:&season=[0-9]+)?(?:&.*)?
+ )'''
+
+ _TESTS = [
+ {
+ 'url': 'http://rtl-now.rtl.de/ahornallee/folge-1.php?film_id=90419&player=1&season=1',
+ 'info_dict': {
+ 'id': '90419',
+ 'ext': 'flv',
+ 'title': 'Ahornallee - Folge 1 - Der Einzug',
+ 'description': 'md5:ce843b6b5901d9a7f7d04d1bbcdb12de',
+ 'upload_date': '20070416',
+ 'duration': 1685,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ 'skip': 'Only works from Germany',
},
- },
- {
- 'url': 'http://superrtlnow.de/medicopter-117/angst.php?film_id=99205&player=1',
- 'file': '99205.flv',
- 'info_dict': {
- 'upload_date': '20080928',
- 'title': 'Medicopter 117 - Angst!',
- 'description': 'Angst!',
- 'thumbnail': 'http://autoimg.static-fra.de/superrtlnow/287529/1500x1500/image2.jpg'
+ {
+ 'url': 'http://rtl2now.rtl2.de/aerger-im-revier/episode-15-teil-1.php?film_id=69756&player=1&season=2&index=5',
+ 'info_dict': {
+ 'id': '69756',
+ 'ext': 'flv',
+ 'title': 'Ärger im Revier - Ein junger Ladendieb, ein handfester Streit u.a.',
+ 'description': 'md5:3fb247005ed21a935ffc82b7dfa70cf0',
+ 'thumbnail': 'http://autoimg.static-fra.de/rtl2now/219850/1500x1500/image2.jpg',
+ 'upload_date': '20120519',
+ 'duration': 1245,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ 'skip': 'Only works from Germany',
},
- 'params': {
- 'skip_download': True,
+ {
+ 'url': 'http://www.voxnow.de/voxtours/suedafrika-reporter-ii.php?film_id=13883&player=1&season=17',
+ 'info_dict': {
+ 'id': '13883',
+ 'ext': 'flv',
+ 'title': 'Voxtours - Südafrika-Reporter II',
+ 'description': 'md5:de7f8d56be6fd4fed10f10f57786db00',
+ 'upload_date': '20090627',
+ 'duration': 1800,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
},
- },
- {
- 'url': 'http://www.n-tvnow.de/top-gear/episode-1-2013-01-01-00-00-00.php?film_id=124903&player=1&season=10',
- 'file': '124903.flv',
- 'info_dict': {
- 'upload_date': '20130101',
- 'title': 'Top Gear vom 01.01.2013',
- 'description': 'Episode 1',
+ {
+ 'url': 'http://superrtlnow.de/medicopter-117/angst.php?film_id=99205&player=1',
+ 'info_dict': {
+ 'id': '99205',
+ 'ext': 'flv',
+ 'title': 'Medicopter 117 - Angst!',
+ 'description': 'md5:895b1df01639b5f61a04fc305a5cb94d',
+ 'thumbnail': 'http://autoimg.static-fra.de/superrtlnow/287529/1500x1500/image2.jpg',
+ 'upload_date': '20080928',
+ 'duration': 2691,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
},
- 'params': {
- 'skip_download': True,
+ {
+ 'url': 'http://www.n-tvnow.de/deluxe-alles-was-spass-macht/thema-ua-luxushotel-fuer-vierbeiner.php?container_id=153819&player=1&season=0',
+ 'info_dict': {
+ 'id': '153819',
+ 'ext': 'flv',
+ 'title': 'Deluxe - Alles was Spaß macht - Thema u.a.: Luxushotel für Vierbeiner',
+ 'description': 'md5:c3705e1bb32e1a5b2bcd634fc065c631',
+ 'thumbnail': 'http://autoimg.static-fra.de/ntvnow/383157/1500x1500/image2.jpg',
+ 'upload_date': '20140221',
+ 'duration': 2429,
+ },
+ 'skip': 'Only works from Germany',
},
- 'skip': 'Only works from Germany',
- }]
+ ]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
-
- webpage_url = 'http://' + mobj.group('url')
- video_page_url = 'http://' + mobj.group('domain') + '/'
+ video_page_url = 'http://%s/' % mobj.group('domain')
video_id = mobj.group('video_id')
- webpage = self._download_webpage(webpage_url, video_id)
+ webpage = self._download_webpage('http://' + mobj.group('url'), video_id)
- note_m = re.search(r'''(?sx)
- <div[ ]style="margin-left:[ ]20px;[ ]font-size:[ ]13px;">(.*?)
- <div[ ]id="playerteaser">''', webpage)
- if note_m:
- msg = clean_html(note_m.group(1))
- raise ExtractorError(msg)
+ mobj = re.search(r'(?s)<div style="margin-left: 20px; font-size: 13px;">(.*?)<div id="playerteaser">', webpage)
+ if mobj:
+ raise ExtractorError(clean_html(mobj.group(1)), expected=True)
- video_title = self._html_search_regex(
- r'<title>(?P<title>[^<]+?)( \| [^<]*)?</title>',
- webpage, 'title')
- playerdata_url = self._html_search_regex(
- r'\'playerdata\': \'(?P<playerdata_url>[^\']+)\'',
- webpage, 'playerdata_url')
+ title = self._og_search_title(webpage)
+ description = self._og_search_description(webpage)
+ thumbnail = self._og_search_thumbnail(webpage, default=None)
- playerdata = self._download_webpage(playerdata_url, video_id)
- mobj = re.search(r'<title><!\[CDATA\[(?P<description>.+?)(?:\s+- (?:Sendung )?vom (?P<upload_date_d>[0-9]{2})\.(?P<upload_date_m>[0-9]{2})\.(?:(?P<upload_date_Y>[0-9]{4})|(?P<upload_date_y>[0-9]{2})) [0-9]{2}:[0-9]{2} Uhr)?\]\]></title>', playerdata)
- if mobj:
- video_description = mobj.group('description')
- if mobj.group('upload_date_Y'):
- video_upload_date = mobj.group('upload_date_Y')
- elif mobj.group('upload_date_y'):
- video_upload_date = '20' + mobj.group('upload_date_y')
- else:
- video_upload_date = None
- if video_upload_date:
- video_upload_date += mobj.group('upload_date_m') + mobj.group('upload_date_d')
- else:
- video_description = None
- video_upload_date = None
- self._downloader.report_warning('Unable to extract description and upload date')
+ upload_date = unified_strdate(self._html_search_meta('uploadDate', webpage, 'upload date'))
- # Thumbnail: not every video has an thumbnail
- mobj = re.search(r'<meta property="og:image" content="(?P<thumbnail>[^"]+)">', webpage)
- if mobj:
- video_thumbnail = mobj.group('thumbnail')
- else:
- video_thumbnail = None
+ mobj = re.search(r'<meta itemprop="duration" content="PT(?P<seconds>\d+)S" />', webpage)
+ duration = int(mobj.group('seconds')) if mobj else None
- mobj = re.search(r'<filename [^>]+><!\[CDATA\[(?P<url>rtmpe://(?:[^/]+/){2})(?P<play_path>[^\]]+)\]\]></filename>', playerdata)
- if mobj is None:
- raise ExtractorError('Unable to extract media URL')
- video_url = mobj.group('url')
- video_play_path = 'mp4:' + mobj.group('play_path')
- video_player_url = video_page_url + 'includes/vodplayer.swf'
+ playerdata_url = self._html_search_regex(
+ r"'playerdata': '(?P<playerdata_url>[^']+)'", webpage, 'playerdata_url')
+
+ playerdata = self._download_xml(playerdata_url, video_id, 'Downloading player data XML')
+
+ videoinfo = playerdata.find('./playlist/videoinfo')
+
+ formats = []
+ for filename in videoinfo.findall('filename'):
+ mobj = re.search(r'(?P<url>rtmpe://(?:[^/]+/){2})(?P<play_path>.+)', filename.text)
+ if mobj:
+ fmt = {
+ 'url': mobj.group('url'),
+ 'play_path': 'mp4:' + mobj.group('play_path'),
+ 'page_url': video_page_url,
+ 'player_url': video_page_url + 'includes/vodplayer.swf',
+ }
+ else:
+ fmt = {
+ 'url': filename.text,
+ }
+ fmt.update({
+ 'width': int_or_none(filename.get('width')),
+ 'height': int_or_none(filename.get('height')),
+ 'vbr': int_or_none(filename.get('bitrate')),
+ 'ext': 'flv',
+ })
+ formats.append(fmt)
return {
'id': video_id,
- 'url': video_url,
- 'play_path': video_play_path,
- 'page_url': video_page_url,
- 'player_url': video_player_url,
- 'ext': 'flv',
- 'title': video_title,
- 'description': video_description,
- 'upload_date': video_upload_date,
- 'thumbnail': video_thumbnail,
- }
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'upload_date': upload_date,
+ 'duration': duration,
+ 'formats': formats,
+ } \ No newline at end of file
diff --git a/youtube_dl/extractor/rts.py b/youtube_dl/extractor/rts.py
new file mode 100644
index 0000000..e8199b1
--- /dev/null
+++ b/youtube_dl/extractor/rts.py
@@ -0,0 +1,154 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ int_or_none,
+ parse_duration,
+ parse_iso8601,
+ unescapeHTML,
+ compat_str,
+)
+
+
+class RTSIE(InfoExtractor):
+ IE_DESC = 'RTS.ch'
+ _VALID_URL = r'^https?://(?:www\.)?rts\.ch/(?:[^/]+/){2,}(?P<id>[0-9]+)-.*?\.html'
+
+ _TESTS = [
+ {
+ 'url': 'http://www.rts.ch/archives/tv/divers/3449373-les-enfants-terribles.html',
+ 'md5': '753b877968ad8afaeddccc374d4256a5',
+ 'info_dict': {
+ 'id': '3449373',
+ 'ext': 'mp4',
+ 'duration': 1488,
+ 'title': 'Les Enfants Terribles',
+ 'description': 'France Pommier et sa soeur Luce Feral, les deux filles de ce groupe de 5.',
+ 'uploader': 'Divers',
+ 'upload_date': '19680921',
+ 'timestamp': -40280400,
+ 'thumbnail': 're:^https?://.*\.image'
+ },
+ },
+ {
+ 'url': 'http://www.rts.ch/emissions/passe-moi-les-jumelles/5624067-entre-ciel-et-mer.html',
+ 'md5': 'c148457a27bdc9e5b1ffe081a7a8337b',
+ 'info_dict': {
+ 'id': '5624067',
+ 'ext': 'mp4',
+ 'duration': 3720,
+ 'title': 'Les yeux dans les cieux - Mon homard au Canada',
+ 'description': 'md5:d22ee46f5cc5bac0912e5a0c6d44a9f7',
+ 'uploader': 'Passe-moi les jumelles',
+ 'upload_date': '20140404',
+ 'timestamp': 1396635300,
+ 'thumbnail': 're:^https?://.*\.image'
+ },
+ },
+ {
+ 'url': 'http://www.rts.ch/video/sport/hockey/5745975-1-2-kloten-fribourg-5-2-second-but-pour-gotteron-par-kwiatowski.html',
+ 'md5': 'b4326fecd3eb64a458ba73c73e91299d',
+ 'info_dict': {
+ 'id': '5745975',
+ 'ext': 'mp4',
+ 'duration': 48,
+ 'title': '1/2, Kloten - Fribourg (5-2): second but pour Gottéron par Kwiatowski',
+ 'description': 'Hockey - Playoff',
+ 'uploader': 'Hockey',
+ 'upload_date': '20140403',
+ 'timestamp': 1396556882,
+ 'thumbnail': 're:^https?://.*\.image'
+ },
+ 'skip': 'Blocked outside Switzerland',
+ },
+ {
+ 'url': 'http://www.rts.ch/video/info/journal-continu/5745356-londres-cachee-par-un-epais-smog.html',
+ 'md5': '9bb06503773c07ce83d3cbd793cebb91',
+ 'info_dict': {
+ 'id': '5745356',
+ 'ext': 'mp4',
+ 'duration': 33,
+ 'title': 'Londres cachée par un épais smog',
+ 'description': 'Un important voile de smog recouvre Londres depuis mercredi, provoqué par la pollution et du sable du Sahara.',
+ 'uploader': 'Le Journal en continu',
+ 'upload_date': '20140403',
+ 'timestamp': 1396537322,
+ 'thumbnail': 're:^https?://.*\.image'
+ },
+ },
+ {
+ 'url': 'http://www.rts.ch/audio/couleur3/programmes/la-belle-video-de-stephane-laurenceau/5706148-urban-hippie-de-damien-krisl-03-04-2014.html',
+ 'md5': 'dd8ef6a22dff163d063e2a52bc8adcae',
+ 'info_dict': {
+ 'id': '5706148',
+ 'ext': 'mp3',
+ 'duration': 123,
+ 'title': '"Urban Hippie", de Damien Krisl',
+ 'description': 'Des Hippies super glam.',
+ 'upload_date': '20140403',
+ 'timestamp': 1396551600,
+ },
+ },
+ ]
+
+ def _real_extract(self, url):
+ m = re.match(self._VALID_URL, url)
+ video_id = m.group('id')
+
+ def download_json(internal_id):
+ return self._download_json(
+ 'http://www.rts.ch/a/%s.html?f=json/article' % internal_id,
+ video_id)
+
+ all_info = download_json(video_id)
+
+ # video_id extracted out of URL is not always a real id
+ if 'video' not in all_info and 'audio' not in all_info:
+ page = self._download_webpage(url, video_id)
+ internal_id = self._html_search_regex(
+ r'<(?:video|audio) data-id="([0-9]+)"', page,
+ 'internal video id')
+ all_info = download_json(internal_id)
+
+ info = all_info['video']['JSONinfo'] if 'video' in all_info else all_info['audio']
+
+ upload_timestamp = parse_iso8601(info.get('broadcast_date'))
+ duration = info.get('duration') or info.get('cutout') or info.get('cutduration')
+ if isinstance(duration, compat_str):
+ duration = parse_duration(duration)
+ view_count = info.get('plays')
+ thumbnail = unescapeHTML(info.get('preview_image_url'))
+
+ def extract_bitrate(url):
+ return int_or_none(self._search_regex(
+ r'-([0-9]+)k\.', url, 'bitrate', default=None))
+
+ formats = [{
+ 'format_id': fid,
+ 'url': furl,
+ 'tbr': extract_bitrate(furl),
+ } for fid, furl in info['streams'].items()]
+
+ if 'media' in info:
+ formats.extend([{
+ 'format_id': '%s-%sk' % (media['ext'], media['rate']),
+ 'url': 'http://download-video.rts.ch/%s' % media['url'],
+ 'tbr': media['rate'] or extract_bitrate(media['url']),
+ } for media in info['media'] if media.get('rate')])
+
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'formats': formats,
+ 'title': info['title'],
+ 'description': info.get('intro'),
+ 'duration': duration,
+ 'view_count': view_count,
+ 'uploader': info.get('programName'),
+ 'timestamp': upload_timestamp,
+ 'thumbnail': thumbnail,
+ }
diff --git a/youtube_dl/extractor/rtve.py b/youtube_dl/extractor/rtve.py
new file mode 100644
index 0000000..77fd08d
--- /dev/null
+++ b/youtube_dl/extractor/rtve.py
@@ -0,0 +1,84 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+import re
+import base64
+
+from .common import InfoExtractor
+from ..utils import (
+ struct_unpack,
+)
+
+
+class RTVEALaCartaIE(InfoExtractor):
+ IE_NAME = 'rtve.es:alacarta'
+ IE_DESC = 'RTVE a la carta'
+ _VALID_URL = r'http://www\.rtve\.es/alacarta/videos/[^/]+/[^/]+/(?P<id>\d+)'
+
+ _TEST = {
+ 'url': 'http://www.rtve.es/alacarta/videos/balonmano/o-swiss-cup-masculina-final-espana-suecia/2491869/',
+ 'md5': '18fcd45965bdd076efdb12cd7f6d7b9e',
+ 'info_dict': {
+ 'id': '2491869',
+ 'ext': 'mp4',
+ 'title': 'Balonmano - Swiss Cup masculina. Final: España-Suecia',
+ },
+ }
+
+ def _decrypt_url(self, png):
+ encrypted_data = base64.b64decode(png)
+ text_index = encrypted_data.find(b'tEXt')
+ text_chunk = encrypted_data[text_index-4:]
+ length = struct_unpack('!I', text_chunk[:4])[0]
+ # Use bytearray to get integers when iterating in both python 2.x and 3.x
+ data = bytearray(text_chunk[8:8+length])
+ data = [chr(b) for b in data if b != 0]
+ hash_index = data.index('#')
+ alphabet_data = data[:hash_index]
+ url_data = data[hash_index+1:]
+
+ alphabet = []
+ e = 0
+ d = 0
+ for l in alphabet_data:
+ if d == 0:
+ alphabet.append(l)
+ d = e = (e + 1) % 4
+ else:
+ d -= 1
+ url = ''
+ f = 0
+ e = 3
+ b = 1
+ for letter in url_data:
+ if f == 0:
+ l = int(letter)*10
+ f = 1
+ else:
+ if e == 0:
+ l += int(letter)
+ url += alphabet[l]
+ e = (b + 3) % 4
+ f = 0
+ b += 1
+ else:
+ e -= 1
+
+ return url
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+ info = self._download_json(
+ 'http://www.rtve.es/api/videos/%s/config/alacarta_videos.json' % video_id,
+ video_id)['page']['items'][0]
+ png_url = 'http://www.rtve.es/ztnr/movil/thumbnail/default/videos/%s.png' % video_id
+ png = self._download_webpage(png_url, video_id, 'Downloading url information')
+ video_url = self._decrypt_url(png)
+
+ return {
+ 'id': video_id,
+ 'title': info['title'],
+ 'url': video_url,
+ 'thumbnail': info['image'],
+ }
diff --git a/youtube_dl/extractor/rutube.py b/youtube_dl/extractor/rutube.py
index 4922dd7..357edbb 100644
--- a/youtube_dl/extractor/rutube.py
+++ b/youtube_dl/extractor/rutube.py
@@ -2,7 +2,6 @@
from __future__ import unicode_literals
import re
-import json
import itertools
from .common import InfoExtractor
@@ -20,8 +19,9 @@ class RutubeIE(InfoExtractor):
_TEST = {
'url': 'http://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/',
- 'file': '3eac3b4561676c17df9132a9a1e62e3e.mp4',
'info_dict': {
+ 'id': '3eac3b4561676c17df9132a9a1e62e3e',
+ 'ext': 'mp4',
'title': 'Раненный кенгуру забежал в аптеку',
'description': 'http://www.ntdtv.ru ',
'duration': 80,
@@ -38,18 +38,19 @@ class RutubeIE(InfoExtractor):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
-
- api_response = self._download_webpage('http://rutube.ru/api/video/%s/?format=json' % video_id,
- video_id, 'Downloading video JSON')
- video = json.loads(api_response)
-
- api_response = self._download_webpage('http://rutube.ru/api/play/trackinfo/%s/?format=json' % video_id,
- video_id, 'Downloading trackinfo JSON')
- trackinfo = json.loads(api_response)
-
+
+ video = self._download_json(
+ 'http://rutube.ru/api/video/%s/?format=json' % video_id,
+ video_id, 'Downloading video JSON')
+
# Some videos don't have the author field
- author = trackinfo.get('author') or {}
- m3u8_url = trackinfo['video_balancer'].get('m3u8')
+ author = video.get('author') or {}
+
+ options = self._download_json(
+ 'http://rutube.ru/api/play/options/%s/?format=json' % video_id,
+ video_id, 'Downloading options JSON')
+
+ m3u8_url = options['video_balancer'].get('m3u8')
if m3u8_url is None:
raise ExtractorError('Couldn\'t find m3u8 manifest url')
@@ -79,10 +80,9 @@ class RutubeChannelIE(InfoExtractor):
def _extract_videos(self, channel_id, channel_title=None):
entries = []
for pagenum in itertools.count(1):
- api_response = self._download_webpage(
+ page = self._download_json(
self._PAGE_TEMPLATE % (channel_id, pagenum),
channel_id, 'Downloading page %s' % pagenum)
- page = json.loads(api_response)
results = page['results']
if not results:
break
@@ -108,10 +108,9 @@ class RutubeMovieIE(RutubeChannelIE):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
movie_id = mobj.group('id')
- api_response = self._download_webpage(
+ movie = self._download_json(
self._MOVIE_TEMPLATE % movie_id, movie_id,
'Downloading movie JSON')
- movie = json.loads(api_response)
movie_name = movie['name']
return self._extract_videos(movie_id, movie_name)
diff --git a/youtube_dl/extractor/rutv.py b/youtube_dl/extractor/rutv.py
new file mode 100644
index 0000000..6c5f5a6
--- /dev/null
+++ b/youtube_dl/extractor/rutv.py
@@ -0,0 +1,194 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ int_or_none
+)
+
+
+class RUTVIE(InfoExtractor):
+ IE_DESC = 'RUTV.RU'
+ _VALID_URL = r'''(?x)
+ https?://player\.(?:rutv\.ru|vgtrk\.com)/
+ (?P<path>flash2v/container\.swf\?id=
+ |iframe/(?P<type>swf|video|live)/id/
+ |index/iframe/cast_id/)
+ (?P<id>\d+)'''
+
+ _TESTS = [
+ {
+ 'url': 'http://player.rutv.ru/flash2v/container.swf?id=774471&sid=kultura&fbv=true&isPlay=true&ssl=false&i=560&acc_video_id=episode_id/972347/video_id/978186/brand_id/31724',
+ 'info_dict': {
+ 'id': '774471',
+ 'ext': 'mp4',
+ 'title': 'Монологи на все времена',
+ 'description': 'md5:18d8b5e6a41fb1faa53819471852d5d5',
+ 'duration': 2906,
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ },
+ {
+ 'url': 'https://player.vgtrk.com/flash2v/container.swf?id=774016&sid=russiatv&fbv=true&isPlay=true&ssl=false&i=560&acc_video_id=episode_id/972098/video_id/977760/brand_id/57638',
+ 'info_dict': {
+ 'id': '774016',
+ 'ext': 'mp4',
+ 'title': 'Чужой в семье Сталина',
+ 'description': '',
+ 'duration': 2539,
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ },
+ {
+ 'url': 'http://player.rutv.ru/iframe/swf/id/766888/sid/hitech/?acc_video_id=4000',
+ 'info_dict': {
+ 'id': '766888',
+ 'ext': 'mp4',
+ 'title': 'Вести.net: интернет-гиганты начали перетягивание программных "одеял"',
+ 'description': 'md5:65ddd47f9830c4f42ed6475f8730c995',
+ 'duration': 279,
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ },
+ {
+ 'url': 'http://player.rutv.ru/iframe/video/id/771852/start_zoom/true/showZoomBtn/false/sid/russiatv/?acc_video_id=episode_id/970443/video_id/975648/brand_id/5169',
+ 'info_dict': {
+ 'id': '771852',
+ 'ext': 'mp4',
+ 'title': 'Прямой эфир. Жертвы загадочной болезни: смерть от старости в 17 лет',
+ 'description': 'md5:b81c8c55247a4bd996b43ce17395b2d8',
+ 'duration': 3096,
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ },
+ {
+ 'url': 'http://player.rutv.ru/iframe/live/id/51499/showZoomBtn/false/isPlay/true/sid/sochi2014',
+ 'info_dict': {
+ 'id': '51499',
+ 'ext': 'flv',
+ 'title': 'Сочи-2014. Биатлон. Индивидуальная гонка. Мужчины ',
+ 'description': 'md5:9e0ed5c9d2fa1efbfdfed90c9a6d179c',
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ 'skip': 'Translation has finished',
+ },
+ ]
+
+ @classmethod
+ def _extract_url(cls, webpage):
+ mobj = re.search(
+ r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.rutv\.ru/(?:iframe/(?:swf|video|live)/id|index/iframe/cast_id)/.+?)\1', webpage)
+ if mobj:
+ return mobj.group('url')
+
+ mobj = re.search(
+ r'<meta[^>]+?property=(["\'])og:video\1[^>]+?content=(["\'])(?P<url>http://player\.(?:rutv\.ru|vgtrk\.com)/flash2v/container\.swf\?id=.+?\2)',
+ webpage)
+ if mobj:
+ return mobj.group('url')
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+ video_path = mobj.group('path')
+
+ if video_path.startswith('flash2v'):
+ video_type = 'video'
+ elif video_path.startswith('iframe'):
+ video_type = mobj.group('type')
+ if video_type == 'swf':
+ video_type = 'video'
+ elif video_path.startswith('index/iframe/cast_id'):
+ video_type = 'live'
+
+ json_data = self._download_json(
+ 'http://player.rutv.ru/iframe/%splay/id/%s' % ('live-' if video_type == 'live' else '', video_id),
+ video_id, 'Downloading JSON')
+
+ if json_data['errors']:
+ raise ExtractorError('%s said: %s' % (self.IE_NAME, json_data['errors']), expected=True)
+
+ playlist = json_data['data']['playlist']
+ medialist = playlist['medialist']
+ media = medialist[0]
+
+ if media['errors']:
+ raise ExtractorError('%s said: %s' % (self.IE_NAME, media['errors']), expected=True)
+
+ view_count = playlist.get('count_views')
+ priority_transport = playlist['priority_transport']
+
+ thumbnail = media['picture']
+ width = int_or_none(media['width'])
+ height = int_or_none(media['height'])
+ description = media['anons']
+ title = media['title']
+ duration = int_or_none(media.get('duration'))
+
+ formats = []
+
+ for transport, links in media['sources'].items():
+ for quality, url in links.items():
+ if transport == 'rtmp':
+ mobj = re.search(r'^(?P<url>rtmp://[^/]+/(?P<app>.+))/(?P<playpath>.+)$', url)
+ if not mobj:
+ continue
+ fmt = {
+ 'url': mobj.group('url'),
+ 'play_path': mobj.group('playpath'),
+ 'app': mobj.group('app'),
+ 'page_url': 'http://player.rutv.ru',
+ 'player_url': 'http://player.rutv.ru/flash2v/osmf.swf?i=22',
+ 'rtmp_live': True,
+ 'ext': 'flv',
+ 'vbr': int(quality),
+ }
+ elif transport == 'm3u8':
+ fmt = {
+ 'url': url,
+ 'ext': 'mp4',
+ }
+ else:
+ fmt = {
+ 'url': url
+ }
+ fmt.update({
+ 'width': width,
+ 'height': height,
+ 'format_id': '%s-%s' % (transport, quality),
+ 'preference': -1 if priority_transport == transport else -2,
+ })
+ formats.append(fmt)
+
+ if not formats:
+ raise ExtractorError('No media links available for %s' % video_id)
+
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'view_count': view_count,
+ 'duration': duration,
+ 'formats': formats,
+ } \ No newline at end of file
diff --git a/youtube_dl/extractor/savefrom.py b/youtube_dl/extractor/savefrom.py
new file mode 100644
index 0000000..198a08c
--- /dev/null
+++ b/youtube_dl/extractor/savefrom.py
@@ -0,0 +1,37 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import os.path
+import re
+
+from .common import InfoExtractor
+
+
+class SaveFromIE(InfoExtractor):
+ IE_NAME = 'savefrom.net'
+ _VALID_URL = r'https?://[^.]+\.savefrom\.net/\#url=(?P<url>.*)$'
+
+ _TEST = {
+ 'url': 'http://en.savefrom.net/#url=http://youtube.com/watch?v=UlVRAPW2WJY&utm_source=youtube.com&utm_medium=short_domains&utm_campaign=ssyoutube.com',
+ 'info_dict': {
+ 'id': 'UlVRAPW2WJY',
+ 'ext': 'mp4',
+ 'title': 'About Team Radical MMA | MMA Fighting',
+ 'upload_date': '20120816',
+ 'uploader': 'Howcast',
+ 'uploader_id': 'Howcast',
+ 'description': 'md5:4f0aac94361a12e1ce57d74f85265175',
+ },
+ 'params': {
+ 'skip_download': True
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = os.path.splitext(url.split('/')[-1])[0]
+ return {
+ '_type': 'url',
+ 'id': video_id,
+ 'url': mobj.group('url'),
+ }
diff --git a/youtube_dl/extractor/scivee.py b/youtube_dl/extractor/scivee.py
new file mode 100644
index 0000000..55a481c
--- /dev/null
+++ b/youtube_dl/extractor/scivee.py
@@ -0,0 +1,56 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import int_or_none
+
+
+class SciVeeIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?scivee\.tv/node/(?P<id>\d+)'
+
+ _TEST = {
+ 'url': 'http://www.scivee.tv/node/62352',
+ 'md5': 'b16699b74c9e6a120f6772a44960304f',
+ 'info_dict': {
+ 'id': '62352',
+ 'ext': 'mp4',
+ 'title': 'Adam Arkin at the 2014 DOE JGI Genomics of Energy & Environment Meeting',
+ 'description': 'md5:81f1710638e11a481358fab1b11059d7',
+ },
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+
+ # annotations XML is malformed
+ annotations = self._download_webpage(
+ 'http://www.scivee.tv/assets/annotations/%s' % video_id, video_id, 'Downloading annotations')
+
+ title = self._html_search_regex(r'<title>([^<]+)</title>', annotations, 'title')
+ description = self._html_search_regex(r'<abstract>([^<]+)</abstract>', annotations, 'abstract', fatal=False)
+ filesize = int_or_none(self._html_search_regex(
+ r'<filesize>([^<]+)</filesize>', annotations, 'filesize', fatal=False))
+
+ formats = [
+ {
+ 'url': 'http://www.scivee.tv/assets/audio/%s' % video_id,
+ 'ext': 'mp3',
+ 'format_id': 'audio',
+ },
+ {
+ 'url': 'http://www.scivee.tv/assets/video/%s' % video_id,
+ 'ext': 'mp4',
+ 'format_id': 'video',
+ 'filesize': filesize,
+ },
+ ]
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': 'http://www.scivee.tv/assets/videothumb/%s' % video_id,
+ 'formats': formats,
+ } \ No newline at end of file
diff --git a/youtube_dl/extractor/slashdot.py b/youtube_dl/extractor/slashdot.py
deleted file mode 100644
index d68646d..0000000
--- a/youtube_dl/extractor/slashdot.py
+++ /dev/null
@@ -1,24 +0,0 @@
-import re
-
-from .common import InfoExtractor
-
-
-class SlashdotIE(InfoExtractor):
- _VALID_URL = r'https?://tv\.slashdot\.org/video/\?embed=(?P<id>.*?)(&|$)'
-
- _TEST = {
- u'add_ie': ['Ooyala'],
- u'url': u'http://tv.slashdot.org/video/?embed=JscHMzZDplD0p-yNLOzTfzC3Q3xzJaUz',
- u'file': u'JscHMzZDplD0p-yNLOzTfzC3Q3xzJaUz.mp4',
- u'md5': u'd2222e7a4a4c1541b3e0cf732fb26735',
- u'info_dict': {
- u'title': u' Meet the Stampede Supercomputing Cluster\'s Administrator',
- },
- }
-
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
- webpage = self._download_webpage(url, video_id)
- ooyala_url = self._search_regex(r'<script src="(.*?)"', webpage, 'ooyala url')
- return self.url_result(ooyala_url, 'Ooyala')
diff --git a/youtube_dl/extractor/slideshare.py b/youtube_dl/extractor/slideshare.py
index 9c62825..53c3c92 100644
--- a/youtube_dl/extractor/slideshare.py
+++ b/youtube_dl/extractor/slideshare.py
@@ -39,7 +39,8 @@ class SlideshareIE(InfoExtractor):
ext = info['jsplayer']['video_extension']
video_url = compat_urlparse.urljoin(bucket, doc + '-SD.' + ext)
description = self._html_search_regex(
- r'<p class="description.*?"[^>]*>(.*?)</p>', webpage, 'description')
+ r'<p\s+(?:style="[^"]*"\s+)?class="description.*?"[^>]*>(.*?)</p>', webpage,
+ 'description', fatal=False)
return {
'_type': 'video',
diff --git a/youtube_dl/extractor/slutload.py b/youtube_dl/extractor/slutload.py
new file mode 100644
index 0000000..ecc0abf
--- /dev/null
+++ b/youtube_dl/extractor/slutload.py
@@ -0,0 +1,47 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+)
+
+
+class SlutloadIE(InfoExtractor):
+ _VALID_URL = r'^https?://(?:\w+\.)?slutload\.com/video/[^/]+/(?P<id>[^/]+)/?$'
+ _TEST = {
+ 'url': 'http://www.slutload.com/video/virginie-baisee-en-cam/TD73btpBqSxc/',
+ 'md5': '0cf531ae8006b530bd9df947a6a0df77',
+ 'info_dict': {
+ 'id': 'TD73btpBqSxc',
+ 'ext': 'mp4',
+ "title": "virginie baisee en cam",
+ "age_limit": 18,
+ 'thumbnail': 're:https?://.*?\.jpg'
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+
+ webpage = self._download_webpage(url, video_id)
+
+ video_title = self._html_search_regex(r'<h1><strong>([^<]+)</strong>',
+ webpage, 'title').strip()
+
+ video_url = self._html_search_regex(
+ r'(?s)<div id="vidPlayer"\s+data-url="([^"]+)"',
+ webpage, 'video URL')
+ thumbnail = self._html_search_regex(
+ r'(?s)<div id="vidPlayer"\s+.*?previewer-file="([^"]+)"',
+ webpage, 'thumbnail', fatal=False)
+
+ return {
+ 'id': video_id,
+ 'url': video_url,
+ 'title': video_title,
+ 'thumbnail': thumbnail,
+ 'age_limit': 18
+ }
diff --git a/youtube_dl/extractor/smotri.py b/youtube_dl/extractor/smotri.py
index 540c557..13e7e71 100644
--- a/youtube_dl/extractor/smotri.py
+++ b/youtube_dl/extractor/smotri.py
@@ -13,22 +13,24 @@ from ..utils import (
compat_urllib_request,
ExtractorError,
url_basename,
+ int_or_none,
)
class SmotriIE(InfoExtractor):
IE_DESC = 'Smotri.com'
IE_NAME = 'smotri'
- _VALID_URL = r'^https?://(?:www\.)?(?P<url>smotri\.com/video/view/\?id=(?P<videoid>v(?P<realvideoid>[0-9]+)[a-z0-9]{4}))'
+ _VALID_URL = r'^https?://(?:www\.)?(?:smotri\.com/video/view/\?id=|pics\.smotri\.com/(?:player|scrubber_custom8)\.swf\?file=)(?P<videoid>v(?P<realvideoid>[0-9]+)[a-z0-9]{4})'
_NETRC_MACHINE = 'smotri'
_TESTS = [
# real video id 2610366
{
'url': 'http://smotri.com/video/view/?id=v261036632ab',
- 'file': 'v261036632ab.mp4',
'md5': '2a7b08249e6f5636557579c368040eb9',
'info_dict': {
+ 'id': 'v261036632ab',
+ 'ext': 'mp4',
'title': 'катастрофа с камер видеонаблюдения',
'uploader': 'rbc2008',
'uploader_id': 'rbc08',
@@ -40,9 +42,10 @@ class SmotriIE(InfoExtractor):
# real video id 57591
{
'url': 'http://smotri.com/video/view/?id=v57591cb20',
- 'file': 'v57591cb20.flv',
'md5': '830266dfc21f077eac5afd1883091bcd',
'info_dict': {
+ 'id': 'v57591cb20',
+ 'ext': 'flv',
'title': 'test',
'uploader': 'Support Photofile@photofile',
'uploader_id': 'support-photofile',
@@ -54,9 +57,10 @@ class SmotriIE(InfoExtractor):
# video-password
{
'url': 'http://smotri.com/video/view/?id=v1390466a13c',
- 'file': 'v1390466a13c.mp4',
'md5': 'f6331cef33cad65a0815ee482a54440b',
'info_dict': {
+ 'id': 'v1390466a13c',
+ 'ext': 'mp4',
'title': 'TOCCA_A_NOI_-_LE_COSE_NON_VANNO_CAMBIAMOLE_ORA-1',
'uploader': 'timoxa40',
'uploader_id': 'timoxa40',
@@ -71,9 +75,10 @@ class SmotriIE(InfoExtractor):
# age limit + video-password
{
'url': 'http://smotri.com/video/view/?id=v15408898bcf',
- 'file': 'v15408898bcf.flv',
'md5': '91e909c9f0521adf5ee86fbe073aad70',
'info_dict': {
+ 'id': 'v15408898bcf',
+ 'ext': 'flv',
'title': 'этот ролик не покажут по ТВ',
'uploader': 'zzxxx',
'uploader_id': 'ueggb',
@@ -85,7 +90,22 @@ class SmotriIE(InfoExtractor):
'params': {
'videopassword': '333'
}
- }
+ },
+ # swf player
+ {
+ 'url': 'http://pics.smotri.com/scrubber_custom8.swf?file=v9188090500',
+ 'md5': '4d47034979d9390d14acdf59c4935bc2',
+ 'info_dict': {
+ 'id': 'v9188090500',
+ 'ext': 'mp4',
+ 'title': 'Shakira - Don\'t Bother',
+ 'uploader': 'HannahL',
+ 'uploader_id': 'lisaha95',
+ 'upload_date': '20090331',
+ 'description': 'Shakira - Don\'t Bother, видео Shakira - Don\'t Bother',
+ 'thumbnail': 'http://frame8.loadup.ru/44/0b/918809.7.3.jpg',
+ },
+ },
]
_SUCCESS = 0
@@ -93,6 +113,21 @@ class SmotriIE(InfoExtractor):
_PASSWORD_DETECTED = 2
_VIDEO_NOT_FOUND = 3
+ @classmethod
+ def _extract_url(cls, webpage):
+ mobj = re.search(
+ r'<embed[^>]src=(["\'])(?P<url>http://pics\.smotri\.com/(?:player|scrubber_custom8)\.swf\?file=v.+?\1)',
+ webpage)
+ if mobj is not None:
+ return mobj.group('url')
+
+ mobj = re.search(
+ r'''(?x)<div\s+class="video_file">http://smotri\.com/video/download/file/[^<]+</div>\s*
+ <div\s+class="video_image">[^<]+</div>\s*
+ <div\s+class="video_id">(?P<id>[^<]+)</div>''', webpage)
+ if mobj is not None:
+ return 'http://smotri.com/video/view/?id=%s' % mobj.group('id')
+
def _search_meta(self, name, html, display_name=None):
if display_name is None:
display_name = name
@@ -134,7 +169,7 @@ class SmotriIE(InfoExtractor):
# Video JSON does not provide enough meta data
# We will extract some from the video web page instead
- video_page_url = 'http://' + mobj.group('url')
+ video_page_url = 'http://smotri.com/video/view/?id=%s' % video_id
video_page = self._download_webpage(video_page_url, video_id, 'Downloading video page')
# Warning if video is unavailable
@@ -222,7 +257,7 @@ class SmotriIE(InfoExtractor):
'upload_date': video_upload_date,
'uploader_id': video_uploader_id,
'duration': video_duration,
- 'view_count': video_view_count,
+ 'view_count': int_or_none(video_view_count),
'age_limit': 18 if adult_content else 0,
'video_page_url': video_page_url
}
diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py
index 393b5f1..25515f0 100644
--- a/youtube_dl/extractor/soundcloud.py
+++ b/youtube_dl/extractor/soundcloud.py
@@ -12,6 +12,7 @@ from ..utils import (
compat_urllib_parse,
ExtractorError,
+ int_or_none,
unified_strdate,
)
@@ -25,7 +26,7 @@ class SoundcloudIE(InfoExtractor):
of the stream token and uid
"""
- _VALID_URL = r'''^(?:https?://)?
+ _VALID_URL = r'''(?x)^(?:https?://)?
(?:(?:(?:www\.|m\.)?soundcloud\.com/
(?P<uploader>[\w\d-]+)/
(?!sets/)(?P<title>[\w\d-]+)/?
@@ -44,7 +45,8 @@ class SoundcloudIE(InfoExtractor):
"upload_date": "20121011",
"description": "No Downloads untill we record the finished version this weekend, i was too pumped n i had to post it , earl is prolly gonna b hella p.o'd",
"uploader": "E.T. ExTerrestrial Music",
- "title": "Lostin Powers - She so Heavy (SneakPreview) Adrian Ackers Blueprint 1"
+ "title": "Lostin Powers - She so Heavy (SneakPreview) Adrian Ackers Blueprint 1",
+ "duration": 143,
}
},
# not streamable song
@@ -54,8 +56,10 @@ class SoundcloudIE(InfoExtractor):
'id': '47127627',
'ext': 'mp3',
'title': 'Goldrushed',
+ 'description': 'From Stockholm Sweden\r\nPovel / Magnus / Filip / David\r\nwww.theroyalconcept.com',
'uploader': 'The Royal Concept',
'upload_date': '20120521',
+ 'duration': 227,
},
'params': {
# rtmp
@@ -73,6 +77,7 @@ class SoundcloudIE(InfoExtractor):
'uploader': 'jaimeMF',
'description': 'test chars: \"\'/\\ä↭',
'upload_date': '20131209',
+ 'duration': 9,
},
},
# downloadable song
@@ -86,6 +91,7 @@ class SoundcloudIE(InfoExtractor):
'description': 'Vocals',
'uploader': 'Sim Gretina',
'upload_date': '20130815',
+ #'duration': 42,
},
},
]
@@ -93,13 +99,9 @@ class SoundcloudIE(InfoExtractor):
_CLIENT_ID = 'b45b1aa10f1ac2941910a7f0d10f8e28'
_IPHONE_CLIENT_ID = '376f225bf427445fc4bfb6b99b72e0bf'
- @classmethod
- def suitable(cls, url):
- return re.match(cls._VALID_URL, url, flags=re.VERBOSE) is not None
-
def report_resolve(self, video_id):
"""Report information extraction."""
- self.to_screen(u'%s: Resolving id' % video_id)
+ self.to_screen('%s: Resolving id' % video_id)
@classmethod
def _resolv_url(cls, url):
@@ -122,46 +124,47 @@ class SoundcloudIE(InfoExtractor):
'title': info['title'],
'description': info['description'],
'thumbnail': thumbnail,
+ 'duration': int_or_none(info.get('duration'), 1000),
}
+ formats = []
if info.get('downloadable', False):
# We can build a direct link to the song
format_url = (
'https://api.soundcloud.com/tracks/{0}/download?client_id={1}'.format(
track_id, self._CLIENT_ID))
- result['formats'] = [{
+ formats.append({
'format_id': 'download',
'ext': info.get('original_format', 'mp3'),
'url': format_url,
'vcodec': 'none',
- }]
- else:
- # We have to retrieve the url
- streams_url = ('http://api.soundcloud.com/i1/tracks/{0}/streams?'
- 'client_id={1}&secret_token={2}'.format(track_id, self._IPHONE_CLIENT_ID, secret_token))
- stream_json = self._download_webpage(
- streams_url,
- track_id, 'Downloading track url')
-
- formats = []
- format_dict = json.loads(stream_json)
- for key, stream_url in format_dict.items():
- if key.startswith(u'http'):
- formats.append({
- 'format_id': key,
- 'ext': ext,
- 'url': stream_url,
- 'vcodec': 'none',
- })
- elif key.startswith(u'rtmp'):
- # The url doesn't have an rtmp app, we have to extract the playpath
- url, path = stream_url.split('mp3:', 1)
- formats.append({
- 'format_id': key,
- 'url': url,
- 'play_path': 'mp3:' + path,
- 'ext': ext,
- 'vcodec': 'none',
- })
+ 'preference': 10,
+ })
+
+ # We have to retrieve the url
+ streams_url = ('http://api.soundcloud.com/i1/tracks/{0}/streams?'
+ 'client_id={1}&secret_token={2}'.format(track_id, self._IPHONE_CLIENT_ID, secret_token))
+ format_dict = self._download_json(
+ streams_url,
+ track_id, 'Downloading track url')
+
+ for key, stream_url in format_dict.items():
+ if key.startswith('http'):
+ formats.append({
+ 'format_id': key,
+ 'ext': ext,
+ 'url': stream_url,
+ 'vcodec': 'none',
+ })
+ elif key.startswith('rtmp'):
+ # The url doesn't have an rtmp app, we have to extract the playpath
+ url, path = stream_url.split('mp3:', 1)
+ formats.append({
+ 'format_id': key,
+ 'url': url,
+ 'play_path': 'mp3:' + path,
+ 'ext': ext,
+ 'vcodec': 'none',
+ })
if not formats:
# We fallback to the stream_url in the original info, this
@@ -187,7 +190,7 @@ class SoundcloudIE(InfoExtractor):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url, flags=re.VERBOSE)
if mobj is None:
- raise ExtractorError(u'Invalid URL: %s' % url)
+ raise ExtractorError('Invalid URL: %s' % url)
track_id = mobj.group('track_id')
token = None
@@ -196,7 +199,7 @@ class SoundcloudIE(InfoExtractor):
full_title = track_id
elif mobj.group('player'):
query = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
- return self.url_result(query['url'][0], ie='Soundcloud')
+ return self.url_result(query['url'][0])
else:
# extract uploader (which is in the url)
uploader = mobj.group('uploader')
@@ -211,13 +214,13 @@ class SoundcloudIE(InfoExtractor):
url = 'http://soundcloud.com/%s' % resolve_title
info_json_url = self._resolv_url(url)
- info_json = self._download_webpage(info_json_url, full_title, 'Downloading info JSON')
+ info = self._download_json(info_json_url, full_title, 'Downloading info JSON')
- info = json.loads(info_json)
return self._extract_info_dict(info, full_title, secret_token=token)
+
class SoundcloudSetIE(SoundcloudIE):
- _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)(?:[?].*)?$'
+ _VALID_URL = r'https?://(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
IE_NAME = 'soundcloud:set'
# it's in tests/test_playlists.py
_TESTS = []
@@ -225,24 +228,23 @@ class SoundcloudSetIE(SoundcloudIE):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
if mobj is None:
- raise ExtractorError(u'Invalid URL: %s' % url)
+ raise ExtractorError('Invalid URL: %s' % url)
# extract uploader (which is in the url)
uploader = mobj.group(1)
# extract simple title (uploader + slug of song title)
- slug_title = mobj.group(2)
+ slug_title = mobj.group(2)
full_title = '%s/sets/%s' % (uploader, slug_title)
self.report_resolve(full_title)
url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
resolv_url = self._resolv_url(url)
- info_json = self._download_webpage(resolv_url, full_title)
+ info = self._download_json(resolv_url, full_title)
- info = json.loads(info_json)
if 'errors' in info:
for err in info['errors']:
- self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
+ self._downloader.report_error('unable to download video webpage: %s' % compat_str(err['error_message']))
return
self.report_extraction(full_title)
@@ -266,26 +268,55 @@ class SoundcloudUserIE(SoundcloudIE):
url = 'http://soundcloud.com/%s/' % uploader
resolv_url = self._resolv_url(url)
- user_json = self._download_webpage(resolv_url, uploader,
- 'Downloading user info')
- user = json.loads(user_json)
+ user = self._download_json(
+ resolv_url, uploader, 'Downloading user info')
+ base_url = 'http://api.soundcloud.com/users/%s/tracks.json?' % uploader
- tracks = []
+ entries = []
for i in itertools.count():
- data = compat_urllib_parse.urlencode({'offset': i*50,
- 'client_id': self._CLIENT_ID,
- })
- tracks_url = 'http://api.soundcloud.com/users/%s/tracks.json?' % user['id'] + data
- response = self._download_webpage(tracks_url, uploader,
- 'Downloading tracks page %s' % (i+1))
- new_tracks = json.loads(response)
- tracks.extend(self._extract_info_dict(track, quiet=True) for track in new_tracks)
- if len(new_tracks) < 50:
+ data = compat_urllib_parse.urlencode({
+ 'offset': i * 50,
+ 'client_id': self._CLIENT_ID,
+ })
+ new_entries = self._download_json(
+ base_url + data, uploader, 'Downloading track page %s' % (i + 1))
+ entries.extend(self._extract_info_dict(e, quiet=True) for e in new_entries)
+ if len(new_entries) < 50:
break
return {
'_type': 'playlist',
'id': compat_str(user['id']),
'title': user['username'],
- 'entries': tracks,
+ 'entries': entries,
+ }
+
+
+class SoundcloudPlaylistIE(SoundcloudIE):
+ _VALID_URL = r'https?://api\.soundcloud\.com/playlists/(?P<id>[0-9]+)'
+ IE_NAME = 'soundcloud:playlist'
+
+ # it's in tests/test_playlists.py
+ _TESTS = []
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ playlist_id = mobj.group('id')
+ base_url = '%s//api.soundcloud.com/playlists/%s.json?' % (self.http_scheme(), playlist_id)
+
+ data = compat_urllib_parse.urlencode({
+ 'client_id': self._CLIENT_ID,
+ })
+ data = self._download_json(
+ base_url + data, playlist_id, 'Downloading playlist')
+
+ entries = [
+ self._extract_info_dict(t, quiet=True) for t in data['tracks']]
+
+ return {
+ '_type': 'playlist',
+ 'id': playlist_id,
+ 'title': data.get('title'),
+ 'description': data.get('description'),
+ 'entries': entries,
}
diff --git a/youtube_dl/extractor/space.py b/youtube_dl/extractor/space.py
index 4a3e52a..d34aefe 100644
--- a/youtube_dl/extractor/space.py
+++ b/youtube_dl/extractor/space.py
@@ -1,3 +1,5 @@
+from __future__ import unicode_literals
+
import re
from .common import InfoExtractor
@@ -8,14 +10,14 @@ from ..utils import RegexNotFoundError, ExtractorError
class SpaceIE(InfoExtractor):
_VALID_URL = r'https?://(?:(?:www|m)\.)?space\.com/\d+-(?P<title>[^/\.\?]*?)-video\.html'
_TEST = {
- u'add_ie': ['Brightcove'],
- u'url': u'http://www.space.com/23373-huge-martian-landforms-detail-revealed-by-european-probe-video.html',
- u'info_dict': {
- u'id': u'2780937028001',
- u'ext': u'mp4',
- u'title': u'Huge Martian Landforms\' Detail Revealed By European Probe | Video',
- u'description': u'md5:db81cf7f3122f95ed234b631a6ea1e61',
- u'uploader': u'TechMedia Networks',
+ 'add_ie': ['Brightcove'],
+ 'url': 'http://www.space.com/23373-huge-martian-landforms-detail-revealed-by-european-probe-video.html',
+ 'info_dict': {
+ 'id': '2780937028001',
+ 'ext': 'mp4',
+ 'title': 'Huge Martian Landforms\' Detail Revealed By European Probe | Video',
+ 'description': 'md5:db81cf7f3122f95ed234b631a6ea1e61',
+ 'uploader': 'TechMedia Networks',
},
}
diff --git a/youtube_dl/extractor/spankwire.py b/youtube_dl/extractor/spankwire.py
index 3362b3d..2007a00 100644
--- a/youtube_dl/extractor/spankwire.py
+++ b/youtube_dl/extractor/spankwire.py
@@ -1,6 +1,5 @@
from __future__ import unicode_literals
-import os
import re
from .common import InfoExtractor
@@ -8,23 +7,27 @@ from ..utils import (
compat_urllib_parse_urlparse,
compat_urllib_request,
compat_urllib_parse,
+ unified_strdate,
+ str_to_int,
+ int_or_none,
)
-from ..aes import (
- aes_decrypt_text
-)
+from ..aes import aes_decrypt_text
class SpankwireIE(InfoExtractor):
- _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>spankwire\.com/[^/]*/video(?P<videoid>[0-9]+)/?)'
+ _VALID_URL = r'https?://(?:www\.)?(?P<url>spankwire\.com/[^/]*/video(?P<videoid>[0-9]+)/?)'
_TEST = {
'url': 'http://www.spankwire.com/Buckcherry-s-X-Rated-Music-Video-Crazy-Bitch/video103545/',
- 'file': '103545.mp4',
- 'md5': '1b3f55e345500552dbc252a3e9c1af43',
+ 'md5': '8bbfde12b101204b39e4b9fe7eb67095',
'info_dict': {
- "uploader": "oreusz",
- "title": "Buckcherry`s X Rated Music Video Crazy Bitch",
- "description": "Crazy Bitch X rated music video.",
- "age_limit": 18,
+ 'id': '103545',
+ 'ext': 'mp4',
+ 'title': 'Buckcherry`s X Rated Music Video Crazy Bitch',
+ 'description': 'Crazy Bitch X rated music video.',
+ 'uploader': 'oreusz',
+ 'uploader_id': '124697',
+ 'upload_date': '20070508',
+ 'age_limit': 18,
}
}
@@ -37,13 +40,26 @@ class SpankwireIE(InfoExtractor):
req.add_header('Cookie', 'age_verified=1')
webpage = self._download_webpage(req, video_id)
- video_title = self._html_search_regex(r'<h1>([^<]+)', webpage, 'title')
- video_uploader = self._html_search_regex(
- r'by:\s*<a [^>]*>(.+?)</a>', webpage, 'uploader', fatal=False)
- thumbnail = self._html_search_regex(
- r'flashvars\.image_url = "([^"]+)', webpage, 'thumbnail', fatal=False)
+ title = self._html_search_regex(r'<h1>([^<]+)', webpage, 'title')
description = self._html_search_regex(
r'<div\s+id="descriptionContent">([^<]+)<', webpage, 'description', fatal=False)
+ thumbnail = self._html_search_regex(
+ r'flashvars\.image_url = "([^"]+)', webpage, 'thumbnail', fatal=False)
+
+ uploader = self._html_search_regex(
+ r'by:\s*<a [^>]*>(.+?)</a>', webpage, 'uploader', fatal=False)
+ uploader_id = self._html_search_regex(
+ r'by:\s*<a href="/Profile\.aspx\?.*?UserId=(\d+).*?"', webpage, 'uploader id', fatal=False)
+ upload_date = self._html_search_regex(r'</a> on (.+?) at \d+:\d+', webpage, 'upload date', fatal=False)
+ if upload_date:
+ upload_date = unified_strdate(upload_date)
+
+ view_count = self._html_search_regex(
+ r'<div id="viewsCounter"><span>([^<]+)</span> views</div>', webpage, 'view count', fatal=False)
+ if view_count:
+ view_count = str_to_int(view_count)
+ comment_count = int_or_none(self._html_search_regex(
+ r'<span id="spCommentCount">\s*(\d+)</span> Comments</div>', webpage, 'comment count', fatal=False))
video_urls = list(map(compat_urllib_parse.unquote , re.findall(r'flashvars\.quality_[0-9]{3}p = "([^"]+)', webpage)))
if webpage.find('flashvars\.encrypted = "true"') != -1:
@@ -53,16 +69,13 @@ class SpankwireIE(InfoExtractor):
formats = []
for video_url in video_urls:
path = compat_urllib_parse_urlparse(video_url).path
- extension = os.path.splitext(path)[1][1:]
format = path.split('/')[4].split('_')[:2]
resolution, bitrate_str = format
format = "-".join(format)
- height = int(resolution.rstrip('P'))
- tbr = int(bitrate_str.rstrip('K'))
-
+ height = int(resolution.rstrip('Pp'))
+ tbr = int(bitrate_str.rstrip('Kk'))
formats.append({
'url': video_url,
- 'ext': extension,
'resolution': resolution,
'format': format,
'tbr': tbr,
@@ -75,10 +88,14 @@ class SpankwireIE(InfoExtractor):
return {
'id': video_id,
- 'uploader': video_uploader,
- 'title': video_title,
- 'thumbnail': thumbnail,
+ 'title': title,
'description': description,
+ 'thumbnail': thumbnail,
+ 'uploader': uploader,
+ 'uploader_id': uploader_id,
+ 'upload_date': upload_date,
+ 'view_count': view_count,
+ 'comment_count': comment_count,
'formats': formats,
'age_limit': age_limit,
}
diff --git a/youtube_dl/extractor/spiegeltv.py b/youtube_dl/extractor/spiegeltv.py
new file mode 100644
index 0000000..7f388ac
--- /dev/null
+++ b/youtube_dl/extractor/spiegeltv.py
@@ -0,0 +1,81 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+from .common import InfoExtractor
+
+
+class SpiegeltvIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?spiegel\.tv/filme/(?P<id>[\-a-z0-9]+)'
+ _TEST = {
+ 'url': 'http://www.spiegel.tv/filme/flug-mh370/',
+ 'info_dict': {
+ 'id': 'flug-mh370',
+ 'ext': 'm4v',
+ 'title': 'Flug MH370',
+ 'description': 'Das Rätsel um die Boeing 777 der Malaysia-Airlines',
+ 'thumbnail': 're:http://.*\.jpg$',
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+
+ webpage = self._download_webpage(url, video_id)
+ title = self._html_search_regex(r'<h1.*?>(.*?)</h1>', webpage, 'title')
+
+ apihost = 'http://spiegeltv-ivms2-restapi.s3.amazonaws.com'
+ version_json = self._download_json(
+ '%s/version.json' % apihost, video_id,
+ note='Downloading version information')
+ version_name = version_json['version_name']
+
+ slug_json = self._download_json(
+ '%s/%s/restapi/slugs/%s.json' % (apihost, version_name, video_id),
+ video_id,
+ note='Downloading object information')
+ oid = slug_json['object_id']
+
+ media_json = self._download_json(
+ '%s/%s/restapi/media/%s.json' % (apihost, version_name, oid),
+ video_id, note='Downloading media information')
+ uuid = media_json['uuid']
+ is_wide = media_json['is_wide']
+
+ server_json = self._download_json(
+ 'http://www.spiegel.tv/streaming_servers/', video_id,
+ note='Downloading server information')
+ server = server_json[0]['endpoint']
+
+ thumbnails = []
+ for image in media_json['images']:
+ thumbnails.append({
+ 'url': image['url'],
+ 'width': image['width'],
+ 'height': image['height'],
+ })
+
+ description = media_json['subtitle']
+ duration = media_json['duration_in_ms'] / 1000.
+
+ if is_wide:
+ format = '16x9'
+ else:
+ format = '4x3'
+
+ url = server + 'mp4:' + uuid + '_spiegeltv_0500_' + format + '.m4v'
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'url': url,
+ 'ext': 'm4v',
+ 'description': description,
+ 'duration': duration,
+ 'thumbnails': thumbnails
+ } \ No newline at end of file
diff --git a/youtube_dl/extractor/spike.py b/youtube_dl/extractor/spike.py
index 56682ac..a3adf54 100644
--- a/youtube_dl/extractor/spike.py
+++ b/youtube_dl/extractor/spike.py
@@ -1,10 +1,15 @@
from __future__ import unicode_literals
+import re
+
from .mtv import MTVServicesInfoExtractor
class SpikeIE(MTVServicesInfoExtractor):
- _VALID_URL = r'https?://www\.spike\.com/(video-clips|episodes)/.+'
+ _VALID_URL = r'''(?x)https?://
+ (www\.spike\.com/(video-clips|episodes)/.+|
+ m\.spike\.com/videos/video.rbml\?id=(?P<mobile_id>[^&]+))
+ '''
_TEST = {
'url': 'http://www.spike.com/video-clips/lhtu8m/auction-hunters-can-allen-ride-a-hundred-year-old-motorcycle',
'md5': '1a9265f32b0c375793d6c4ce45255256',
@@ -17,3 +22,11 @@ class SpikeIE(MTVServicesInfoExtractor):
}
_FEED_URL = 'http://www.spike.com/feeds/mrss/'
+ _MOBILE_TEMPLATE = 'http://m.spike.com/videos/video.rbml?id=%s'
+
+ def _real_extract(self, url):
+ mobj = re.search(self._VALID_URL, url)
+ mobile_id = mobj.group('mobile_id')
+ if mobile_id is not None:
+ url = 'http://www.spike.com/video-clips/%s' % mobile_id
+ return super(SpikeIE, self)._real_extract(url)
diff --git a/youtube_dl/extractor/steam.py b/youtube_dl/extractor/steam.py
index 91658f8..1d8d572 100644
--- a/youtube_dl/extractor/steam.py
+++ b/youtube_dl/extractor/steam.py
@@ -1,3 +1,5 @@
+from __future__ import unicode_literals
+
import re
from .common import InfoExtractor
@@ -8,78 +10,114 @@ from ..utils import (
class SteamIE(InfoExtractor):
- _VALID_URL = r"""http://store\.steampowered\.com/
- (agecheck/)?
- (?P<urltype>video|app)/ #If the page is only for videos or for a game
- (?P<gameID>\d+)/?
- (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
- """
+ _VALID_URL = r"""(?x)
+ https?://store\.steampowered\.com/
+ (agecheck/)?
+ (?P<urltype>video|app)/ #If the page is only for videos or for a game
+ (?P<gameID>\d+)/?
+ (?P<videoID>\d*)(?P<extra>\??) # For urltype == video we sometimes get the videoID
+ |
+ https?://(?:www\.)?steamcommunity\.com/sharedfiles/filedetails/\?id=(?P<fileID>[0-9]+)
+ """
_VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
_AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'
- _TEST = {
- u"url": u"http://store.steampowered.com/video/105600/",
- u"playlist": [
+ _TESTS = [{
+ "url": "http://store.steampowered.com/video/105600/",
+ "playlist": [
{
- u"file": u"81300.flv",
- u"md5": u"f870007cee7065d7c76b88f0a45ecc07",
- u"info_dict": {
- u"title": u"Terraria 1.1 Trailer",
- u'playlist_index': 1,
+ "md5": "f870007cee7065d7c76b88f0a45ecc07",
+ "info_dict": {
+ 'id': '81300',
+ 'ext': 'flv',
+ "title": "Terraria 1.1 Trailer",
+ 'playlist_index': 1,
}
},
{
- u"file": u"80859.flv",
- u"md5": u"61aaf31a5c5c3041afb58fb83cbb5751",
- u"info_dict": {
- u"title": u"Terraria Trailer",
- u'playlist_index': 2,
+ "md5": "61aaf31a5c5c3041afb58fb83cbb5751",
+ "info_dict": {
+ 'id': '80859',
+ 'ext': 'flv',
+ "title": "Terraria Trailer",
+ 'playlist_index': 2,
}
}
- ]
- }
-
-
- @classmethod
- def suitable(cls, url):
- """Receives a URL and returns True if suitable for this IE."""
- return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
+ ],
+ 'params': {
+ 'playlistend': 2,
+ }
+ }, {
+ 'url': 'http://steamcommunity.com/sharedfiles/filedetails/?id=242472205',
+ 'info_dict': {
+ 'id': 'WB5DvDOOvAY',
+ 'ext': 'mp4',
+ 'upload_date': '20140329',
+ 'title': 'FRONTIERS - Final Greenlight Trailer',
+ 'description': "The final trailer for the Steam Greenlight launch. Hooray, progress! Here's the official Greenlight page: http://steamcommunity.com/sharedfiles/filedetails/?id=242472205",
+ 'uploader': 'AAD Productions',
+ 'uploader_id': 'AtomicAgeDogGames',
+ }
+ }]
def _real_extract(self, url):
- m = re.match(self._VALID_URL, url, re.VERBOSE)
- gameID = m.group('gameID')
-
- videourl = self._VIDEO_PAGE_TEMPLATE % gameID
- webpage = self._download_webpage(videourl, gameID)
+ m = re.match(self._VALID_URL, url)
+ fileID = m.group('fileID')
+ if fileID:
+ videourl = url
+ playlist_id = fileID
+ else:
+ gameID = m.group('gameID')
+ playlist_id = gameID
+ videourl = self._VIDEO_PAGE_TEMPLATE % playlist_id
+ webpage = self._download_webpage(videourl, playlist_id)
if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
- videourl = self._AGECHECK_TEMPLATE % gameID
+ videourl = self._AGECHECK_TEMPLATE % playlist_id
self.report_age_confirmation()
- webpage = self._download_webpage(videourl, gameID)
+ webpage = self._download_webpage(videourl, playlist_id)
+
+ if fileID:
+ playlist_title = self._html_search_regex(
+ r'<div class="workshopItemTitle">(.+)</div>', webpage, 'title')
+ mweb = re.finditer(r'''(?x)
+ 'movie_(?P<videoID>[0-9]+)':\s*\{\s*
+ YOUTUBE_VIDEO_ID:\s*"(?P<youtube_id>[^"]+)",
+ ''', webpage)
+ videos = [{
+ '_type': 'url',
+ 'url': vid.group('youtube_id'),
+ 'ie_key': 'Youtube',
+ } for vid in mweb]
+ else:
+ playlist_title = self._html_search_regex(
+ r'<h2 class="pageheader">(.*?)</h2>', webpage, 'game title')
+
+ mweb = re.finditer(r'''(?x)
+ 'movie_(?P<videoID>[0-9]+)':\s*\{\s*
+ FILENAME:\s*"(?P<videoURL>[\w:/\.\?=]+)"
+ (,\s*MOVIE_NAME:\s*\"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},
+ ''', webpage)
+ titles = re.finditer(
+ r'<span class="title">(?P<videoName>.+?)</span>', webpage)
+ thumbs = re.finditer(
+ r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">', webpage)
+ videos = []
- self.report_extraction(gameID)
- game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
- webpage, 'game title')
+ for vid, vtitle, thumb in zip(mweb, titles, thumbs):
+ video_id = vid.group('videoID')
+ title = vtitle.group('videoName')
+ video_url = vid.group('videoURL')
+ video_thumb = thumb.group('thumbnail')
+ if not video_url:
+ raise ExtractorError('Cannot find video url for %s' % video_id)
+ videos.append({
+ 'id': video_id,
+ 'url': video_url,
+ 'ext': 'flv',
+ 'title': unescapeHTML(title),
+ 'thumbnail': video_thumb
+ })
+ if not videos:
+ raise ExtractorError('Could not find any videos')
- urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
- mweb = re.finditer(urlRE, webpage)
- namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
- titles = re.finditer(namesRE, webpage)
- thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
- thumbs = re.finditer(thumbsRE, webpage)
- videos = []
- for vid,vtitle,thumb in zip(mweb,titles,thumbs):
- video_id = vid.group('videoID')
- title = vtitle.group('videoName')
- video_url = vid.group('videoURL')
- video_thumb = thumb.group('thumbnail')
- if not video_url:
- raise ExtractorError(u'Cannot find video url for %s' % video_id)
- info = {
- 'id':video_id,
- 'url':video_url,
- 'ext': 'flv',
- 'title': unescapeHTML(title),
- 'thumbnail': video_thumb
- }
- videos.append(info)
- return [self.playlist_result(videos, gameID, game_title)]
+ return self.playlist_result(videos, playlist_id, playlist_title)
diff --git a/youtube_dl/extractor/streamcz.py b/youtube_dl/extractor/streamcz.py
index 7362904..73efe95 100644
--- a/youtube_dl/extractor/streamcz.py
+++ b/youtube_dl/extractor/streamcz.py
@@ -5,13 +5,16 @@ import re
import json
from .common import InfoExtractor
-from ..utils import int_or_none
+from ..utils import (
+ int_or_none,
+ compat_str,
+)
class StreamCZIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?stream\.cz/.+/(?P<videoid>.+)'
- _TEST = {
+ _TESTS = [{
'url': 'http://www.stream.cz/peklonataliri/765767-ecka-pro-deti',
'md5': '6d3ca61a8d0633c9c542b92fcb936b0c',
'info_dict': {
@@ -22,7 +25,18 @@ class StreamCZIE(InfoExtractor):
'thumbnail': 'http://im.stream.cz/episode/52961d7e19d423f8f06f0100',
'duration': 256,
},
- }
+ }, {
+ 'url': 'http://www.stream.cz/blanik/10002447-tri-roky-pro-mazanka',
+ 'md5': '246272e753e26bbace7fcd9deca0650c',
+ 'info_dict': {
+ 'id': '10002447',
+ 'ext': 'mp4',
+ 'title': 'Kancelář Blaník: Tři roky pro Mazánka',
+ 'description': 'md5:9177695a8b756a0a8ab160de4043b392',
+ 'thumbnail': 'http://im.stream.cz/episode/537f838c50c11f8d21320000',
+ 'duration': 368,
+ },
+ }]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
@@ -57,7 +71,7 @@ class StreamCZIE(InfoExtractor):
self._sort_formats(formats)
return {
- 'id': str(jsonData['id']),
+ 'id': compat_str(jsonData['episode_id']),
'title': self._og_search_title(webpage),
'thumbnail': jsonData['episode_image_original_url'].replace('//', 'http://'),
'formats': formats,
diff --git a/youtube_dl/extractor/swrmediathek.py b/youtube_dl/extractor/swrmediathek.py
new file mode 100644
index 0000000..6c688c5
--- /dev/null
+++ b/youtube_dl/extractor/swrmediathek.py
@@ -0,0 +1,104 @@
+# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import parse_duration
+
+
+class SWRMediathekIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?swrmediathek\.de/player\.htm\?show=(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'
+
+ _TESTS = [{
+ 'url': 'http://swrmediathek.de/player.htm?show=849790d0-dab8-11e3-a953-0026b975f2e6',
+ 'md5': '8c5f6f0172753368547ca8413a7768ac',
+ 'info_dict': {
+ 'id': '849790d0-dab8-11e3-a953-0026b975f2e6',
+ 'ext': 'mp4',
+ 'title': 'SWR odysso',
+ 'description': 'md5:2012e31baad36162e97ce9eb3f157b8a',
+ 'thumbnail': 're:^http:.*\.jpg$',
+ 'duration': 2602,
+ 'upload_date': '20140515',
+ 'uploader': 'SWR Fernsehen',
+ 'uploader_id': '990030',
+ },
+ }, {
+ 'url': 'http://swrmediathek.de/player.htm?show=0e1a8510-ddf2-11e3-9be3-0026b975f2e6',
+ 'md5': 'b10ab854f912eecc5a6b55cd6fc1f545',
+ 'info_dict': {
+ 'id': '0e1a8510-ddf2-11e3-9be3-0026b975f2e6',
+ 'ext': 'mp4',
+ 'title': 'Nachtcafé - Alltagsdroge Alkohol - zwischen Sektempfang und Komasaufen',
+ 'description': 'md5:e0a3adc17e47db2c23aab9ebc36dbee2',
+ 'thumbnail': 're:http://.*\.jpg',
+ 'duration': 5305,
+ 'upload_date': '20140516',
+ 'uploader': 'SWR Fernsehen',
+ 'uploader_id': '990030',
+ },
+ }, {
+ 'url': 'http://swrmediathek.de/player.htm?show=bba23e10-cb93-11e3-bf7f-0026b975f2e6',
+ 'md5': '4382e4ef2c9d7ce6852535fa867a0dd3',
+ 'info_dict': {
+ 'id': 'bba23e10-cb93-11e3-bf7f-0026b975f2e6',
+ 'ext': 'mp3',
+ 'title': 'Saša Stanišic: Vor dem Fest',
+ 'description': 'md5:5b792387dc3fbb171eb709060654e8c9',
+ 'thumbnail': 're:http://.*\.jpg',
+ 'duration': 3366,
+ 'upload_date': '20140520',
+ 'uploader': 'SWR 2',
+ 'uploader_id': '284670',
+ }
+ }]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+
+ video = self._download_json(
+ 'http://swrmediathek.de/AjaxEntry?ekey=%s' % video_id, video_id, 'Downloading video JSON')
+
+ attr = video['attr']
+ media_type = attr['entry_etype']
+
+ formats = []
+ for entry in video['sub']:
+ if entry['name'] != 'entry_media':
+ continue
+
+ entry_attr = entry['attr']
+ codec = entry_attr['val0']
+ quality = int(entry_attr['val1'])
+
+ fmt = {
+ 'url': entry_attr['val2'],
+ 'quality': quality,
+ }
+
+ if media_type == 'Video':
+ fmt.update({
+ 'format_note': ['144p', '288p', '544p'][quality-1],
+ 'vcodec': codec,
+ })
+ elif media_type == 'Audio':
+ fmt.update({
+ 'acodec': codec,
+ })
+ formats.append(fmt)
+
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': attr['entry_title'],
+ 'description': attr['entry_descl'],
+ 'thumbnail': attr['entry_image_16_9'],
+ 'duration': parse_duration(attr['entry_durat']),
+ 'upload_date': attr['entry_pdatet'][:-4],
+ 'uploader': attr['channel_title'],
+ 'uploader_id': attr['channel_idkey'],
+ 'formats': formats,
+ } \ No newline at end of file
diff --git a/youtube_dl/extractor/syfy.py b/youtube_dl/extractor/syfy.py
index 8809a57..f76b6e2 100644
--- a/youtube_dl/extractor/syfy.py
+++ b/youtube_dl/extractor/syfy.py
@@ -6,9 +6,9 @@ from .common import InfoExtractor
class SyfyIE(InfoExtractor):
- _VALID_URL = r'https?://www\.syfy\.com/videos/.+?vid:(?P<id>\d+)'
+ _VALID_URL = r'https?://www\.syfy\.com/(?:videos/.+?vid:(?P<id>[0-9]+)|(?!videos)(?P<video_name>[^/]+)(?:$|[?#]))'
- _TEST = {
+ _TESTS = [{
'url': 'http://www.syfy.com/videos/Robot%20Combat%20League/Behind%20the%20Scenes/vid:2631458',
'md5': 'e07de1d52c7278adbb9b9b1c93a66849',
'info_dict': {
@@ -18,10 +18,30 @@ class SyfyIE(InfoExtractor):
'description': 'Listen to what insights George Lucas give his daughter Amanda.',
},
'add_ie': ['ThePlatform'],
- }
+ }, {
+ 'url': 'http://www.syfy.com/wilwheaton',
+ 'md5': '94dfa54ee3ccb63295b276da08c415f6',
+ 'info_dict': {
+ 'id': '4yoffOOXC767',
+ 'ext': 'flv',
+ 'title': 'The Wil Wheaton Project - Premiering May 27th at 10/9c.',
+ 'description': 'The Wil Wheaton Project premieres May 27th at 10/9c. Don\'t miss it.',
+ },
+ 'add_ie': ['ThePlatform'],
+ 'skip': 'Blocked outside the US',
+ }]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_name = mobj.group('video_name')
+ if video_name:
+ generic_webpage = self._download_webpage(url, video_name)
+ video_id = self._search_regex(
+ r'<iframe.*?class="video_iframe_page"\s+src="/_utils/video/thP_video_controller.php.*?_vid([0-9]+)">',
+ generic_webpage, 'video ID')
+ url = 'http://www.syfy.com/videos/%s/%s/vid:%s' % (
+ video_name, video_name, video_id)
+ else:
+ video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
return self.url_result(self._og_search_video_url(webpage))
diff --git a/youtube_dl/extractor/tagesschau.py b/youtube_dl/extractor/tagesschau.py
new file mode 100644
index 0000000..3633152
--- /dev/null
+++ b/youtube_dl/extractor/tagesschau.py
@@ -0,0 +1,79 @@
+# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class TagesschauIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/multimedia/video/video(?P<id>-?[0-9]+)\.html'
+
+ _TESTS = [{
+ 'url': 'http://www.tagesschau.de/multimedia/video/video1399128.html',
+ 'md5': 'bcdeac2194fb296d599ce7929dfa4009',
+ 'info_dict': {
+ 'id': '1399128',
+ 'ext': 'mp4',
+ 'title': 'Harald Range, Generalbundesanwalt, zu den Ermittlungen',
+ 'description': 'md5:69da3c61275b426426d711bde96463ab',
+ 'thumbnail': 're:^http:.*\.jpg$',
+ },
+ }, {
+ 'url': 'http://www.tagesschau.de/multimedia/video/video-196.html',
+ 'md5': '8aaa8bf3ae1ca2652309718c03019128',
+ 'info_dict': {
+ 'id': '196',
+ 'ext': 'mp4',
+ 'title': 'Ukraine-Konflikt: Klitschko in Kiew als Bürgermeister vereidigt',
+ 'description': 'md5:f22e4af75821d174fa6c977349682691',
+ 'thumbnail': 're:http://.*\.jpg',
+ },
+ }]
+
+ _FORMATS = {
+ 's': {'width': 256, 'height': 144, 'quality': 1},
+ 'm': {'width': 512, 'height': 288, 'quality': 2},
+ 'l': {'width': 960, 'height': 544, 'quality': 3},
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+
+ if video_id.startswith('-'):
+ display_id = video_id.strip('-')
+ else:
+ display_id = video_id
+
+ webpage = self._download_webpage(url, display_id)
+
+ playerpage = self._download_webpage(
+ 'http://www.tagesschau.de/multimedia/video/video%s~player_autoplay-true.html' % video_id,
+ display_id, 'Downloading player page')
+
+ medias = re.findall(
+ r'"(http://media.+?)", type:"video/(.+?)", quality:"(.+?)"',
+ playerpage)
+
+ formats = []
+ for url, ext, res in medias:
+ f = {
+ 'format_id': res + '_' + ext,
+ 'url': url,
+ 'ext': ext,
+ }
+ f.update(self._FORMATS.get(res, {}))
+ formats.append(f)
+
+ self._sort_formats(formats)
+
+ thumbnail = re.findall(r'"(/multimedia/.+?\.jpg)"', playerpage)[-1]
+
+ return {
+ 'id': display_id,
+ 'title': self._og_search_title(webpage).strip(),
+ 'thumbnail': 'http://www.tagesschau.de' + thumbnail,
+ 'formats': formats,
+ 'description': self._og_search_description(webpage).strip(),
+ }
diff --git a/youtube_dl/extractor/teachertube.py b/youtube_dl/extractor/teachertube.py
new file mode 100644
index 0000000..6d52763
--- /dev/null
+++ b/youtube_dl/extractor/teachertube.py
@@ -0,0 +1,93 @@
+# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ qualities,
+ determine_ext,
+)
+
+
+class TeacherTubeIE(InfoExtractor):
+ IE_NAME = 'teachertube'
+ IE_DESC = 'teachertube.com videos'
+
+ _VALID_URL = r'https?://(?:www\.)?teachertube\.com/(viewVideo\.php\?video_id=|music\.php\?music_id=)(?P<id>\d+)'
+
+ _TESTS = [{
+ 'url': 'http://www.teachertube.com/viewVideo.php?video_id=339997',
+ 'md5': 'f9434ef992fd65936d72999951ee254c',
+ 'info_dict': {
+ 'id': '339997',
+ 'ext': 'mp4',
+ 'title': 'Measures of dispersion from a frequency table_x264',
+ 'description': 'md5:a3e9853487185e9fcd7181a07164650b',
+ 'thumbnail': 're:http://.*\.jpg',
+ },
+ }, {
+ 'url': 'http://www.teachertube.com/viewVideo.php?video_id=340064',
+ 'md5': '0d625ec6bc9bf50f70170942ad580676',
+ 'info_dict': {
+ 'id': '340064',
+ 'ext': 'mp4',
+ 'title': 'How to Make Paper Dolls _ Paper Art Projects',
+ 'description': 'md5:2ca52b20cd727773d1dc418b3d6bd07b',
+ 'thumbnail': 're:http://.*\.jpg',
+ },
+ }, {
+ 'url': 'http://www.teachertube.com/music.php?music_id=8805',
+ 'md5': '01e8352006c65757caf7b961f6050e21',
+ 'info_dict': {
+ 'id': '8805',
+ 'ext': 'mp3',
+ 'title': 'PER ASPERA AD ASTRA',
+ 'description': 'RADIJSKA EMISIJA ZRAKOPLOVNE TEHNIČKE ŠKOLE PER ASPERA AD ASTRA',
+ },
+ }]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+
+ webpage = self._download_webpage(url, video_id)
+
+ quality = qualities(['mp3', 'flv', 'mp4'])
+
+ formats = [
+ {
+ 'url': media_url,
+ 'quality': quality(determine_ext(media_url))
+ } for media_url in set(zip(*re.findall(r'([\'"])file\1\s*:\s*"([^"]+)"', webpage))[1])
+ ]
+
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': self._og_search_title(webpage),
+ 'thumbnail': self._og_search_thumbnail(webpage),
+ 'formats': formats,
+ 'description': self._og_search_description(webpage),
+ }
+
+
+class TeacherTubeClassroomIE(InfoExtractor):
+ IE_NAME = 'teachertube:classroom'
+ IE_DESC = 'teachertube.com online classrooms'
+
+ _VALID_URL = r'https?://(?:www\.)?teachertube\.com/view_classroom\.php\?user=(?P<user>[0-9a-zA-Z]+)'
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ user_id = mobj.group('user')
+
+ rss = self._download_xml('http://www.teachertube.com/rssclassroom.php?mode=user&username=%s' % user_id,
+ user_id, 'Downloading classroom RSS')
+
+ entries = []
+ for url in rss.findall('.//{http://search.yahoo.com/mrss/}player'):
+ entries.append(self.url_result(url.attrib['url'], 'TeacherTube'))
+
+ return self.playlist_result(entries, user_id)
diff --git a/youtube_dl/extractor/teachingchannel.py b/youtube_dl/extractor/teachingchannel.py
new file mode 100644
index 0000000..117afa9
--- /dev/null
+++ b/youtube_dl/extractor/teachingchannel.py
@@ -0,0 +1,33 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from .ooyala import OoyalaIE
+
+
+class TeachingChannelIE(InfoExtractor):
+ _VALID_URL = r'https?://www\.teachingchannel\.org/videos/(?P<title>.+)'
+
+ _TEST = {
+ 'url': 'https://www.teachingchannel.org/videos/teacher-teaming-evolution',
+ 'info_dict': {
+ 'id': 'F3bnlzbToeI6pLEfRyrlfooIILUjz4nM',
+ 'ext': 'mp4',
+ 'title': 'A History of Teaming',
+ 'description': 'md5:2a9033db8da81f2edffa4c99888140b3',
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ title = mobj.group('title')
+ webpage = self._download_webpage(url, title)
+ ooyala_code = self._search_regex(
+ r'data-embed-code=\'(.+?)\'', webpage, 'ooyala code')
+
+ return OoyalaIE._build_url_result(ooyala_code)
diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py
index 9dcffea..f8dd7e9 100644
--- a/youtube_dl/extractor/teamcoco.py
+++ b/youtube_dl/extractor/teamcoco.py
@@ -3,14 +3,21 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import (
- ExtractorError,
-)
class TeamcocoIE(InfoExtractor):
- _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'
- _TEST = {
+ _VALID_URL = r'http://teamcoco\.com/video/(?P<video_id>[0-9]+)?/?(?P<display_id>.*)'
+ _TESTS = [
+ {
+ 'url': 'http://teamcoco.com/video/80187/conan-becomes-a-mary-kay-beauty-consultant',
+ 'file': '80187.mp4',
+ 'md5': '3f7746aa0dc86de18df7539903d399ea',
+ 'info_dict': {
+ 'title': 'Conan Becomes A Mary Kay Beauty Consultant',
+ 'description': 'Mary Kay is perhaps the most trusted name in female beauty, so of course Conan is a natural choice to sell their products.'
+ }
+ },
+ {
'url': 'http://teamcoco.com/video/louis-ck-interview-george-w-bush',
'file': '19705.mp4',
'md5': 'cde9ba0fa3506f5f017ce11ead928f9a',
@@ -19,22 +26,23 @@ class TeamcocoIE(InfoExtractor):
"title": "Louis C.K. Interview Pt. 1 11/3/11"
}
}
+ ]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- if mobj is None:
- raise ExtractorError('Invalid URL: %s' % url)
- url_title = mobj.group('url_title')
- webpage = self._download_webpage(url, url_title)
-
- video_id = self._html_search_regex(
- r'<article class="video" data-id="(\d+?)"',
- webpage, 'video id')
- self.report_extraction(video_id)
+ display_id = mobj.group('display_id')
+ webpage = self._download_webpage(url, display_id)
+
+ video_id = mobj.group("video_id")
+ if not video_id:
+ video_id = self._html_search_regex(
+ r'<article class="video" data-id="(\d+?)"',
+ webpage, 'video id')
data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
- data = self._download_xml(data_url, video_id, 'Downloading data webpage')
+ data = self._download_xml(
+ data_url, display_id, 'Downloading data webpage')
qualities = ['500k', '480p', '1000k', '720p', '1080p']
formats = []
@@ -69,6 +77,7 @@ class TeamcocoIE(InfoExtractor):
return {
'id': video_id,
+ 'display_id': display_id,
'formats': formats,
'title': self._og_search_title(webpage),
'thumbnail': self._og_search_thumbnail(webpage),
diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py
index 8b31caa..d260c91 100644
--- a/youtube_dl/extractor/ted.py
+++ b/youtube_dl/extractor/ted.py
@@ -6,115 +6,185 @@ import re
from .subtitles import SubtitlesInfoExtractor
from ..utils import (
- RegexNotFoundError,
+ compat_str,
)
class TEDIE(SubtitlesInfoExtractor):
- _VALID_URL=r'''http://www\.ted\.com/
- (
- ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
- |
- ((?P<type_talk>talks)) # We have a simple talk
- )
- (/lang/(.*?))? # The url may contain the language
- /(?P<name>\w+) # Here goes the name and then ".html"
- '''
- _TEST = {
+ _VALID_URL = r'''(?x)
+ (?P<proto>https?://)
+ (?P<type>www|embed)(?P<urlmain>\.ted\.com/
+ (
+ (?P<type_playlist>playlists(?:/\d+)?) # We have a playlist
+ |
+ ((?P<type_talk>talks)) # We have a simple talk
+ |
+ (?P<type_watch>watch)/[^/]+/[^/]+
+ )
+ (/lang/(.*?))? # The url may contain the language
+ /(?P<name>[\w-]+) # Here goes the name and then ".html"
+ .*)$
+ '''
+ _TESTS = [{
'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
- 'file': '102.mp4',
'md5': '4ea1dada91e4174b53dac2bb8ace429d',
'info_dict': {
- "description": "md5:c6fa72e6eedbd938c9caf6b2702f5922",
- "title": "Dan Dennett: The illusion of consciousness"
+ 'id': '102',
+ 'ext': 'mp4',
+ 'title': 'The illusion of consciousness',
+ 'description': ('Philosopher Dan Dennett makes a compelling '
+ 'argument that not only don\'t we understand our own '
+ 'consciousness, but that half the time our brains are '
+ 'actively fooling us.'),
+ 'uploader': 'Dan Dennett',
+ 'width': 854,
}
+ }, {
+ 'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms',
+ 'md5': '226f4fb9c62380d11b7995efa4c87994',
+ 'info_dict': {
+ 'id': 'vishal-sikka-the-beauty-and-power-of-algorithms',
+ 'ext': 'mp4',
+ 'title': 'Vishal Sikka: The beauty and power of algorithms',
+ 'thumbnail': 're:^https?://.+\.jpg',
+ 'description': 'Adaptive, intelligent, and consistent, algorithms are emerging as the ultimate app for everything from matching consumers to products to assessing medical diagnoses. Vishal Sikka shares his appreciation for the algorithm, charting both its inherent beauty and its growing power.',
+ }
+ }, {
+ 'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best',
+ 'md5': '49144e345a899b8cb34d315f3b9cfeeb',
+ 'info_dict': {
+ 'id': '1972',
+ 'ext': 'mp4',
+ 'title': 'Be passionate. Be courageous. Be your best.',
+ 'uploader': 'Gabby Giffords and Mark Kelly',
+ 'description': 'md5:5174aed4d0f16021b704120360f72b92',
+ },
+ }]
+
+ _NATIVE_FORMATS = {
+ 'low': {'preference': 1, 'width': 320, 'height': 180},
+ 'medium': {'preference': 2, 'width': 512, 'height': 288},
+ 'high': {'preference': 3, 'width': 854, 'height': 480},
}
- @classmethod
- def suitable(cls, url):
- """Receives a URL and returns True if suitable for this IE."""
- return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
+ def _extract_info(self, webpage):
+ info_json = self._search_regex(r'q\("\w+.init",({.+})\)</script>',
+ webpage, 'info json')
+ return json.loads(info_json)
def _real_extract(self, url):
- m=re.match(self._VALID_URL, url, re.VERBOSE)
+ m = re.match(self._VALID_URL, url, re.VERBOSE)
+ if m.group('type') == 'embed':
+ desktop_url = m.group('proto') + 'www' + m.group('urlmain')
+ return self.url_result(desktop_url, 'TED')
+ name = m.group('name')
if m.group('type_talk'):
- return self._talk_info(url)
- else :
- playlist_id=m.group('playlist_id')
- name=m.group('name')
- self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
- return [self._playlist_videos_info(url,name,playlist_id)]
-
+ return self._talk_info(url, name)
+ elif m.group('type_watch'):
+ return self._watch_info(url, name)
+ else:
+ return self._playlist_videos_info(url, name)
- def _playlist_videos_info(self, url, name, playlist_id):
+ def _playlist_videos_info(self, url, name):
'''Returns the videos of the playlist'''
- webpage = self._download_webpage(
- url, playlist_id, 'Downloading playlist webpage')
- matches = re.finditer(
- r'<p\s+class="talk-title[^"]*"><a\s+href="(?P<talk_url>/talks/[^"]+\.html)">[^<]*</a></p>',
- webpage)
-
- playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
- webpage, 'playlist title')
+ webpage = self._download_webpage(url, name,
+ 'Downloading playlist webpage')
+ info = self._extract_info(webpage)
+ playlist_info = info['playlist']
playlist_entries = [
- self.url_result(u'http://www.ted.com' + m.group('talk_url'), 'TED')
- for m in matches
+ self.url_result('http://www.ted.com/talks/' + talk['slug'], self.ie_key())
+ for talk in info['talks']
]
return self.playlist_result(
- playlist_entries, playlist_id=playlist_id, playlist_title=playlist_title)
+ playlist_entries,
+ playlist_id=compat_str(playlist_info['id']),
+ playlist_title=playlist_info['title'])
- def _talk_info(self, url, video_id=0):
- """Return the video for the talk in the url"""
- m = re.match(self._VALID_URL, url,re.VERBOSE)
- video_name = m.group('name')
- webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
+ def _talk_info(self, url, video_name):
+ webpage = self._download_webpage(url, video_name)
self.report_extraction(video_name)
- # If the url includes the language we get the title translated
- title = self._html_search_regex(r'<span .*?id="altHeadline".+?>(?P<title>.*)</span>',
- webpage, 'title')
- json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
- webpage, 'json data')
- info = json.loads(json_data)
- desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
- webpage, 'description', flags = re.DOTALL)
-
- thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
- webpage, 'thumbnail')
- formats = [{
- 'ext': 'mp4',
- 'url': stream['file'],
- 'format': stream['id']
- } for stream in info['htmlStreams']]
- video_id = info['id']
+ talk_info = self._extract_info(webpage)['talks'][0]
+ formats = [{
+ 'url': format_url,
+ 'format_id': format_id,
+ 'format': format_id,
+ } for (format_id, format_url) in talk_info['nativeDownloads'].items() if format_url is not None]
+ if formats:
+ for f in formats:
+ finfo = self._NATIVE_FORMATS.get(f['format_id'])
+ if finfo:
+ f.update(finfo)
+ else:
+ # Use rtmp downloads
+ formats = [{
+ 'format_id': f['name'],
+ 'url': talk_info['streamer'],
+ 'play_path': f['file'],
+ 'ext': 'flv',
+ 'width': f['width'],
+ 'height': f['height'],
+ 'tbr': f['bitrate'],
+ } for f in talk_info['resources']['rtmp']]
+ self._sort_formats(formats)
+
+ video_id = compat_str(talk_info['id'])
# subtitles
- video_subtitles = self.extract_subtitles(video_id, webpage)
+ video_subtitles = self.extract_subtitles(video_id, talk_info)
if self._downloader.params.get('listsubtitles', False):
- self._list_available_subtitles(video_id, webpage)
+ self._list_available_subtitles(video_id, talk_info)
return
+ thumbnail = talk_info['thumb']
+ if not thumbnail.startswith('http'):
+ thumbnail = 'http://' + thumbnail
return {
'id': video_id,
- 'title': title,
+ 'title': talk_info['title'],
+ 'uploader': talk_info['speaker'],
'thumbnail': thumbnail,
- 'description': desc,
+ 'description': self._og_search_description(webpage),
'subtitles': video_subtitles,
'formats': formats,
}
- def _get_available_subtitles(self, video_id, webpage):
- try:
- options = self._search_regex(r'(?:<select name="subtitles_language_select" id="subtitles_language_select">)(.*?)(?:</select>)', webpage, 'subtitles_language_select', flags=re.DOTALL)
- languages = re.findall(r'(?:<option value=")(\S+)"', options)
- if languages:
- sub_lang_list = {}
- for l in languages:
- url = 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/srt' % (video_id, l)
- sub_lang_list[l] = url
- return sub_lang_list
- except RegexNotFoundError:
- self._downloader.report_warning(u'video doesn\'t have subtitles')
- return {}
+ def _get_available_subtitles(self, video_id, talk_info):
+ languages = [lang['languageCode'] for lang in talk_info.get('languages', [])]
+ if languages:
+ sub_lang_list = {}
+ for l in languages:
+ url = 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/srt' % (video_id, l)
+ sub_lang_list[l] = url
+ return sub_lang_list
+ else:
+ self._downloader.report_warning('video doesn\'t have subtitles')
+ return {}
+
+ def _watch_info(self, url, name):
+ webpage = self._download_webpage(url, name)
+
+ config_json = self._html_search_regex(
+ r"data-config='([^']+)", webpage, 'config')
+ config = json.loads(config_json)
+ video_url = config['video']['url']
+ thumbnail = config.get('image', {}).get('url')
+
+ title = self._html_search_regex(
+ r"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", webpage, 'title')
+ description = self._html_search_regex(
+ [
+ r'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>',
+ r'(?s)<p><strong>About this talk:</strong>\s+(.*?)</p>',
+ ],
+ webpage, 'description', fatal=False)
+
+ return {
+ 'id': name,
+ 'url': video_url,
+ 'title': title,
+ 'thumbnail': thumbnail,
+ 'description': description,
+ }
diff --git a/youtube_dl/extractor/testurl.py b/youtube_dl/extractor/testurl.py
new file mode 100644
index 0000000..c7d5593
--- /dev/null
+++ b/youtube_dl/extractor/testurl.py
@@ -0,0 +1,68 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import ExtractorError
+
+
+class TestURLIE(InfoExtractor):
+ """ Allows adressing of the test cases as test:yout.*be_1 """
+
+ IE_DESC = False # Do not list
+ _VALID_URL = r'test(?:url)?:(?P<id>(?P<extractor>.+?)(?:_(?P<num>[0-9]+))?)$'
+
+ def _real_extract(self, url):
+ from ..extractor import gen_extractors
+
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+ extractor_id = mobj.group('extractor')
+ all_extractors = gen_extractors()
+
+ rex = re.compile(extractor_id, flags=re.IGNORECASE)
+ matching_extractors = [
+ e for e in all_extractors if rex.search(e.IE_NAME)]
+
+ if len(matching_extractors) == 0:
+ raise ExtractorError(
+ 'No extractors matching %r found' % extractor_id,
+ expected=True)
+ elif len(matching_extractors) > 1:
+ # Is it obvious which one to pick?
+ try:
+ extractor = next(
+ ie for ie in matching_extractors
+ if ie.IE_NAME.lower() == extractor_id.lower())
+ except StopIteration:
+ raise ExtractorError(
+ ('Found multiple matching extractors: %s' %
+ ' '.join(ie.IE_NAME for ie in matching_extractors)),
+ expected=True)
+ else:
+ extractor = matching_extractors[0]
+
+ num_str = mobj.group('num')
+ num = int(num_str) if num_str else 0
+
+ testcases = []
+ t = getattr(extractor, '_TEST', None)
+ if t:
+ testcases.append(t)
+ testcases.extend(getattr(extractor, '_TESTS', []))
+
+ try:
+ tc = testcases[num]
+ except IndexError:
+ raise ExtractorError(
+ ('Test case %d not found, got only %d tests' %
+ (num, len(testcases))),
+ expected=True)
+
+ self.to_screen('Test URL: %s' % tc['url'])
+
+ return {
+ '_type': 'url',
+ 'url': tc['url'],
+ 'id': video_id,
+ }
diff --git a/youtube_dl/extractor/tf1.py b/youtube_dl/extractor/tf1.py
index 2c5c88b..fdae17b 100644
--- a/youtube_dl/extractor/tf1.py
+++ b/youtube_dl/extractor/tf1.py
@@ -1,33 +1,37 @@
# coding: utf-8
+from __future__ import unicode_literals
-import json
import re
from .common import InfoExtractor
+
class TF1IE(InfoExtractor):
"""TF1 uses the wat.tv player."""
- _VALID_URL = r'http://videos\.tf1\.fr/.*-(.*?)\.html'
+ _VALID_URL = r'http://videos\.tf1\.fr/.*-(?P<id>.*?)\.html'
_TEST = {
- u'url': u'http://videos.tf1.fr/auto-moto/citroen-grand-c4-picasso-2013-presentation-officielle-8062060.html',
- u'file': u'10635995.mp4',
- u'md5': u'2e378cc28b9957607d5e88f274e637d8',
- u'info_dict': {
- u'title': u'Citroën Grand C4 Picasso 2013 : présentation officielle',
- u'description': u'Vidéo officielle du nouveau Citroën Grand C4 Picasso, lancé à l\'automne 2013.',
+ 'url': 'http://videos.tf1.fr/auto-moto/citroen-grand-c4-picasso-2013-presentation-officielle-8062060.html',
+ 'info_dict': {
+ 'id': '10635995',
+ 'ext': 'mp4',
+ 'title': 'Citroën Grand C4 Picasso 2013 : présentation officielle',
+ 'description': 'Vidéo officielle du nouveau Citroën Grand C4 Picasso, lancé à l\'automne 2013.',
+ },
+ 'params': {
+ # Sometimes wat serves the whole file with the --test option
+ 'skip_download': True,
},
- u'skip': u'Sometimes wat serves the whole file with the --test option',
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- id = mobj.group(1)
- webpage = self._download_webpage(url, id)
- embed_url = self._html_search_regex(r'"(https://www.wat.tv/embedframe/.*?)"',
- webpage, 'embed url')
- embed_page = self._download_webpage(embed_url, id, u'Downloading embed player page')
+ video_id = mobj.group('id')
+ webpage = self._download_webpage(url, video_id)
+ embed_url = self._html_search_regex(
+ r'"(https://www.wat.tv/embedframe/.*?)"', webpage, 'embed url')
+ embed_page = self._download_webpage(embed_url, video_id,
+ 'Downloading embed player page')
wat_id = self._search_regex(r'UVID=(.*?)&', embed_page, 'wat id')
- wat_info = self._download_webpage('http://www.wat.tv/interface/contentv3/%s' % wat_id, id, u'Downloading Wat info')
- wat_info = json.loads(wat_info)['media']
- wat_url = wat_info['url']
- return self.url_result(wat_url, 'Wat')
+ wat_info = self._download_json(
+ 'http://www.wat.tv/interface/contentv3/%s' % wat_id, video_id)
+ return self.url_result(wat_info['media']['url'], 'Wat')
diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py
index d607023..b6b2dba 100644
--- a/youtube_dl/extractor/theplatform.py
+++ b/youtube_dl/extractor/theplatform.py
@@ -1,3 +1,5 @@
+from __future__ import unicode_literals
+
import re
import json
@@ -13,22 +15,22 @@ _x = lambda p: xpath_with_ns(p, {'smil': 'http://www.w3.org/2005/SMIL21/Language
class ThePlatformIE(InfoExtractor):
_VALID_URL = r'''(?x)
(?:https?://(?:link|player)\.theplatform\.com/[sp]/[^/]+/
- (?P<config>[^/\?]+/(?:swf|config)/select/)?
+ (?P<config>(?:[^/\?]+/(?:swf|config)|onsite)/select/)?
|theplatform:)(?P<id>[^/\?&]+)'''
_TEST = {
# from http://www.metacafe.com/watch/cb-e9I_cZgTgIPd/blackberrys_big_bold_z30/
- u'url': u'http://link.theplatform.com/s/dJ5BDC/e9I_cZgTgIPd/meta.smil?format=smil&Tracking=true&mbr=true',
- u'info_dict': {
- u'id': u'e9I_cZgTgIPd',
- u'ext': u'flv',
- u'title': u'Blackberry\'s big, bold Z30',
- u'description': u'The Z30 is Blackberry\'s biggest, baddest mobile messaging device yet.',
- u'duration': 247,
+ 'url': 'http://link.theplatform.com/s/dJ5BDC/e9I_cZgTgIPd/meta.smil?format=smil&Tracking=true&mbr=true',
+ 'info_dict': {
+ 'id': 'e9I_cZgTgIPd',
+ 'ext': 'flv',
+ 'title': 'Blackberry\'s big, bold Z30',
+ 'description': 'The Z30 is Blackberry\'s biggest, baddest mobile messaging device yet.',
+ 'duration': 247,
},
- u'params': {
+ 'params': {
# rtmp download
- u'skip_download': True,
+ 'skip_download': True,
},
}
@@ -39,7 +41,7 @@ class ThePlatformIE(InfoExtractor):
error_msg = next(
n.attrib['abstract']
for n in meta.findall(_x('.//smil:ref'))
- if n.attrib.get('title') == u'Geographic Restriction')
+ if n.attrib.get('title') == 'Geographic Restriction')
except StopIteration:
pass
else:
@@ -52,12 +54,17 @@ class ThePlatformIE(InfoExtractor):
head = meta.find(_x('smil:head'))
body = meta.find(_x('smil:body'))
- f4m_node = body.find(_x('smil:seq/smil:video'))
+ f4m_node = body.find(_x('smil:seq//smil:video'))
if f4m_node is not None:
+ f4m_url = f4m_node.attrib['src']
+ if 'manifest.f4m?' not in f4m_url:
+ f4m_url += '?'
+ # the parameters are from syfy.com, other sites may use others,
+ # they also work for nbc.com
+ f4m_url += '&g=UXWGVKRWHFSP&hdcore=3.0.3'
formats = [{
'ext': 'flv',
- # the parameters are from syfy.com, other sites may use others
- 'url': f4m_node.attrib['src'] + '?g=UXWGVKRWHFSP&hdcore=3.0.3',
+ 'url': f4m_url,
}]
else:
base_url = head.find(_x('smil:meta')).attrib['base']
@@ -95,9 +102,9 @@ class ThePlatformIE(InfoExtractor):
if mobj.group('config'):
config_url = url+ '&form=json'
config_url = config_url.replace('swf/', 'config/')
- config_json = self._download_webpage(config_url, video_id, u'Downloading config')
- config = json.loads(config_json)
- smil_url = config['releaseUrl'] + '&format=SMIL&formats=MPEG4'
+ config_url = config_url.replace('onsite/', 'onsite/config/')
+ config = self._download_json(config_url, video_id, 'Downloading config')
+ smil_url = config['releaseUrl'] + '&format=SMIL&formats=MPEG4&manifest=f4m'
else:
smil_url = ('http://link.theplatform.com/s/dJ5BDC/{0}/meta.smil?'
'format=smil&mbr=true'.format(video_id))
diff --git a/youtube_dl/extractor/tinypic.py b/youtube_dl/extractor/tinypic.py
index 2246d27..a4aa25f 100644
--- a/youtube_dl/extractor/tinypic.py
+++ b/youtube_dl/extractor/tinypic.py
@@ -3,7 +3,7 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from youtube_dl.utils import ExtractorError
+from ..utils import ExtractorError
class TinyPicIE(InfoExtractor):
diff --git a/youtube_dl/extractor/tlc.py b/youtube_dl/extractor/tlc.py
new file mode 100644
index 0000000..ad175b8
--- /dev/null
+++ b/youtube_dl/extractor/tlc.py
@@ -0,0 +1,60 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+import re
+
+from .common import InfoExtractor
+from .brightcove import BrightcoveIE
+from .discovery import DiscoveryIE
+
+
+class TlcIE(DiscoveryIE):
+ IE_NAME = 'tlc.com'
+ _VALID_URL = r'http://www\.tlc\.com\/[a-zA-Z0-9\-]*/[a-zA-Z0-9\-]*/videos/(?P<id>[a-zA-Z0-9\-]*)(.htm)?'
+
+ _TEST = {
+ 'url': 'http://www.tlc.com/tv-shows/cake-boss/videos/too-big-to-fly.htm',
+ 'md5': 'c4038f4a9b44d0b5d74caaa64ed2a01a',
+ 'info_dict': {
+ 'id': '853232',
+ 'ext': 'mp4',
+ 'title': 'Cake Boss: Too Big to Fly',
+ 'description': 'Buddy has taken on a high flying task.',
+ 'duration': 119,
+ },
+ }
+
+
+class TlcDeIE(InfoExtractor):
+ IE_NAME = 'tlc.de'
+ _VALID_URL = r'http://www\.tlc\.de/sendungen/[^/]+/videos/(?P<title>[^/?]+)'
+
+ _TEST = {
+ 'url': 'http://www.tlc.de/sendungen/breaking-amish/videos/#3235167922001',
+ 'info_dict': {
+ 'id': '3235167922001',
+ 'ext': 'mp4',
+ 'title': 'Breaking Amish: Die Welt da draußen',
+ 'uploader': 'Discovery Networks - Germany',
+ 'description': 'Vier Amische und eine Mennonitin wagen in New York'
+ ' den Sprung in ein komplett anderes Leben. Begleitet sie auf'
+ ' ihrem spannenden Weg.',
+ },
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ title = mobj.group('title')
+ webpage = self._download_webpage(url, title)
+ iframe_url = self._search_regex(
+ '<iframe src="(http://www\.tlc\.de/wp-content/.+?)"', webpage,
+ 'iframe url')
+ # Otherwise we don't get the correct 'BrightcoveExperience' element,
+ # example: http://www.tlc.de/sendungen/cake-boss/videos/cake-boss-cannoli-drama/
+ iframe_url = iframe_url.replace('.htm?', '.php?')
+ iframe = self._download_webpage(iframe_url, title)
+
+ return {
+ '_type': 'url',
+ 'url': BrightcoveIE._extract_brightcove_url(iframe),
+ 'ie': BrightcoveIE.ie_key(),
+ }
diff --git a/youtube_dl/extractor/toypics.py b/youtube_dl/extractor/toypics.py
new file mode 100644
index 0000000..34008af
--- /dev/null
+++ b/youtube_dl/extractor/toypics.py
@@ -0,0 +1,75 @@
+from .common import InfoExtractor
+import re
+
+
+class ToypicsIE(InfoExtractor):
+ IE_DESC = 'Toypics user profile'
+ _VALID_URL = r'http://videos\.toypics\.net/view/(?P<id>[0-9]+)/.*'
+ _TEST = {
+ 'url': 'http://videos.toypics.net/view/514/chancebulged,-2-1/',
+ 'md5': '16e806ad6d6f58079d210fe30985e08b',
+ 'info_dict': {
+ 'id': '514',
+ 'ext': 'mp4',
+ 'title': 'Chance-Bulge\'d, 2',
+ 'age_limit': 18,
+ 'uploader': 'kidsune',
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+ page = self._download_webpage(url, video_id)
+ video_url = self._html_search_regex(
+ r'src:\s+"(http://static[0-9]+\.toypics\.net/flvideo/[^"]+)"', page, 'video URL')
+ title = self._html_search_regex(
+ r'<title>Toypics - ([^<]+)</title>', page, 'title')
+ username = self._html_search_regex(
+ r'toypics.net/([^/"]+)" class="user-name">', page, 'username')
+ return {
+ 'id': video_id,
+ 'url': video_url,
+ 'title': title,
+ 'uploader': username,
+ 'age_limit': 18,
+ }
+
+
+class ToypicsUserIE(InfoExtractor):
+ IE_DESC = 'Toypics user profile'
+ _VALID_URL = r'http://videos\.toypics\.net/(?P<username>[^/?]+)(?:$|[?#])'
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ username = mobj.group('username')
+
+ profile_page = self._download_webpage(
+ url, username, note='Retrieving profile page')
+
+ video_count = int(self._search_regex(
+ r'public/">Public Videos \(([0-9]+)\)</a></li>', profile_page,
+ 'video count'))
+
+ PAGE_SIZE = 8
+ urls = []
+ page_count = (video_count + PAGE_SIZE + 1) // PAGE_SIZE
+ for n in range(1, page_count + 1):
+ lpage_url = url + '/public/%d' % n
+ lpage = self._download_webpage(
+ lpage_url, username,
+ note='Downloading page %d/%d' % (n, page_count))
+ urls.extend(
+ re.findall(
+ r'<p class="video-entry-title">\n\s*<a href="(http://videos.toypics.net/view/[^"]+)">',
+ lpage))
+
+ return {
+ '_type': 'playlist',
+ 'id': username,
+ 'entries': [{
+ '_type': 'url',
+ 'url': eurl,
+ 'ie_key': 'Toypics',
+ } for eurl in urls]
+ }
diff --git a/youtube_dl/extractor/trutube.py b/youtube_dl/extractor/trutube.py
new file mode 100644
index 0000000..57f9566
--- /dev/null
+++ b/youtube_dl/extractor/trutube.py
@@ -0,0 +1,44 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class TruTubeIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?trutube\.tv/video/(?P<id>[0-9]+)/.*'
+ _TEST = {
+ 'url': 'http://trutube.tv/video/14880/Ramses-II-Proven-To-Be-A-Red-Headed-Caucasoid-',
+ 'md5': 'c5b6e301b0a2040b074746cbeaa26ca1',
+ 'info_dict': {
+ 'id': '14880',
+ 'ext': 'flv',
+ 'title': 'Ramses II - Proven To Be A Red Headed Caucasoid',
+ 'thumbnail': 're:^http:.*\.jpg$',
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+
+ webpage = self._download_webpage(url, video_id)
+ video_title = self._og_search_title(webpage).strip()
+ thumbnail = self._search_regex(
+ r"var splash_img = '([^']+)';", webpage, 'thumbnail', fatal=False)
+
+ all_formats = re.finditer(
+ r"var (?P<key>[a-z]+)_video_file\s*=\s*'(?P<url>[^']+)';", webpage)
+ formats = [{
+ 'format_id': m.group('key'),
+ 'quality': -i,
+ 'url': m.group('url'),
+ } for i, m in enumerate(all_formats)]
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': video_title,
+ 'formats': formats,
+ 'thumbnail': thumbnail,
+ }
diff --git a/youtube_dl/extractor/tube8.py b/youtube_dl/extractor/tube8.py
index 3ec9442..36bc36a 100644
--- a/youtube_dl/extractor/tube8.py
+++ b/youtube_dl/extractor/tube8.py
@@ -1,63 +1,83 @@
-import os
+from __future__ import unicode_literals
+
+import json
import re
from .common import InfoExtractor
from ..utils import (
compat_urllib_parse_urlparse,
compat_urllib_request,
+ int_or_none,
+ str_to_int,
)
-from ..aes import (
- aes_decrypt_text
-)
+from ..aes import aes_decrypt_text
+
class Tube8IE(InfoExtractor):
- _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>tube8\.com/.+?/(?P<videoid>\d+)/?)$'
+ _VALID_URL = r'https?://(?:www\.)?tube8\.com/(?:[^/]+/){2}(?P<id>\d+)'
_TEST = {
- u'url': u'http://www.tube8.com/teen/kasia-music-video/229795/',
- u'file': u'229795.mp4',
- u'md5': u'e9e0b0c86734e5e3766e653509475db0',
- u'info_dict': {
- u"description": u"hot teen Kasia grinding",
- u"uploader": u"unknown",
- u"title": u"Kasia music video",
- u"age_limit": 18,
+ 'url': 'http://www.tube8.com/teen/kasia-music-video/229795/',
+ 'file': '229795.mp4',
+ 'md5': 'e9e0b0c86734e5e3766e653509475db0',
+ 'info_dict': {
+ 'description': 'hot teen Kasia grinding',
+ 'uploader': 'unknown',
+ 'title': 'Kasia music video',
+ 'age_limit': 18,
}
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('videoid')
- url = 'http://www.' + mobj.group('url')
+ video_id = mobj.group('id')
req = compat_urllib_request.Request(url)
req.add_header('Cookie', 'age_verified=1')
webpage = self._download_webpage(req, video_id)
- video_title = self._html_search_regex(r'videotitle ="([^"]+)', webpage, u'title')
- video_description = self._html_search_regex(r'>Description:</strong>(.+?)<', webpage, u'description', fatal=False)
- video_uploader = self._html_search_regex(r'>Submitted by:</strong>(?:\s|<[^>]*>)*(.+?)<', webpage, u'uploader', fatal=False)
- thumbnail = self._html_search_regex(r'"image_url":"([^"]+)', webpage, u'thumbnail', fatal=False)
- if thumbnail:
- thumbnail = thumbnail.replace('\\/', '/')
-
- video_url = self._html_search_regex(r'"video_url":"([^"]+)', webpage, u'video_url')
- if webpage.find('"encrypted":true')!=-1:
- password = self._html_search_regex(r'"video_title":"([^"]+)', webpage, u'password')
- video_url = aes_decrypt_text(video_url, password, 32).decode('utf-8')
+ flashvars = json.loads(self._html_search_regex(
+ r'var flashvars\s*=\s*({.+?})', webpage, 'flashvars'))
+
+ video_url = flashvars['video_url']
+ if flashvars.get('encrypted') is True:
+ video_url = aes_decrypt_text(video_url, flashvars['video_title'], 32).decode('utf-8')
path = compat_urllib_parse_urlparse(video_url).path
- extension = os.path.splitext(path)[1][1:]
- format = path.split('/')[4].split('_')[:2]
- format = "-".join(format)
+ format_id = '-'.join(path.split('/')[4].split('_')[:2])
+
+ thumbnail = flashvars.get('image_url')
+
+ title = self._html_search_regex(
+ r'videotitle\s*=\s*"([^"]+)', webpage, 'title')
+ description = self._html_search_regex(
+ r'>Description:</strong>(.+?)<', webpage, 'description', fatal=False)
+ uploader = self._html_search_regex(
+ r'<strong class="video-username">(?:<a href="[^"]+">)?([^<]+)(?:</a>)?</strong>',
+ webpage, 'uploader', fatal=False)
+
+ like_count = int_or_none(self._html_search_regex(
+ r"rupVar\s*=\s*'(\d+)'", webpage, 'like count', fatal=False))
+ dislike_count = int_or_none(self._html_search_regex(
+ r"rdownVar\s*=\s*'(\d+)'", webpage, 'dislike count', fatal=False))
+ view_count = self._html_search_regex(
+ r'<strong>Views: </strong>([\d,\.]+)</li>', webpage, 'view count', fatal=False)
+ if view_count:
+ view_count = str_to_int(view_count)
+ comment_count = self._html_search_regex(
+ r'<span id="allCommentsCount">(\d+)</span>', webpage, 'comment count', fatal=False)
+ if comment_count:
+ comment_count = str_to_int(comment_count)
return {
'id': video_id,
- 'uploader': video_uploader,
- 'title': video_title,
- 'thumbnail': thumbnail,
- 'description': video_description,
'url': video_url,
- 'ext': extension,
- 'format': format,
- 'format_id': format,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'uploader': uploader,
+ 'format_id': format_id,
+ 'view_count': view_count,
+ 'like_count': like_count,
+ 'dislike_count': dislike_count,
+ 'comment_count': comment_count,
'age_limit': 18,
}
diff --git a/youtube_dl/extractor/tvigle.py b/youtube_dl/extractor/tvigle.py
new file mode 100644
index 0000000..0921cc5
--- /dev/null
+++ b/youtube_dl/extractor/tvigle.py
@@ -0,0 +1,84 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ unified_strdate,
+ clean_html,
+ int_or_none,
+)
+
+
+class TvigleIE(InfoExtractor):
+ IE_NAME = 'tvigle'
+ IE_DESC = 'Интернет-телевидение Tvigle.ru'
+ _VALID_URL = r'http://(?:www\.)?tvigle\.ru/category/.+?[\?&]v(?:ideo)?=(?P<id>\d+)'
+
+ _TESTS = [
+ {
+ 'url': 'http://www.tvigle.ru/category/cinema/1608/?video=503081',
+ 'md5': '09afba4616666249f087efc6dcf83cb3',
+ 'info_dict': {
+ 'id': '503081',
+ 'ext': 'flv',
+ 'title': 'Брат 2 ',
+ 'description': 'md5:f5a42970f50648cee3d7ad740f3ae769',
+ 'upload_date': '20110919',
+ },
+ },
+ {
+ 'url': 'http://www.tvigle.ru/category/men/vysotskiy_vospominaniya02/?flt=196&v=676433',
+ 'md5': 'e7efe5350dd5011d0de6550b53c3ba7b',
+ 'info_dict': {
+ 'id': '676433',
+ 'ext': 'flv',
+ 'title': 'Ведущий телепрограммы «60 минут» (США) о Владимире Высоцком',
+ 'description': 'md5:027f7dc872948f14c96d19b4178428a4',
+ 'upload_date': '20121218',
+ },
+ },
+ ]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+
+ video_data = self._download_xml(
+ 'http://www.tvigle.ru/xml/single.php?obj=%s' % video_id, video_id, 'Downloading video XML')
+
+ video = video_data.find('./video')
+
+ title = video.get('name')
+ description = video.get('anons')
+ if description:
+ description = clean_html(description)
+ thumbnail = video_data.get('img')
+ upload_date = unified_strdate(video.get('date'))
+ like_count = int_or_none(video.get('vtp'))
+
+ formats = []
+ for num, (format_id, format_note) in enumerate([['low_file', 'SQ'], ['file', 'HQ'], ['hd', 'HD 720']]):
+ video_url = video.get(format_id)
+ if not video_url:
+ continue
+ formats.append({
+ 'url': video_url,
+ 'format_id': format_id,
+ 'format_note': format_note,
+ 'quality': num,
+ })
+
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'upload_date': upload_date,
+ 'like_count': like_count,
+ 'age_limit': 18,
+ 'formats': formats,
+ } \ No newline at end of file
diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py
new file mode 100644
index 0000000..054f427
--- /dev/null
+++ b/youtube_dl/extractor/udemy.py
@@ -0,0 +1,164 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ compat_urllib_parse,
+ compat_urllib_request,
+ ExtractorError,
+)
+
+
+class UdemyIE(InfoExtractor):
+ IE_NAME = 'udemy'
+ _VALID_URL = r'https?://www\.udemy\.com/(?:[^#]+#/lecture/|lecture/view/?\?lectureId=)(?P<id>\d+)'
+ _LOGIN_URL = 'https://www.udemy.com/join/login-submit/'
+ _NETRC_MACHINE = 'udemy'
+
+ _TESTS = [{
+ 'url': 'https://www.udemy.com/java-tutorial/#/lecture/172757',
+ 'md5': '98eda5b657e752cf945d8445e261b5c5',
+ 'info_dict': {
+ 'id': '160614',
+ 'ext': 'mp4',
+ 'title': 'Introduction and Installation',
+ 'description': 'md5:c0d51f6f21ef4ec65f091055a5eef876',
+ 'duration': 579.29,
+ },
+ 'skip': 'Requires udemy account credentials',
+ }]
+
+ def _handle_error(self, response):
+ if not isinstance(response, dict):
+ return
+ error = response.get('error')
+ if error:
+ error_str = 'Udemy returned error #%s: %s' % (error.get('code'), error.get('message'))
+ error_data = error.get('data')
+ if error_data:
+ error_str += ' - %s' % error_data.get('formErrors')
+ raise ExtractorError(error_str, expected=True)
+
+ def _download_json(self, url, video_id, note='Downloading JSON metadata'):
+ response = super(UdemyIE, self)._download_json(url, video_id, note)
+ self._handle_error(response)
+ return response
+
+ def _real_initialize(self):
+ self._login()
+
+ def _login(self):
+ (username, password) = self._get_login_info()
+ if username is None:
+ raise ExtractorError(
+ 'Udemy account is required, use --username and --password options to provide account credentials.',
+ expected=True)
+
+ login_popup = self._download_webpage(
+ 'https://www.udemy.com/join/login-popup?displayType=ajax&showSkipButton=1', None,
+ 'Downloading login popup')
+
+ if login_popup == '<div class="run-command close-popup redirect" data-url="https://www.udemy.com/"></div>':
+ return
+
+ csrf = self._html_search_regex(r'<input type="hidden" name="csrf" value="(.+?)"', login_popup, 'csrf token')
+
+ login_form = {
+ 'email': username,
+ 'password': password,
+ 'csrf': csrf,
+ 'displayType': 'json',
+ 'isSubmitted': '1',
+ }
+ request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
+ response = self._download_json(request, None, 'Logging in as %s' % username)
+
+ if 'returnUrl' not in response:
+ raise ExtractorError('Unable to log in')
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ lecture_id = mobj.group('id')
+
+ lecture = self._download_json(
+ 'https://www.udemy.com/api-1.1/lectures/%s' % lecture_id, lecture_id, 'Downloading lecture JSON')
+
+ if lecture['assetType'] != 'Video':
+ raise ExtractorError('Lecture %s is not a video' % lecture_id, expected=True)
+
+ asset = lecture['asset']
+
+ stream_url = asset['streamUrl']
+ mobj = re.search(r'(https?://www\.youtube\.com/watch\?v=.*)', stream_url)
+ if mobj:
+ return self.url_result(mobj.group(1), 'Youtube')
+
+ video_id = asset['id']
+ thumbnail = asset['thumbnailUrl']
+ duration = asset['data']['duration']
+
+ download_url = asset['downloadUrl']
+
+ formats = [
+ {
+ 'url': download_url['Video480p'][0],
+ 'format_id': '360p',
+ },
+ {
+ 'url': download_url['Video'][0],
+ 'format_id': '720p',
+ },
+ ]
+
+ title = lecture['title']
+ description = lecture['description']
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'duration': duration,
+ 'formats': formats
+ }
+
+
+class UdemyCourseIE(UdemyIE):
+ IE_NAME = 'udemy:course'
+ _VALID_URL = r'https?://www\.udemy\.com/(?P<coursepath>[\da-z-]+)'
+ _SUCCESSFULLY_ENROLLED = '>You have enrolled in this course!<'
+ _ALREADY_ENROLLED = '>You are already taking this course.<'
+ _TESTS = []
+
+ @classmethod
+ def suitable(cls, url):
+ return False if UdemyIE.suitable(url) else super(UdemyCourseIE, cls).suitable(url)
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ course_path = mobj.group('coursepath')
+
+ response = self._download_json(
+ 'https://www.udemy.com/api-1.1/courses/%s' % course_path, course_path, 'Downloading course JSON')
+
+ course_id = int(response['id'])
+ course_title = response['title']
+
+ webpage = self._download_webpage(
+ 'https://www.udemy.com/course/subscribe/?courseId=%s' % course_id, course_id, 'Enrolling in the course')
+
+ if self._SUCCESSFULLY_ENROLLED in webpage:
+ self.to_screen('%s: Successfully enrolled in' % course_id)
+ elif self._ALREADY_ENROLLED in webpage:
+ self.to_screen('%s: Already enrolled in' % course_id)
+
+ response = self._download_json('https://www.udemy.com/api-1.1/courses/%s/curriculum' % course_id,
+ course_id, 'Downloading course curriculum')
+
+ entries = [
+ self.url_result('https://www.udemy.com/%s/#/lecture/%s' % (course_path, asset['id']), 'Udemy')
+ for asset in response if asset.get('assetType') == 'Video'
+ ]
+
+ return self.playlist_result(entries, course_id, course_title) \ No newline at end of file
diff --git a/youtube_dl/extractor/urort.py b/youtube_dl/extractor/urort.py
new file mode 100644
index 0000000..5d06fcc
--- /dev/null
+++ b/youtube_dl/extractor/urort.py
@@ -0,0 +1,61 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ compat_urllib_parse,
+ unified_strdate,
+)
+
+
+class UrortIE(InfoExtractor):
+ IE_DESC = 'NRK P3 Urørt'
+ _VALID_URL = r'https?://(?:www\.)?urort\.p3\.no/#!/Band/(?P<id>[^/]+)$'
+
+ _TEST = {
+ 'url': 'https://urort.p3.no/#!/Band/Gerilja',
+ 'md5': '5ed31a924be8a05e47812678a86e127b',
+ 'info_dict': {
+ 'id': '33124-4',
+ 'ext': 'mp3',
+ 'title': 'The Bomb',
+ 'thumbnail': 're:^https?://.+\.jpg',
+ 'like_count': int,
+ 'uploader': 'Gerilja',
+ 'uploader_id': 'Gerilja',
+ 'upload_date': '20100323',
+ },
+ 'params': {
+ 'matchtitle': '^The Bomb$', # To test, we want just one video
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ playlist_id = mobj.group('id')
+
+ fstr = compat_urllib_parse.quote("InternalBandUrl eq '%s'" % playlist_id)
+ json_url = 'http://urort.p3.no/breeze/urort/TrackDtos?$filter=' + fstr
+ songs = self._download_json(json_url, playlist_id)
+ print(songs[0])
+
+ entries = [{
+ 'id': '%d-%s' % (s['BandId'], s['$id']),
+ 'title': s['Title'],
+ 'url': s['TrackUrl'],
+ 'ext': 'mp3',
+ 'uploader_id': playlist_id,
+ 'uploader': s.get('BandName', playlist_id),
+ 'like_count': s.get('LikeCount'),
+ 'thumbnail': 'http://urort.p3.no/cloud/images/%s' % s['Image'],
+ 'upload_date': unified_strdate(s.get('Released')),
+ } for s in songs]
+
+ return {
+ '_type': 'playlist',
+ 'id': playlist_id,
+ 'title': playlist_id,
+ 'entries': entries,
+ }
diff --git a/youtube_dl/extractor/ustream.py b/youtube_dl/extractor/ustream.py
index 7fa2b9e..488b10d 100644
--- a/youtube_dl/extractor/ustream.py
+++ b/youtube_dl/extractor/ustream.py
@@ -11,15 +11,16 @@ from ..utils import (
class UstreamIE(InfoExtractor):
- _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
+ _VALID_URL = r'https?://www\.ustream\.tv/(?P<type>recorded|embed|embed/recorded)/(?P<videoID>\d+)'
IE_NAME = 'ustream'
_TEST = {
'url': 'http://www.ustream.tv/recorded/20274954',
- 'file': '20274954.flv',
'md5': '088f151799e8f572f84eb62f17d73e5c',
'info_dict': {
- "uploader": "Young Americans for Liberty",
- "title": "Young Americans for Liberty February 7, 2012 2:28 AM",
+ 'id': '20274954',
+ 'ext': 'flv',
+ 'uploader': 'Young Americans for Liberty',
+ 'title': 'Young Americans for Liberty February 7, 2012 2:28 AM',
},
}
@@ -27,6 +28,19 @@ class UstreamIE(InfoExtractor):
m = re.match(self._VALID_URL, url)
video_id = m.group('videoID')
+ # some sites use this embed format (see: http://github.com/rg3/youtube-dl/issues/2990)
+ if m.group('type') == 'embed/recorded':
+ video_id = m.group('videoID')
+ desktop_url = 'http://www.ustream.tv/recorded/' + video_id
+ return self.url_result(desktop_url, 'Ustream')
+ if m.group('type') == 'embed':
+ video_id = m.group('videoID')
+ webpage = self._download_webpage(url, video_id)
+ desktop_video_id = self._html_search_regex(
+ r'ContentVideoIds=\["([^"]*?)"\]', webpage, 'desktop_video_id')
+ desktop_url = 'http://www.ustream.tv/recorded/' + desktop_video_id
+ return self.url_result(desktop_url, 'Ustream')
+
video_url = 'http://tcdn.ustream.tv/video/%s' % video_id
webpage = self._download_webpage(url, video_id)
diff --git a/youtube_dl/extractor/veoh.py b/youtube_dl/extractor/veoh.py
index baa57f3..d16993d 100644
--- a/youtube_dl/extractor/veoh.py
+++ b/youtube_dl/extractor/veoh.py
@@ -4,44 +4,118 @@ import re
import json
from .common import InfoExtractor
+from ..utils import (
+ compat_urllib_request,
+ int_or_none,
+)
class VeohIE(InfoExtractor):
- _VALID_URL = r'http://(?:www\.)?veoh\.com/(?:watch|iphone/#_Watch)/v(?P<id>\d*)'
-
- _TEST = {
- 'url': 'http://www.veoh.com/watch/v56314296nk7Zdmz3',
- 'file': '56314296.mp4',
- 'md5': '620e68e6a3cff80086df3348426c9ca3',
- 'info_dict': {
- 'title': 'Straight Backs Are Stronger',
- 'uploader': 'LUMOback',
- 'description': 'At LUMOback, we believe straight backs are stronger. The LUMOback Posture & Movement Sensor: It gently vibrates when you slouch, inspiring improved posture and mobility. Use the app to track your data and improve your posture over time. ',
+ _VALID_URL = r'http://(?:www\.)?veoh\.com/(?:watch|iphone/#_Watch)/(?P<id>(?:v|yapi-)[\da-zA-Z]+)'
+
+ _TESTS = [
+ {
+ 'url': 'http://www.veoh.com/watch/v56314296nk7Zdmz3',
+ 'md5': '620e68e6a3cff80086df3348426c9ca3',
+ 'info_dict': {
+ 'id': '56314296',
+ 'ext': 'mp4',
+ 'title': 'Straight Backs Are Stronger',
+ 'uploader': 'LUMOback',
+ 'description': 'At LUMOback, we believe straight backs are stronger. The LUMOback Posture & Movement Sensor: It gently vibrates when you slouch, inspiring improved posture and mobility. Use the app to track your data and improve your posture over time. ',
+ },
+ },
+ {
+ 'url': 'http://www.veoh.com/watch/v27701988pbTc4wzN?h1=Chile+workers+cover+up+to+avoid+skin+damage',
+ 'md5': '4a6ff84b87d536a6a71e6aa6c0ad07fa',
+ 'info_dict': {
+ 'id': '27701988',
+ 'ext': 'mp4',
+ 'title': 'Chile workers cover up to avoid skin damage',
+ 'description': 'md5:2bd151625a60a32822873efc246ba20d',
+ 'uploader': 'afp-news',
+ 'duration': 123,
+ },
+ },
+ {
+ 'url': 'http://www.veoh.com/watch/v69525809F6Nc4frX',
+ 'md5': '4fde7b9e33577bab2f2f8f260e30e979',
+ 'note': 'Embedded ooyala video',
+ 'info_dict': {
+ 'id': '69525809',
+ 'ext': 'mp4',
+ 'title': 'Doctors Alter Plan For Preteen\'s Weight Loss Surgery',
+ 'description': 'md5:f5a11c51f8fb51d2315bca0937526891',
+ 'uploader': 'newsy-videos',
+ },
+ },
+ ]
+
+ def _extract_formats(self, source):
+ formats = []
+ link = source.get('aowPermalink')
+ if link:
+ formats.append({
+ 'url': link,
+ 'ext': 'mp4',
+ 'format_id': 'aow',
+ })
+ link = source.get('fullPreviewHashLowPath')
+ if link:
+ formats.append({
+ 'url': link,
+ 'format_id': 'low',
+ })
+ link = source.get('fullPreviewHashHighPath')
+ if link:
+ formats.append({
+ 'url': link,
+ 'format_id': 'high',
+ })
+ return formats
+
+ def _extract_video(self, source):
+ return {
+ 'id': source.get('videoId'),
+ 'title': source.get('title'),
+ 'description': source.get('description'),
+ 'thumbnail': source.get('highResImage') or source.get('medResImage'),
+ 'uploader': source.get('username'),
+ 'duration': int_or_none(source.get('length')),
+ 'view_count': int_or_none(source.get('views')),
+ 'age_limit': 18 if source.get('isMature') == 'true' or source.get('isSexy') == 'true' else 0,
+ 'formats': self._extract_formats(source),
}
- }
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
+
+ if video_id.startswith('v'):
+ rsp = self._download_xml(
+ r'http://www.veoh.com/api/findByPermalink?permalink=%s' % video_id, video_id, 'Downloading video XML')
+ if rsp.get('stat') == 'ok':
+ return self._extract_video(rsp.find('./videoList/video'))
+
webpage = self._download_webpage(url, video_id)
+ age_limit = 0
+ if 'class="adultwarning-container"' in webpage:
+ self.report_age_confirmation()
+ age_limit = 18
+ request = compat_urllib_request.Request(url)
+ request.add_header('Cookie', 'confirmedAdult=true')
+ webpage = self._download_webpage(request, video_id)
- m_youtube = re.search(r'http://www\.youtube\.com/v/(.*?)(\&|")', webpage)
+ m_youtube = re.search(r'http://www\.youtube\.com/v/(.*?)(\&|"|\?)', webpage)
if m_youtube is not None:
youtube_id = m_youtube.group(1)
self.to_screen('%s: detected Youtube video.' % video_id)
return self.url_result(youtube_id, 'Youtube')
- self.report_extraction(video_id)
- info = self._search_regex(r'videoDetailsJSON = \'({.*?})\';', webpage, 'info')
- info = json.loads(info)
- video_url = info.get('fullPreviewHashHighPath') or info.get('fullPreviewHashLowPath')
+ info = json.loads(
+ self._search_regex(r'videoDetailsJSON = \'({.*?})\';', webpage, 'info').replace('\\\'', '\''))
- return {
- 'id': info['videoId'],
- 'title': info['title'],
- 'url': video_url,
- 'uploader': info['username'],
- 'thumbnail': info.get('highResImage') or info.get('medResImage'),
- 'description': info['description'],
- 'view_count': info['views'],
- }
+ video = self._extract_video(info)
+ video['age_limit'] = age_limit
+
+ return video
diff --git a/youtube_dl/extractor/vesti.py b/youtube_dl/extractor/vesti.py
index f51d4dc..27f9acb 100644
--- a/youtube_dl/extractor/vesti.py
+++ b/youtube_dl/extractor/vesti.py
@@ -4,14 +4,11 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import (
- ExtractorError,
- int_or_none
-)
+from ..utils import ExtractorError
+from .rutv import RUTVIE
class VestiIE(InfoExtractor):
- IE_NAME = 'vesti'
IE_DESC = 'Вести.Ru'
_VALID_URL = r'http://(?:.+?\.)?vesti\.ru/(?P<id>.+)'
@@ -31,6 +28,20 @@ class VestiIE(InfoExtractor):
},
},
{
+ 'url': 'http://www.vesti.ru/doc.html?id=1349233',
+ 'info_dict': {
+ 'id': '773865',
+ 'ext': 'mp4',
+ 'title': 'Участники митинга штурмуют Донецкую областную администрацию',
+ 'description': 'md5:1a160e98b3195379b4c849f2f4958009',
+ 'duration': 210,
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ },
+ {
'url': 'http://www.vesti.ru/only_video.html?vid=576180',
'info_dict': {
'id': '766048',
@@ -45,6 +56,20 @@ class VestiIE(InfoExtractor):
},
},
{
+ 'url': 'http://hitech.vesti.ru/news/view/id/4000',
+ 'info_dict': {
+ 'id': '766888',
+ 'ext': 'mp4',
+ 'title': 'Вести.net: интернет-гиганты начали перетягивание программных "одеял"',
+ 'description': 'md5:65ddd47f9830c4f42ed6475f8730c995',
+ 'duration': 279,
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ },
+ {
'url': 'http://sochi2014.vesti.ru/video/index/video_id/766403',
'info_dict': {
'id': '766403',
@@ -57,7 +82,7 @@ class VestiIE(InfoExtractor):
# m3u8 download
'skip_download': True,
},
- 'skip': 'Blocked outside Russia'
+ 'skip': 'Blocked outside Russia',
},
{
'url': 'http://sochi2014.vesti.ru/live/play/live_id/301',
@@ -72,7 +97,7 @@ class VestiIE(InfoExtractor):
'skip_download': True,
},
'skip': 'Translation has finished'
- }
+ },
]
def _real_extract(self, url):
@@ -81,90 +106,16 @@ class VestiIE(InfoExtractor):
page = self._download_webpage(url, video_id, 'Downloading page')
- mobj = re.search(r'<meta property="og:video" content=".+?\.swf\?v?id=(?P<id>\d+).*?" />', page)
+ mobj = re.search(
+ r'<meta[^>]+?property="og:video"[^>]+?content="http://www\.vesti\.ru/i/flvplayer_videoHost\.swf\?vid=(?P<id>\d+)',
+ page)
if mobj:
- video_type = 'video'
- video_id = mobj.group('id')
- else:
- mobj = re.search(
- r'<iframe.+?src="http://player\.rutv\.ru/iframe/(?P<type>[^/]+)/id/(?P<id>\d+)[^"]*".*?></iframe>', page)
-
- if not mobj:
- raise ExtractorError('No media found')
-
- video_type = mobj.group('type')
video_id = mobj.group('id')
+ page = self._download_webpage('http://www.vesti.ru/only_video.html?vid=%s' % video_id, video_id,
+ 'Downloading video page')
- json_data = self._download_json(
- 'http://player.rutv.ru/iframe/%splay/id/%s' % ('live-' if video_type == 'live' else '', video_id),
- video_id, 'Downloading JSON')
-
- if json_data['errors']:
- raise ExtractorError('vesti returned error: %s' % json_data['errors'], expected=True)
-
- playlist = json_data['data']['playlist']
- medialist = playlist['medialist']
- media = medialist[0]
-
- if media['errors']:
- raise ExtractorError('vesti returned error: %s' % media['errors'], expected=True)
-
- view_count = playlist.get('count_views')
- priority_transport = playlist['priority_transport']
-
- thumbnail = media['picture']
- width = media['width']
- height = media['height']
- description = media['anons']
- title = media['title']
- duration = int_or_none(media.get('duration'))
-
- formats = []
-
- for transport, links in media['sources'].items():
- for quality, url in links.items():
- if transport == 'rtmp':
- mobj = re.search(r'^(?P<url>rtmp://[^/]+/(?P<app>.+))/(?P<playpath>.+)$', url)
- if not mobj:
- continue
- fmt = {
- 'url': mobj.group('url'),
- 'play_path': mobj.group('playpath'),
- 'app': mobj.group('app'),
- 'page_url': 'http://player.rutv.ru',
- 'player_url': 'http://player.rutv.ru/flash2v/osmf.swf?i=22',
- 'rtmp_live': True,
- 'ext': 'flv',
- 'vbr': int(quality),
- }
- elif transport == 'm3u8':
- fmt = {
- 'url': url,
- 'ext': 'mp4',
- }
- else:
- fmt = {
- 'url': url
- }
- fmt.update({
- 'width': width,
- 'height': height,
- 'format_id': '%s-%s' % (transport, quality),
- 'preference': -1 if priority_transport == transport else -2,
- })
- formats.append(fmt)
-
- if not formats:
- raise ExtractorError('No media links available for %s' % video_id)
-
- self._sort_formats(formats)
+ rutv_url = RUTVIE._extract_url(page)
+ if rutv_url:
+ return self.url_result(rutv_url, 'RUTV')
- return {
- 'id': video_id,
- 'title': title,
- 'description': description,
- 'thumbnail': thumbnail,
- 'view_count': view_count,
- 'duration': duration,
- 'formats': formats,
- } \ No newline at end of file
+ raise ExtractorError('No video found', expected=True) \ No newline at end of file
diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py
index e458ac9..eada13c 100644
--- a/youtube_dl/extractor/vevo.py
+++ b/youtube_dl/extractor/vevo.py
@@ -2,7 +2,6 @@ from __future__ import unicode_literals
import re
import xml.etree.ElementTree
-import datetime
from .common import InfoExtractor
from ..utils import (
@@ -17,22 +16,55 @@ class VevoIE(InfoExtractor):
(currently used by MTVIE)
"""
_VALID_URL = r'''(?x)
- (?:https?://www\.vevo\.com/watch/(?:[^/]+/[^/]+/)?|
+ (?:https?://www\.vevo\.com/watch/(?:[^/]+/(?:[^/]+/)?)?|
https?://cache\.vevo\.com/m/html/embed\.html\?video=|
https?://videoplayer\.vevo\.com/embed/embedded\?videoId=|
vevo:)
(?P<id>[^&?#]+)'''
+
_TESTS = [{
'url': 'http://www.vevo.com/watch/hurts/somebody-to-die-for/GB1101300280',
- 'file': 'GB1101300280.mp4',
"md5": "06bea460acb744eab74a9d7dcb4bfd61",
'info_dict': {
+ 'id': 'GB1101300280',
+ 'ext': 'mp4',
"upload_date": "20130624",
"uploader": "Hurts",
"title": "Somebody to Die For",
"duration": 230.12,
"width": 1920,
"height": 1080,
+ # timestamp and upload_date are often incorrect; seem to change randomly
+ 'timestamp': int,
+ }
+ }, {
+ 'note': 'v3 SMIL format',
+ 'url': 'http://www.vevo.com/watch/cassadee-pope/i-wish-i-could-break-your-heart/USUV71302923',
+ 'md5': '893ec0e0d4426a1d96c01de8f2bdff58',
+ 'info_dict': {
+ 'id': 'USUV71302923',
+ 'ext': 'mp4',
+ 'upload_date': '20140219',
+ 'uploader': 'Cassadee Pope',
+ 'title': 'I Wish I Could Break Your Heart',
+ 'duration': 226.101,
+ 'age_limit': 0,
+ 'timestamp': int,
+ }
+ }, {
+ 'note': 'Age-limited video',
+ 'url': 'https://www.vevo.com/watch/justin-timberlake/tunnel-vision-explicit/USRV81300282',
+ 'info_dict': {
+ 'id': 'USRV81300282',
+ 'ext': 'mp4',
+ 'age_limit': 18,
+ 'title': 'Tunnel Vision (Explicit)',
+ 'uploader': 'Justin Timberlake',
+ 'upload_date': 're:2013070[34]',
+ 'timestamp': int,
+ },
+ 'params': {
+ 'skip_download': 'true',
}
}]
_SMIL_BASE_URL = 'http://smil.lvl3.vevo.com/'
@@ -102,12 +134,40 @@ class VevoIE(InfoExtractor):
video_id = mobj.group('id')
json_url = 'http://videoplayer.vevo.com/VideoService/AuthenticateVideo?isrc=%s' % video_id
- video_info = self._download_json(json_url, video_id)['video']
+ response = self._download_json(json_url, video_id)
+ video_info = response['video']
+
+ if not video_info:
+ if 'statusMessage' in response:
+ raise ExtractorError('%s said: %s' % (self.IE_NAME, response['statusMessage']), expected=True)
+ raise ExtractorError('Unable to extract videos')
formats = self._formats_from_json(video_info)
+
+ is_explicit = video_info.get('isExplicit')
+ if is_explicit is True:
+ age_limit = 18
+ elif is_explicit is False:
+ age_limit = 0
+ else:
+ age_limit = None
+
+ # Download SMIL
+ smil_blocks = sorted((
+ f for f in video_info['videoVersions']
+ if f['sourceType'] == 13),
+ key=lambda f: f['version'])
+
+ smil_url = '%s/Video/V2/VFILE/%s/%sr.smil' % (
+ self._SMIL_BASE_URL, video_id, video_id.lower())
+ if smil_blocks:
+ smil_url_m = self._search_regex(
+ r'url="([^"]+)"', smil_blocks[-1]['data'], 'SMIL URL',
+ fatal=False)
+ if smil_url_m is not None:
+ smil_url = smil_url_m
+
try:
- smil_url = '%s/Video/V2/VFILE/%s/%sr.smil' % (
- self._SMIL_BASE_URL, video_id, video_id.lower())
smil_xml = self._download_webpage(smil_url, video_id,
'Downloading SMIL info')
formats.extend(self._formats_from_smil(smil_xml))
@@ -119,13 +179,14 @@ class VevoIE(InfoExtractor):
timestamp_ms = int(self._search_regex(
r'/Date\((\d+)\)/', video_info['launchDate'], 'launch date'))
- upload_date = datetime.datetime.fromtimestamp(timestamp_ms // 1000)
+
return {
'id': video_id,
'title': video_info['title'],
'formats': formats,
'thumbnail': video_info['imageUrl'],
- 'upload_date': upload_date.strftime('%Y%m%d'),
+ 'timestamp': timestamp_ms // 1000,
'uploader': video_info['mainArtists'][0]['artistName'],
'duration': video_info['duration'],
+ 'age_limit': age_limit,
}
diff --git a/youtube_dl/extractor/vh1.py b/youtube_dl/extractor/vh1.py
new file mode 100644
index 0000000..2f77e38
--- /dev/null
+++ b/youtube_dl/extractor/vh1.py
@@ -0,0 +1,124 @@
+from __future__ import unicode_literals
+
+from .mtv import MTVIE
+
+import re
+from ..utils import fix_xml_ampersands
+
+
+class VH1IE(MTVIE):
+ IE_NAME = 'vh1.com'
+ _FEED_URL = 'http://www.vh1.com/player/embed/AS3/fullepisode/rss/'
+ _TESTS = [{
+ 'url': 'http://www.vh1.com/video/metal-evolution/full-episodes/progressive-metal/1678612/playlist.jhtml',
+ 'playlist': [
+ {
+ 'md5': '7827a7505f59633983165bbd2c119b52',
+ 'info_dict': {
+ 'id': '731565',
+ 'ext': 'mp4',
+ 'title': 'Metal Evolution: Ep. 11 Act 1',
+ 'description': 'Many rock academics have proclaimed that the truly progressive musicianship of the last 20 years has been found right here in the world of heavy metal, rather than obvious locales such as jazz, fusion or progressive rock. It stands to reason then, that much of this jaw-dropping virtuosity occurs within what\'s known as progressive metal, a genre that takes root with the likes of Rush in the \'70s, Queensryche and Fates Warning in the \'80s, and Dream Theater in the \'90s. Since then, the genre has exploded with creativity, spawning mind-bending, genre-defying acts such as Tool, Mastodon, Coheed And Cambria, Porcupine Tree, Meshuggah, A Perfect Circle and Opeth. Episode 12 looks at the extreme musicianship of these bands, as well as their often extreme literary prowess and conceptual strength, the end result being a rich level of respect and attention such challenging acts have brought upon the world of heavy metal, from a critical community usually dismissive of the form.'
+ }
+ },
+ {
+ 'md5': '34fb4b7321c546b54deda2102a61821f',
+ 'info_dict': {
+ 'id': '731567',
+ 'ext': 'mp4',
+ 'title': 'Metal Evolution: Ep. 11 Act 2',
+ 'description': 'Many rock academics have proclaimed that the truly progressive musicianship of the last 20 years has been found right here in the world of heavy metal, rather than obvious locales such as jazz, fusion or progressive rock. It stands to reason then, that much of this jaw-dropping virtuosity occurs within what\'s known as progressive metal, a genre that takes root with the likes of Rush in the \'70s, Queensryche and Fates Warning in the \'80s, and Dream Theater in the \'90s. Since then, the genre has exploded with creativity, spawning mind-bending, genre-defying acts such as Tool, Mastodon, Coheed And Cambria, Porcupine Tree, Meshuggah, A Perfect Circle and Opeth. Episode 11 looks at the extreme musicianship of these bands, as well as their often extreme literary prowess and conceptual strength, the end result being a rich level of respect and attention such challenging acts have brought upon the world of heavy metal, from a critical community usually dismissive of the form.'
+ }
+ },
+ {
+ 'md5': '813f38dba4c1b8647196135ebbf7e048',
+ 'info_dict': {
+ 'id': '731568',
+ 'ext': 'mp4',
+ 'title': 'Metal Evolution: Ep. 11 Act 3',
+ 'description': 'Many rock academics have proclaimed that the truly progressive musicianship of the last 20 years has been found right here in the world of heavy metal, rather than obvious locales such as jazz, fusion or progressive rock. It stands to reason then, that much of this jaw-dropping virtuosity occurs within what\'s known as progressive metal, a genre that takes root with the likes of Rush in the \'70s, Queensryche and Fates Warning in the \'80s, and Dream Theater in the \'90s. Since then, the genre has exploded with creativity, spawning mind-bending, genre-defying acts such as Tool, Mastodon, Coheed And Cambria, Porcupine Tree, Meshuggah, A Perfect Circle and Opeth. Episode 11 looks at the extreme musicianship of these bands, as well as their often extreme literary prowess and conceptual strength, the end result being a rich level of respect and attention such challenging acts have brought upon the world of heavy metal, from a critical community usually dismissive of the form.'
+ }
+ },
+ {
+ 'md5': '51adb72439dfaed11c799115d76e497f',
+ 'info_dict': {
+ 'id': '731569',
+ 'ext': 'mp4',
+ 'title': 'Metal Evolution: Ep. 11 Act 4',
+ 'description': 'Many rock academics have proclaimed that the truly progressive musicianship of the last 20 years has been found right here in the world of heavy metal, rather than obvious locales such as jazz, fusion or progressive rock. It stands to reason then, that much of this jaw-dropping virtuosity occurs within what\'s known as progressive metal, a genre that takes root with the likes of Rush in the \'70s, Queensryche and Fates Warning in the \'80s, and Dream Theater in the \'90s. Since then, the genre has exploded with creativity, spawning mind-bending, genre-defying acts such as Tool, Mastodon, Coheed And Cambria, Porcupine Tree, Meshuggah, A Perfect Circle and Opeth. Episode 11 looks at the extreme musicianship of these bands, as well as their often extreme literary prowess and conceptual strength, the end result being a rich level of respect and attention such challenging acts have brought upon the world of heavy metal, from a critical community usually dismissive of the form.'
+ }
+ },
+ {
+ 'md5': '93d554aaf79320703b73a95288c76a6e',
+ 'info_dict': {
+ 'id': '731570',
+ 'ext': 'mp4',
+ 'title': 'Metal Evolution: Ep. 11 Act 5',
+ 'description': 'Many rock academics have proclaimed that the truly progressive musicianship of the last 20 years has been found right here in the world of heavy metal, rather than obvious locales such as jazz, fusion or progressive rock. It stands to reason then, that much of this jaw-dropping virtuosity occurs within what\'s known as progressive metal, a genre that takes root with the likes of Rush in the \'70s, Queensryche and Fates Warning in the \'80s, and Dream Theater in the \'90s. Since then, the genre has exploded with creativity, spawning mind-bending, genre-defying acts such as Tool, Mastodon, Coheed And Cambria, Porcupine Tree, Meshuggah, A Perfect Circle and Opeth. Episode 11 looks at the extreme musicianship of these bands, as well as their often extreme literary prowess and conceptual strength, the end result being a rich level of respect and attention such challenging acts have brought upon the world of heavy metal, from a critical community usually dismissive of the form.'
+ }
+ }
+ ],
+ 'skip': 'Blocked outside the US',
+ }, {
+ # Clip
+ 'url': 'http://www.vh1.com/video/misc/706675/metal-evolution-episode-1-pre-metal-show-clip.jhtml#id=1674118',
+ 'md5': '7d67cf6d9cdc6b4f3d3ac97a55403844',
+ 'info_dict': {
+ 'id': '706675',
+ 'ext': 'mp4',
+ 'title': 'Metal Evolution: Episode 1 Pre-Metal Show Clip',
+ 'description': 'The greatest documentary ever made about Heavy Metal begins as our host Sam Dunn travels the globe to seek out the origins and influences that helped create Heavy Metal. Sam speaks to legends like Kirk Hammett, Alice Cooper, Slash, Bill Ward, Geezer Butler, Tom Morello, Ace Frehley, Lemmy Kilmister, Dave Davies, and many many more. This episode is the prologue for the 11 hour series, and Sam goes back to the very beginning to reveal how Heavy Metal was created.'
+ },
+ 'skip': 'Blocked outside the US',
+ }, {
+ # Short link
+ 'url': 'http://www.vh1.com/video/play.jhtml?id=1678353',
+ 'md5': '853192b87ad978732b67dd8e549b266a',
+ 'info_dict': {
+ 'id': '730355',
+ 'ext': 'mp4',
+ 'title': 'Metal Evolution: Episode 11 Progressive Metal Sneak',
+ 'description': 'In Metal Evolution\'s finale sneak, Sam sits with Michael Giles of King Crimson and gets feedback from Metallica guitarist Kirk Hammett on why the group was influential.'
+ },
+ 'skip': 'Blocked outside the US',
+ }, {
+ 'url': 'http://www.vh1.com/video/macklemore-ryan-lewis/900535/cant-hold-us-ft-ray-dalton.jhtml',
+ 'md5': 'b1bcb5b4380c9d7f544065589432dee7',
+ 'info_dict': {
+ 'id': '900535',
+ 'ext': 'mp4',
+ 'title': 'Macklemore & Ryan Lewis - "Can\'t Hold Us ft. Ray Dalton"',
+ 'description': 'The Heist'
+ },
+ 'skip': 'Blocked outside the US',
+ }]
+
+ _VALID_URL = r'''(?x)
+ https?://www\.vh1\.com/video/
+ (?:
+ .+?/full-episodes/.+?/(?P<playlist_id>[^/]+)/playlist\.jhtml
+ |
+ (?:
+ play.jhtml\?id=|
+ misc/.+?/.+?\.jhtml\#id=
+ )
+ (?P<video_id>[0-9]+)$
+ |
+ [^/]+/(?P<music_id>[0-9]+)/[^/]+?
+ )
+ '''
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ if mobj.group('music_id'):
+ id_field = 'vid'
+ video_id = mobj.group('music_id')
+ else:
+ video_id = mobj.group('playlist_id') or mobj.group('video_id')
+ id_field = 'id'
+ doc_url = '%s?%s=%s' % (self._FEED_URL, id_field, video_id)
+
+ idoc = self._download_xml(
+ doc_url, video_id,
+ 'Downloading info', transform_source=fix_xml_ampersands)
+ return [self._get_video_info(item) for item in idoc.findall('.//item')]
diff --git a/youtube_dl/extractor/vice.py b/youtube_dl/extractor/vice.py
deleted file mode 100644
index 87812d6..0000000
--- a/youtube_dl/extractor/vice.py
+++ /dev/null
@@ -1,38 +0,0 @@
-import re
-
-from .common import InfoExtractor
-from .ooyala import OoyalaIE
-from ..utils import ExtractorError
-
-
-class ViceIE(InfoExtractor):
- _VALID_URL = r'http://www\.vice\.com/.*?/(?P<name>.+)'
-
- _TEST = {
- u'url': u'http://www.vice.com/Fringes/cowboy-capitalists-part-1',
- u'file': u'43cW1mYzpia9IlestBjVpd23Yu3afAfp.mp4',
- u'info_dict': {
- u'title': u'VICE_COWBOYCAPITALISTS_PART01_v1_VICE_WM_1080p.mov',
- },
- u'params': {
- # Requires ffmpeg (m3u8 manifest)
- u'skip_download': True,
- },
- }
-
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- name = mobj.group('name')
- webpage = self._download_webpage(url, name)
- try:
- ooyala_url = self._og_search_video_url(webpage)
- except ExtractorError:
- try:
- embed_code = self._search_regex(
- r'OO.Player.create\(\'ooyalaplayer\', \'(.+?)\'', webpage,
- u'ooyala embed code')
- ooyala_url = OoyalaIE._url_for_embed_code(embed_code)
- except ExtractorError:
- raise ExtractorError(u'The page doesn\'t contain a video', expected=True)
- return self.url_result(ooyala_url, ie='Ooyala')
-
diff --git a/youtube_dl/extractor/videobam.py b/youtube_dl/extractor/videobam.py
new file mode 100644
index 0000000..fed95ef
--- /dev/null
+++ b/youtube_dl/extractor/videobam.py
@@ -0,0 +1,81 @@
+from __future__ import unicode_literals
+
+import re
+import json
+
+from .common import InfoExtractor
+from ..utils import int_or_none
+
+
+class VideoBamIE(InfoExtractor):
+ _VALID_URL = r'http://(?:www\.)?videobam\.com/(?:videos/download/)?(?P<id>[a-zA-Z]+)'
+
+ _TESTS = [
+ {
+ 'url': 'http://videobam.com/OiJQM',
+ 'md5': 'db471f27763a531f10416a0c58b5a1e0',
+ 'info_dict': {
+ 'id': 'OiJQM',
+ 'ext': 'mp4',
+ 'title': 'Is Alcohol Worse Than Ecstasy?',
+ 'description': 'md5:d25b96151515c91debc42bfbb3eb2683',
+ 'uploader': 'frihetsvinge',
+ },
+ },
+ {
+ 'url': 'http://videobam.com/pqLvq',
+ 'md5': 'd9a565b5379a99126ef94e1d7f9a383e',
+ 'note': 'HD video',
+ 'info_dict': {
+ 'id': 'pqLvq',
+ 'ext': 'mp4',
+ 'title': '_',
+ }
+ },
+ ]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+
+ page = self._download_webpage('http://videobam.com/%s' % video_id, video_id, 'Downloading page')
+
+ formats = []
+
+ for preference, format_id in enumerate(['low', 'high']):
+ mobj = re.search(r"%s: '(?P<url>[^']+)'" % format_id, page)
+ if not mobj:
+ continue
+ formats.append({
+ 'url': mobj.group('url'),
+ 'ext': 'mp4',
+ 'format_id': format_id,
+ 'preference': preference,
+ })
+
+ if not formats:
+ player_config = json.loads(self._html_search_regex(r'var player_config = ({.+?});', page, 'player config'))
+ formats = [{
+ 'url': item['url'],
+ 'ext': 'mp4',
+ } for item in player_config['playlist'] if 'autoPlay' in item]
+
+ self._sort_formats(formats)
+
+ title = self._og_search_title(page, default='_', fatal=False)
+ description = self._og_search_description(page, default=None)
+ thumbnail = self._og_search_thumbnail(page)
+ uploader = self._html_search_regex(r'Upload by ([^<]+)</a>', page, 'uploader', fatal=False, default=None)
+ view_count = int_or_none(
+ self._html_search_regex(r'<strong>Views:</strong> (\d+) ', page, 'view count', fatal=False))
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'uploader': uploader,
+ 'view_count': view_count,
+ 'formats': formats,
+ 'age_limit': 18,
+ } \ No newline at end of file
diff --git a/youtube_dl/extractor/videodetective.py b/youtube_dl/extractor/videodetective.py
index 265dd5b..ac6c255 100644
--- a/youtube_dl/extractor/videodetective.py
+++ b/youtube_dl/extractor/videodetective.py
@@ -1,22 +1,23 @@
+from __future__ import unicode_literals
+
import re
from .common import InfoExtractor
from .internetvideoarchive import InternetVideoArchiveIE
-from ..utils import (
- compat_urlparse,
-)
+from ..utils import compat_urlparse
class VideoDetectiveIE(InfoExtractor):
_VALID_URL = r'https?://www\.videodetective\.com/[^/]+/[^/]+/(?P<id>\d+)'
_TEST = {
- u'url': u'http://www.videodetective.com/movies/kick-ass-2/194487',
- u'file': u'194487.mp4',
- u'info_dict': {
- u'title': u'KICK-ASS 2',
- u'description': u'md5:65ba37ad619165afac7d432eaded6013',
- u'duration': 135,
+ 'url': 'http://www.videodetective.com/movies/kick-ass-2/194487',
+ 'info_dict': {
+ 'id': '194487',
+ 'ext': 'mp4',
+ 'title': 'KICK-ASS 2',
+ 'description': 'md5:65ba37ad619165afac7d432eaded6013',
+ 'duration': 135,
},
}
@@ -26,5 +27,4 @@ class VideoDetectiveIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
og_video = self._og_search_video_url(webpage)
query = compat_urlparse.urlparse(og_video).query
- return self.url_result(InternetVideoArchiveIE._build_url(query),
- ie=InternetVideoArchiveIE.ie_key())
+ return self.url_result(InternetVideoArchiveIE._build_url(query), ie=InternetVideoArchiveIE.ie_key())
diff --git a/youtube_dl/extractor/videolecturesnet.py b/youtube_dl/extractor/videolecturesnet.py
new file mode 100644
index 0000000..ebd2a3d
--- /dev/null
+++ b/youtube_dl/extractor/videolecturesnet.py
@@ -0,0 +1,70 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ find_xpath_attr,
+ int_or_none,
+ parse_duration,
+ unified_strdate,
+)
+
+
+class VideoLecturesNetIE(InfoExtractor):
+ _VALID_URL = r'http://(?:www\.)?videolectures\.net/(?P<id>[^/#?]+)/'
+ IE_NAME = 'videolectures.net'
+
+ _TEST = {
+ 'url': 'http://videolectures.net/promogram_igor_mekjavic_eng/',
+ 'info_dict': {
+ 'id': 'promogram_igor_mekjavic_eng',
+ 'ext': 'mp4',
+ 'title': 'Automatics, robotics and biocybernetics',
+ 'description': 'md5:815fc1deb6b3a2bff99de2d5325be482',
+ 'upload_date': '20130627',
+ 'duration': 565,
+ 'thumbnail': 're:http://.*\.jpg',
+ },
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+
+ smil_url = 'http://videolectures.net/%s/video/1/smil.xml' % video_id
+ smil = self._download_xml(smil_url, video_id)
+
+ title = find_xpath_attr(smil, './/meta', 'name', 'title').attrib['content']
+ description_el = find_xpath_attr(smil, './/meta', 'name', 'abstract')
+ description = (
+ None if description_el is None
+ else description_el.attrib['content'])
+ upload_date = unified_strdate(
+ find_xpath_attr(smil, './/meta', 'name', 'date').attrib['content'])
+
+ switch = smil.find('.//switch')
+ duration = parse_duration(switch.attrib.get('dur'))
+ thumbnail_el = find_xpath_attr(switch, './image', 'type', 'thumbnail')
+ thumbnail = (
+ None if thumbnail_el is None else thumbnail_el.attrib.get('src'))
+
+ formats = [{
+ 'url': v.attrib['src'],
+ 'width': int_or_none(v.attrib.get('width')),
+ 'height': int_or_none(v.attrib.get('height')),
+ 'filesize': int_or_none(v.attrib.get('size')),
+ 'tbr': int_or_none(v.attrib.get('systemBitrate')) / 1000.0,
+ 'ext': v.attrib.get('ext'),
+ } for v in switch.findall('./video')
+ if v.attrib.get('proto') == 'http']
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'upload_date': upload_date,
+ 'duration': duration,
+ 'thumbnail': thumbnail,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/videott.py b/youtube_dl/extractor/videott.py
new file mode 100644
index 0000000..b5034b0
--- /dev/null
+++ b/youtube_dl/extractor/videott.py
@@ -0,0 +1,58 @@
+from __future__ import unicode_literals
+
+import re
+import base64
+
+from .common import InfoExtractor
+from ..utils import unified_strdate
+
+
+class VideoTtIE(InfoExtractor):
+ ID_NAME = 'video.tt'
+ IE_DESC = 'video.tt - Your True Tube'
+ _VALID_URL = r'http://(?:www\.)?video\.tt/(?:video/|watch_video\.php\?v=)(?P<id>[\da-zA-Z]{9})'
+
+ _TEST = {
+ 'url': 'http://www.video.tt/watch_video.php?v=amd5YujV8',
+ 'md5': 'b13aa9e2f267effb5d1094443dff65ba',
+ 'info_dict': {
+ 'id': 'amd5YujV8',
+ 'ext': 'flv',
+ 'title': 'Motivational video Change your mind in just 2.50 mins',
+ 'description': '',
+ 'upload_date': '20130827',
+ 'uploader': 'joseph313',
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+
+ settings = self._download_json(
+ 'http://www.video.tt/player_control/settings.php?v=%s' % video_id, video_id,
+ 'Downloading video JSON')['settings']
+
+ video = settings['video_details']['video']
+
+ formats = [
+ {
+ 'url': base64.b64decode(res['u']).decode('utf-8'),
+ 'ext': 'flv',
+ 'format_id': res['l'],
+ } for res in settings['res'] if res['u']
+ ]
+
+ return {
+ 'id': video_id,
+ 'title': video['title'],
+ 'description': video['description'],
+ 'thumbnail': settings['config']['thumbnail'],
+ 'upload_date': unified_strdate(video['added']),
+ 'uploader': video['owner'],
+ 'view_count': int(video['view_count']),
+ 'comment_count': int(video['comment_count']),
+ 'like_count': int(video['liked']),
+ 'dislike_count': int(video['disliked']),
+ 'formats': formats,
+ } \ No newline at end of file
diff --git a/youtube_dl/extractor/videoweed.py b/youtube_dl/extractor/videoweed.py
new file mode 100644
index 0000000..4a08ddd
--- /dev/null
+++ b/youtube_dl/extractor/videoweed.py
@@ -0,0 +1,26 @@
+from __future__ import unicode_literals
+
+from .novamov import NovaMovIE
+
+
+class VideoWeedIE(NovaMovIE):
+ IE_NAME = 'videoweed'
+ IE_DESC = 'VideoWeed'
+
+ _VALID_URL = NovaMovIE._VALID_URL_TEMPLATE % {'host': 'videoweed\.(?:es|com)'}
+
+ _HOST = 'www.videoweed.es'
+
+ _FILE_DELETED_REGEX = r'>This file no longer exists on our servers.<'
+ _TITLE_REGEX = r'<h1 class="text_shadow">([^<]+)</h1>'
+
+ _TEST = {
+ 'url': 'http://www.videoweed.es/file/b42178afbea14',
+ 'md5': 'abd31a2132947262c50429e1d16c1bfd',
+ 'info_dict': {
+ 'id': 'b42178afbea14',
+ 'ext': 'flv',
+ 'title': 'optical illusion dissapeared image magic illusion',
+ 'description': ''
+ },
+ } \ No newline at end of file
diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py
index 2206a06..15f3152 100644
--- a/youtube_dl/extractor/viki.py
+++ b/youtube_dl/extractor/viki.py
@@ -1,29 +1,33 @@
+from __future__ import unicode_literals
+
import re
from ..utils import (
ExtractorError,
unescapeHTML,
unified_strdate,
+ US_RATINGS,
)
from .subtitles import SubtitlesInfoExtractor
class VikiIE(SubtitlesInfoExtractor):
- IE_NAME = u'viki'
+ IE_NAME = 'viki'
_VALID_URL = r'^https?://(?:www\.)?viki\.com/videos/(?P<id>[0-9]+v)'
_TEST = {
- u'url': u'http://www.viki.com/videos/1023585v-heirs-episode-14',
- u'file': u'1023585v.mp4',
- u'md5': u'a21454021c2646f5433514177e2caa5f',
- u'info_dict': {
- u'title': u'Heirs Episode 14',
- u'uploader': u'SBS',
- u'description': u'md5:c4b17b9626dd4b143dcc4d855ba3474e',
- u'upload_date': u'20131121',
- u'age_limit': 13,
+ 'url': 'http://www.viki.com/videos/1023585v-heirs-episode-14',
+ 'md5': 'a21454021c2646f5433514177e2caa5f',
+ 'info_dict': {
+ 'id': '1023585v',
+ 'ext': 'mp4',
+ 'title': 'Heirs Episode 14',
+ 'uploader': 'SBS',
+ 'description': 'md5:c4b17b9626dd4b143dcc4d855ba3474e',
+ 'upload_date': '20131121',
+ 'age_limit': 13,
},
- u'skip': u'Blocked in the US',
+ 'skip': 'Blocked in the US',
}
def _real_extract(self, url):
@@ -44,28 +48,21 @@ class VikiIE(SubtitlesInfoExtractor):
rating_str = self._html_search_regex(
r'<strong>Rating: </strong>\s*([^<]*)<', webpage,
- u'rating information', default='').strip()
- RATINGS = {
- 'G': 0,
- 'PG': 10,
- 'PG-13': 13,
- 'R': 16,
- 'NC': 18,
- }
- age_limit = RATINGS.get(rating_str)
+ 'rating information', default='').strip()
+ age_limit = US_RATINGS.get(rating_str)
info_url = 'http://www.viki.com/player5_fragment/%s?action=show&controller=videos' % video_id
info_webpage = self._download_webpage(
- info_url, video_id, note=u'Downloading info page')
+ info_url, video_id, note='Downloading info page')
if re.match(r'\s*<div\s+class="video-error', info_webpage):
raise ExtractorError(
- u'Video %s is blocked from your location.' % video_id,
+ 'Video %s is blocked from your location.' % video_id,
expected=True)
video_url = self._html_search_regex(
- r'<source[^>]+src="([^"]+)"', info_webpage, u'video URL')
+ r'<source[^>]+src="([^"]+)"', info_webpage, 'video URL')
upload_date_str = self._html_search_regex(
- r'"created_at":"([^"]+)"', info_webpage, u'upload date')
+ r'"created_at":"([^"]+)"', info_webpage, 'upload date')
upload_date = (
unified_strdate(upload_date_str)
if upload_date_str is not None
diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py
index 4bc2620..2558555 100644
--- a/youtube_dl/extractor/vimeo.py
+++ b/youtube_dl/extractor/vimeo.py
@@ -8,6 +8,7 @@ import itertools
from .common import InfoExtractor
from .subtitles import SubtitlesInfoExtractor
from ..utils import (
+ compat_HTTPError,
compat_urllib_parse,
compat_urllib_request,
clean_html,
@@ -16,10 +17,39 @@ from ..utils import (
RegexNotFoundError,
std_headers,
unsmuggle_url,
+ urlencode_postdata,
+ int_or_none,
)
-class VimeoIE(SubtitlesInfoExtractor):
+class VimeoBaseInfoExtractor(InfoExtractor):
+ _NETRC_MACHINE = 'vimeo'
+ _LOGIN_REQUIRED = False
+
+ def _login(self):
+ (username, password) = self._get_login_info()
+ if username is None:
+ if self._LOGIN_REQUIRED:
+ raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
+ return
+ self.report_login()
+ login_url = 'https://vimeo.com/log_in'
+ webpage = self._download_webpage(login_url, None, False)
+ token = self._search_regex(r'xsrft: \'(.*?)\'', webpage, 'login token')
+ data = urlencode_postdata({
+ 'email': username,
+ 'password': password,
+ 'action': 'login',
+ 'service': 'vimeo',
+ 'token': token,
+ })
+ login_request = compat_urllib_request.Request(login_url, data)
+ login_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
+ login_request.add_header('Cookie', 'xsrft=%s' % token)
+ self._download_webpage(login_request, None, False, 'Wrong login info')
+
+
+class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor):
"""Information extractor for vimeo.com."""
# _VALID_URL matches Vimeo URLs
@@ -32,53 +62,60 @@ class VimeoIE(SubtitlesInfoExtractor):
(?:videos?/)?
(?P<id>[0-9]+)
/?(?:[?&].*)?(?:[#].*)?$'''
- _NETRC_MACHINE = 'vimeo'
IE_NAME = 'vimeo'
_TESTS = [
{
'url': 'http://vimeo.com/56015672#at=0',
- 'file': '56015672.mp4',
'md5': '8879b6cc097e987f02484baf890129e5',
'info_dict': {
- "upload_date": "20121220",
- "description": "This is a test case for youtube-dl.\nFor more information, see github.com/rg3/youtube-dl\nTest chars: \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550",
- "uploader_id": "user7108434",
- "uploader": "Filippo Valsorda",
+ 'id': '56015672',
+ 'ext': 'mp4',
+ "upload_date": "20121220",
+ "description": "This is a test case for youtube-dl.\nFor more information, see github.com/rg3/youtube-dl\nTest chars: \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550",
+ "uploader_id": "user7108434",
+ "uploader": "Filippo Valsorda",
"title": "youtube-dl test video - \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550",
+ "duration": 10,
},
},
{
'url': 'http://vimeopro.com/openstreetmapus/state-of-the-map-us-2013/video/68093876',
- 'file': '68093876.mp4',
'md5': '3b5ca6aa22b60dfeeadf50b72e44ed82',
'note': 'Vimeo Pro video (#1197)',
'info_dict': {
+ 'id': '68093876',
+ 'ext': 'mp4',
'uploader_id': 'openstreetmapus',
'uploader': 'OpenStreetMap US',
'title': 'Andy Allan - Putting the Carto into OpenStreetMap Cartography',
+ 'duration': 1595,
},
},
{
'url': 'http://player.vimeo.com/video/54469442',
- 'file': '54469442.mp4',
'md5': '619b811a4417aa4abe78dc653becf511',
'note': 'Videos that embed the url in the player page',
'info_dict': {
+ 'id': '54469442',
+ 'ext': 'mp4',
'title': 'Kathy Sierra: Building the minimum Badass User, Business of Software',
'uploader': 'The BLN & Business of Software',
'uploader_id': 'theblnbusinessofsoftware',
+ 'duration': 3610,
},
},
{
'url': 'http://vimeo.com/68375962',
- 'file': '68375962.mp4',
'md5': 'aaf896bdb7ddd6476df50007a0ac0ae7',
'note': 'Video protected with password',
'info_dict': {
+ 'id': '68375962',
+ 'ext': 'mp4',
'title': 'youtube-dl password protected test video',
'upload_date': '20130614',
'uploader_id': 'user18948128',
'uploader': 'Jaime Marquínez Ferrándiz',
+ 'duration': 10,
},
'params': {
'videopassword': 'youtube-dl',
@@ -96,42 +133,35 @@ class VimeoIE(SubtitlesInfoExtractor):
'upload_date': '20131015',
'uploader_id': 'staff',
'uploader': 'Vimeo Staff',
+ 'duration': 62,
}
},
]
- def _login(self):
- (username, password) = self._get_login_info()
- if username is None:
- return
- self.report_login()
- login_url = 'https://vimeo.com/log_in'
- webpage = self._download_webpage(login_url, None, False)
- token = self._search_regex(r'xsrft: \'(.*?)\'', webpage, 'login token')
- data = compat_urllib_parse.urlencode({'email': username,
- 'password': password,
- 'action': 'login',
- 'service': 'vimeo',
- 'token': token,
- })
- login_request = compat_urllib_request.Request(login_url, data)
- login_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
- login_request.add_header('Cookie', 'xsrft=%s' % token)
- self._download_webpage(login_request, None, False, 'Wrong login info')
+ @classmethod
+ def suitable(cls, url):
+ if VimeoChannelIE.suitable(url):
+ # Otherwise channel urls like http://vimeo.com/channels/31259 would
+ # match
+ return False
+ else:
+ return super(VimeoIE, cls).suitable(url)
def _verify_video_password(self, url, video_id, webpage):
password = self._downloader.params.get('videopassword', None)
if password is None:
raise ExtractorError('This video is protected by a password, use the --video-password option')
token = self._search_regex(r'xsrft: \'(.*?)\'', webpage, 'login token')
- data = compat_urllib_parse.urlencode({'password': password,
- 'token': token})
+ data = compat_urllib_parse.urlencode({
+ 'password': password,
+ 'token': token,
+ })
# I didn't manage to use the password with https
if url.startswith('https'):
- pass_url = url.replace('https','http')
+ pass_url = url.replace('https', 'http')
else:
pass_url = url
- password_request = compat_urllib_request.Request(pass_url+'/password', data)
+ password_request = compat_urllib_request.Request(pass_url + '/password', data)
password_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
password_request.add_header('Cookie', 'xsrft=%s' % token)
self._download_webpage(password_request, video_id,
@@ -171,7 +201,18 @@ class VimeoIE(SubtitlesInfoExtractor):
# Retrieve video webpage to extract further information
request = compat_urllib_request.Request(url, None, headers)
- webpage = self._download_webpage(request, video_id)
+ try:
+ webpage = self._download_webpage(request, video_id)
+ except ExtractorError as ee:
+ if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403:
+ errmsg = ee.cause.read()
+ if b'Because of its privacy settings, this video cannot be played here' in errmsg:
+ raise ExtractorError(
+ 'Cannot download embed-only video without embedding '
+ 'URL. Please call youtube-dl with the URL of the page '
+ 'that embeds this video.',
+ expected=True)
+ raise
# Now we begin extracting as much information as we can from what we
# retrieved. First we extract the information common to all extractors,
@@ -220,13 +261,16 @@ class VimeoIE(SubtitlesInfoExtractor):
# Extract video thumbnail
video_thumbnail = config["video"].get("thumbnail")
if video_thumbnail is None:
- _, video_thumbnail = sorted((int(width), t_url) for (width, t_url) in config["video"]["thumbs"].items())[-1]
+ video_thumbs = config["video"].get("thumbs")
+ if video_thumbs and isinstance(video_thumbs, dict):
+ _, video_thumbnail = sorted((int(width), t_url) for (width, t_url) in video_thumbs.items())[-1]
# Extract video description
video_description = None
try:
- video_description = get_element_by_attribute("itemprop", "description", webpage)
- if video_description: video_description = clean_html(video_description)
+ video_description = get_element_by_attribute("class", "description_wrapper", webpage)
+ if video_description:
+ video_description = clean_html(video_description)
except AssertionError as err:
# On some pages like (http://player.vimeo.com/video/54469442) the
# html tags are not closed, python 2.6 cannot handle it
@@ -235,6 +279,9 @@ class VimeoIE(SubtitlesInfoExtractor):
else:
raise
+ # Extract video duration
+ video_duration = int_or_none(config["video"].get("duration"))
+
# Extract upload date
video_upload_date = None
mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
@@ -272,7 +319,7 @@ class VimeoIE(SubtitlesInfoExtractor):
file_info = {}
if video_url is None:
video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
- %(video_id, sig, timestamp, quality, codec_name.upper())
+ % (video_id, sig, timestamp, quality, codec_name.upper())
files[key].append({
'ext': codec_extension,
@@ -306,6 +353,7 @@ class VimeoIE(SubtitlesInfoExtractor):
'title': video_title,
'thumbnail': video_thumbnail,
'description': video_description,
+ 'duration': video_duration,
'formats': formats,
'webpage_url': url,
'view_count': view_count,
@@ -317,7 +365,7 @@ class VimeoIE(SubtitlesInfoExtractor):
class VimeoChannelIE(InfoExtractor):
IE_NAME = 'vimeo:channel'
- _VALID_URL = r'(?:https?://)?vimeo\.com/channels/(?P<id>[^/]+)'
+ _VALID_URL = r'(?:https?://)?vimeo\.com/channels/(?P<id>[^/]+)/?(\?.*)?$'
_MORE_PAGES_INDICATOR = r'<a.+?rel="next"'
_TITLE_RE = r'<link rel="alternate"[^>]+?title="(.*?)"'
@@ -331,7 +379,7 @@ class VimeoChannelIE(InfoExtractor):
video_ids = []
for pagenum in itertools.count(1):
webpage = self._download_webpage(
- self._page_url(base_url, pagenum) ,list_id,
+ self._page_url(base_url, pagenum), list_id,
'Downloading page %s' % pagenum)
video_ids.extend(re.findall(r'id="clip_(\d+?)"', webpage))
if re.search(self._MORE_PAGES_INDICATOR, webpage, re.DOTALL) is None:
@@ -347,7 +395,7 @@ class VimeoChannelIE(InfoExtractor):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- channel_id = mobj.group('id')
+ channel_id = mobj.group('id')
return self._extract_videos(channel_id, 'http://vimeo.com/channels/%s' % channel_id)
@@ -414,3 +462,25 @@ class VimeoReviewIE(InfoExtractor):
video_id = mobj.group('id')
player_url = 'https://player.vimeo.com/player/' + video_id
return self.url_result(player_url, 'Vimeo', video_id)
+
+
+class VimeoWatchLaterIE(VimeoBaseInfoExtractor, VimeoChannelIE):
+ IE_NAME = 'vimeo:watchlater'
+ IE_DESC = 'Vimeo watch later list, "vimeowatchlater" keyword (requires authentication)'
+ _VALID_URL = r'https?://vimeo\.com/home/watchlater|:vimeowatchlater'
+ _LOGIN_REQUIRED = True
+ _TITLE_RE = r'href="/home/watchlater".*?>(.*?)<'
+
+ def _real_initialize(self):
+ self._login()
+
+ def _page_url(self, base_url, pagenum):
+ url = '%s/page:%d/' % (base_url, pagenum)
+ request = compat_urllib_request.Request(url)
+ # Set the header to get a partial html page with the ids,
+ # the normal page doesn't contain them.
+ request.add_header('X-Requested-With', 'XMLHttpRequest')
+ return request
+
+ def _real_extract(self, url):
+ return self._extract_videos('watchlater', 'https://vimeo.com/home/watchlater')
diff --git a/youtube_dl/extractor/vine.py b/youtube_dl/extractor/vine.py
index e14ff91..076c871 100644
--- a/youtube_dl/extractor/vine.py
+++ b/youtube_dl/extractor/vine.py
@@ -1,8 +1,11 @@
from __future__ import unicode_literals
import re
+import json
+import itertools
from .common import InfoExtractor
+from ..utils import unified_strdate
class VineIE(InfoExtractor):
@@ -13,31 +16,76 @@ class VineIE(InfoExtractor):
'info_dict': {
'id': 'b9KOOWX7HUx',
'ext': 'mp4',
- 'uploader': 'Jack Dorsey',
'title': 'Chicken.',
+ 'description': 'Chicken.',
+ 'upload_date': '20130519',
+ 'uploader': 'Jack Dorsey',
+ 'uploader_id': '76',
},
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
-
video_id = mobj.group('id')
- webpage_url = 'https://vine.co/v/' + video_id
- webpage = self._download_webpage(webpage_url, video_id)
- self.report_extraction(video_id)
+ webpage = self._download_webpage('https://vine.co/v/' + video_id, video_id)
- video_url = self._html_search_meta('twitter:player:stream', webpage,
- 'video URL')
+ data = json.loads(self._html_search_regex(
+ r'window\.POST_DATA = { %s: ({.+?}) }' % video_id, webpage, 'vine data'))
- uploader = self._html_search_regex(r'<p class="username">(.*?)</p>',
- webpage, 'uploader', fatal=False, flags=re.DOTALL)
+ formats = [
+ {
+ 'url': data['videoLowURL'],
+ 'ext': 'mp4',
+ 'format_id': 'low',
+ },
+ {
+ 'url': data['videoUrl'],
+ 'ext': 'mp4',
+ 'format_id': 'standard',
+ }
+ ]
return {
'id': video_id,
- 'url': video_url,
- 'ext': 'mp4',
'title': self._og_search_title(webpage),
- 'thumbnail': self._og_search_thumbnail(webpage),
- 'uploader': uploader,
+ 'description': data['description'],
+ 'thumbnail': data['thumbnailUrl'],
+ 'upload_date': unified_strdate(data['created']),
+ 'uploader': data['username'],
+ 'uploader_id': data['userIdStr'],
+ 'like_count': data['likes']['count'],
+ 'comment_count': data['comments']['count'],
+ 'repost_count': data['reposts']['count'],
+ 'formats': formats,
}
+
+
+class VineUserIE(InfoExtractor):
+ IE_NAME = 'vine:user'
+ _VALID_URL = r'(?:https?://)?vine\.co/(?P<user>[^/]+)/?(\?.*)?$'
+ _VINE_BASE_URL = "https://vine.co/"
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ user = mobj.group('user')
+
+ profile_url = "%sapi/users/profiles/vanity/%s" % (
+ self._VINE_BASE_URL, user)
+ profile_data = self._download_json(
+ profile_url, user, note='Downloading user profile data')
+
+ user_id = profile_data['data']['userId']
+ timeline_data = []
+ for pagenum in itertools.count(1):
+ timeline_url = "%sapi/timelines/users/%s?page=%s" % (
+ self._VINE_BASE_URL, user_id, pagenum)
+ timeline_page = self._download_json(
+ timeline_url, user, note='Downloading page %d' % pagenum)
+ timeline_data.extend(timeline_page['data']['records'])
+ if timeline_page['data']['nextPage'] is None:
+ break
+
+ entries = [
+ self.url_result(e['permalinkUrl'], 'Vine') for e in timeline_data]
+ return self.playlist_result(entries, user)
diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py
index a293b88..fb082f3 100644
--- a/youtube_dl/extractor/vk.py
+++ b/youtube_dl/extractor/vk.py
@@ -16,7 +16,7 @@ from ..utils import (
class VKIE(InfoExtractor):
IE_NAME = 'vk.com'
- _VALID_URL = r'https?://vk\.com/(?:videos.*?\?.*?z=)?video(?P<id>.*?)(?:\?|%2F|$)'
+ _VALID_URL = r'https?://vk\.com/(?:video_ext\.php\?.*?\boid=(?P<oid>-?\d+).*?\bid=(?P<id>\d+)|(?:videos.*?\?.*?z=)?video(?P<videoid>.*?)(?:\?|%2F|$))'
_NETRC_MACHINE = 'vk'
_TESTS = [
@@ -37,12 +37,24 @@ class VKIE(InfoExtractor):
'info_dict': {
'id': '163339118',
'ext': 'mp4',
- 'uploader': 'Elvira Dzhonik',
+ 'uploader': 'Elya Iskhakova',
'title': 'Dream Theater - Hollow Years Live at Budokan 720*',
'duration': 558,
}
},
{
+ 'note': 'Embedded video',
+ 'url': 'http://vk.com/video_ext.php?oid=32194266&id=162925554&hash=7d8c2e0d5e05aeaa&hd=1',
+ 'md5': 'c7ce8f1f87bec05b3de07fdeafe21a0a',
+ 'info_dict': {
+ 'id': '162925554',
+ 'ext': 'mp4',
+ 'uploader': 'Vladimir Gavrin',
+ 'title': 'Lin Dan',
+ 'duration': 101,
+ }
+ },
+ {
'url': 'http://vk.com/video-8871596_164049491',
'md5': 'a590bcaf3d543576c9bd162812387666',
'note': 'Only available for registered users',
@@ -54,7 +66,7 @@ class VKIE(InfoExtractor):
'duration': 8352,
},
'skip': 'Requires vk account credentials',
- }
+ },
]
def _login(self):
@@ -82,7 +94,10 @@ class VKIE(InfoExtractor):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = mobj.group('videoid')
+
+ if not video_id:
+ video_id = '%s_%s' % (mobj.group('oid'), mobj.group('id'))
info_url = 'http://vk.com/al_video.php?act=show&al=1&video=%s' % video_id
info_page = self._download_webpage(info_url, video_id)
@@ -93,7 +108,7 @@ class VKIE(InfoExtractor):
m_yt = re.search(r'src="(http://www.youtube.com/.*?)"', info_page)
if m_yt is not None:
- self.to_screen(u'Youtube video detected')
+ self.to_screen('Youtube video detected')
return self.url_result(m_yt.group(1), 'Youtube')
data_json = self._search_regex(r'var vars = ({.*?});', info_page, 'vars')
data = json.loads(data_json)
diff --git a/youtube_dl/extractor/vube.py b/youtube_dl/extractor/vube.py
index fbdff47..7b77865 100644
--- a/youtube_dl/extractor/vube.py
+++ b/youtube_dl/extractor/vube.py
@@ -1,47 +1,69 @@
from __future__ import unicode_literals
import re
-import datetime
from .common import InfoExtractor
+from ..utils import int_or_none
class VubeIE(InfoExtractor):
IE_NAME = 'vube'
IE_DESC = 'Vube.com'
- _VALID_URL = r'http://vube\.com/[^/]+/(?P<id>[\da-zA-Z]{10})'
+ _VALID_URL = r'http://vube\.com/(?:[^/]+/)+(?P<id>[\da-zA-Z]{10})\b'
- _TEST = {
- 'url': 'http://vube.com/Chiara+Grispo+Video+Channel/YL2qNPkqon',
- 'md5': 'f81dcf6d0448e3291f54380181695821',
- 'info_dict': {
- 'id': 'YL2qNPkqon',
- 'ext': 'mp4',
- 'title': 'Chiara Grispo - Price Tag by Jessie J',
- 'description': 'md5:8ea652a1f36818352428cb5134933313',
- 'thumbnail': 'http://frame.thestaticvube.com/snap/228x128/102e7e63057-5ebc-4f5c-4065-6ce4ebde131f.jpg',
- 'uploader': 'Chiara.Grispo',
- 'uploader_id': '1u3hX0znhP',
- 'upload_date': '20140103',
- 'duration': 170.56
+ _TESTS = [
+ {
+ 'url': 'http://vube.com/Chiara+Grispo+Video+Channel/YL2qNPkqon',
+ 'md5': 'db7aba89d4603dadd627e9d1973946fe',
+ 'info_dict': {
+ 'id': 'YL2qNPkqon',
+ 'ext': 'mp4',
+ 'title': 'Chiara Grispo - Price Tag by Jessie J',
+ 'description': 'md5:8ea652a1f36818352428cb5134933313',
+ 'thumbnail': 'http://frame.thestaticvube.com/snap/228x128/102e7e63057-5ebc-4f5c-4065-6ce4ebde131f.jpg',
+ 'uploader': 'Chiara.Grispo',
+ 'uploader_id': '1u3hX0znhP',
+ 'timestamp': 1388743358,
+ 'upload_date': '20140103',
+ 'duration': 170.56
+ }
+ },
+ {
+ 'url': 'http://vube.com/SerainaMusic/my-7-year-old-sister-and-i-singing-alive-by-krewella/UeBhTudbfS?t=s&n=1',
+ 'md5': '5d4a52492d76f72712117ce6b0d98d08',
+ 'info_dict': {
+ 'id': 'UeBhTudbfS',
+ 'ext': 'mp4',
+ 'title': 'My 7 year old Sister and I singing "Alive" by Krewella',
+ 'description': 'md5:40bcacb97796339f1690642c21d56f4a',
+ 'thumbnail': 'http://frame.thestaticvube.com/snap/228x128/102265d5a9f-0f17-4f6b-5753-adf08484ee1e.jpg',
+ 'uploader': 'Seraina',
+ 'uploader_id': 'XU9VE2BQ2q',
+ 'timestamp': 1396492438,
+ 'upload_date': '20140403',
+ 'duration': 240.107
+ }
}
- }
+ ]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
- video = self._download_json('http://vube.com/api/v2/video/%s' % video_id,
- video_id, 'Downloading video JSON')
+ video = self._download_json(
+ 'http://vube.com/api/v2/video/%s' % video_id, video_id, 'Downloading video JSON')
public_id = video['public_id']
- formats = [{'url': 'http://video.thestaticvube.com/video/%s/%s.mp4' % (fmt['media_resolution_id'], public_id),
- 'height': int(fmt['height']),
- 'abr': int(fmt['audio_bitrate']),
- 'vbr': int(fmt['video_bitrate']),
- 'format_id': fmt['media_resolution_id']
- } for fmt in video['mtm'] if fmt['transcoding_status'] == 'processed']
+ formats = [
+ {
+ 'url': 'http://video.thestaticvube.com/video/%s/%s.mp4' % (fmt['media_resolution_id'], public_id),
+ 'height': int(fmt['height']),
+ 'abr': int(fmt['audio_bitrate']),
+ 'vbr': int(fmt['video_bitrate']),
+ 'format_id': fmt['media_resolution_id']
+ } for fmt in video['mtm'] if fmt['transcoding_status'] == 'processed'
+ ]
self._sort_formats(formats)
@@ -52,16 +74,16 @@ class VubeIE(InfoExtractor):
thumbnail = 'http:' + thumbnail
uploader = video['user_alias']
uploader_id = video['user_url_id']
- upload_date = datetime.datetime.fromtimestamp(int(video['upload_time'])).strftime('%Y%m%d')
+ timestamp = int(video['upload_time'])
duration = video['duration']
- view_count = video['raw_view_count']
- like_count = video['total_likes']
- dislike_count= video['total_hates']
+ view_count = video.get('raw_view_count')
+ like_count = video.get('total_likes')
+ dislike_count= video.get('total_hates')
- comment = self._download_json('http://vube.com/api/video/%s/comment' % video_id,
- video_id, 'Downloading video comment JSON')
+ comment = self._download_json(
+ 'http://vube.com/api/video/%s/comment' % video_id, video_id, 'Downloading video comment JSON')
- comment_count = comment['total']
+ comment_count = int_or_none(comment.get('total'))
return {
'id': video_id,
@@ -71,10 +93,10 @@ class VubeIE(InfoExtractor):
'thumbnail': thumbnail,
'uploader': uploader,
'uploader_id': uploader_id,
- 'upload_date': upload_date,
+ 'timestamp': timestamp,
'duration': duration,
'view_count': view_count,
'like_count': like_count,
'dislike_count': dislike_count,
'comment_count': comment_count,
- } \ No newline at end of file
+ }
diff --git a/youtube_dl/extractor/vuclip.py b/youtube_dl/extractor/vuclip.py
new file mode 100644
index 0000000..fb0600f
--- /dev/null
+++ b/youtube_dl/extractor/vuclip.py
@@ -0,0 +1,66 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ compat_urllib_parse_urlparse,
+ parse_duration,
+ qualities,
+)
+
+
+class VuClipIE(InfoExtractor):
+ _VALID_URL = r'http://(?:m\.)?vuclip\.com/w\?.*?cid=(?P<id>[0-9]+)'
+
+ _TEST = {
+ 'url': 'http://m.vuclip.com/w?cid=843902317&fid=63532&z=1007&nvar&frm=index.html&bu=4757321434',
+ 'md5': '92ac9d1ccefec4f0bb474661ab144fcf',
+ 'info_dict': {
+ 'id': '843902317',
+ 'ext': '3gp',
+ 'title': 'Movie Trailer: Noah',
+ 'duration': 139,
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+
+ webpage = self._download_webpage(url, video_id)
+ ad_m = re.search(
+ r'''value="No.*?" onClick="location.href='([^"']+)'"''', webpage)
+ if ad_m:
+ urlr = compat_urllib_parse_urlparse(url)
+ adfree_url = urlr.scheme + '://' + urlr.netloc + ad_m.group(1)
+ webpage = self._download_webpage(
+ adfree_url, video_id, note='Download post-ad page')
+
+ links_code = self._search_regex(
+ r'(?s)<div class="social align_c".*?>(.*?)<hr\s*/?>', webpage,
+ 'links')
+ title = self._html_search_regex(
+ r'<title>(.*?)-\s*Vuclip</title>', webpage, 'title').strip()
+
+ quality_order = qualities(['Reg', 'Hi'])
+ formats = []
+ for url, q in re.findall(
+ r'<a href="(?P<url>[^"]+)".*?>(?P<q>[^<]+)</a>', links_code):
+ format_id = compat_urllib_parse_urlparse(url).scheme + '-' + q
+ formats.append({
+ 'format_id': format_id,
+ 'url': url,
+ 'quality': quality_order(q),
+ })
+ self._sort_formats(formats)
+
+ duration = parse_duration(self._search_regex(
+ r'\(([0-9:]+)\)</span></h1>', webpage, 'duration', fatal=False))
+
+ return {
+ 'id': video_id,
+ 'formats': formats,
+ 'title': title,
+ 'duration': duration,
+ }
diff --git a/youtube_dl/extractor/washingtonpost.py b/youtube_dl/extractor/washingtonpost.py
new file mode 100644
index 0000000..cb8f088
--- /dev/null
+++ b/youtube_dl/extractor/washingtonpost.py
@@ -0,0 +1,103 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ int_or_none,
+ strip_jsonp,
+)
+
+
+class WashingtonPostIE(InfoExtractor):
+ _VALID_URL = r'^https?://(?:www\.)?washingtonpost\.com/.*?/(?P<id>[^/]+)/(?:$|[?#])'
+ _TEST = {
+ 'url': 'http://www.washingtonpost.com/sf/national/2014/03/22/sinkhole-of-bureaucracy/',
+ 'playlist': [{
+ 'md5': 'c3f4b4922ffa259243f68e928db2db8c',
+ 'info_dict': {
+ 'id': 'fc433c38-b146-11e3-b8b3-44b1d1cd4c1f',
+ 'ext': 'mp4',
+ 'title': 'Breaking Points: The Paper Mine',
+ 'duration': 1287,
+ 'description': 'Overly complicated paper pushing is nothing new to government bureaucracy. But the way federal retirement applications are filed may be the most outdated. David Fahrenthold explains.',
+ 'uploader': 'The Washington Post',
+ 'timestamp': 1395527908,
+ 'upload_date': '20140322',
+ },
+ }, {
+ 'md5': 'f645a07652c2950cd9134bb852c5f5eb',
+ 'info_dict': {
+ 'id': '41255e28-b14a-11e3-b8b3-44b1d1cd4c1f',
+ 'ext': 'mp4',
+ 'title': 'The town bureaucracy sustains',
+ 'description': 'Underneath the friendly town of Boyers is a sea of government paperwork. In a disused limestone mine, hundreds of locals now track, file and process retirement applications for the federal government. We set out to find out what it\'s like to do paperwork 230 feet underground.',
+ 'duration': 2217,
+ 'timestamp': 1395528005,
+ 'upload_date': '20140322',
+ 'uploader': 'The Washington Post',
+ },
+ }]
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ page_id = mobj.group('id')
+
+ webpage = self._download_webpage(url, page_id)
+ title = self._og_search_title(webpage)
+ uuids = re.findall(r'data-video-uuid="([^"]+)"', webpage)
+ entries = []
+ for i, uuid in enumerate(uuids, start=1):
+ vinfo_all = self._download_json(
+ 'http://www.washingtonpost.com/posttv/c/videojson/%s?resType=jsonp' % uuid,
+ page_id,
+ transform_source=strip_jsonp,
+ note='Downloading information of video %d/%d' % (i, len(uuids))
+ )
+ vinfo = vinfo_all[0]['contentConfig']
+ uploader = vinfo.get('credits', {}).get('source')
+ timestamp = int_or_none(
+ vinfo.get('dateConfig', {}).get('dateFirstPublished'), 1000)
+
+ formats = [{
+ 'format_id': (
+ '%s-%s-%s' % (s.get('type'), s.get('width'), s.get('bitrate'))
+ if s.get('width')
+ else s.get('type')),
+ 'vbr': s.get('bitrate') if s.get('width') != 0 else None,
+ 'width': s.get('width'),
+ 'height': s.get('height'),
+ 'acodec': s.get('audioCodec'),
+ 'vcodec': s.get('videoCodec') if s.get('width') != 0 else 'none',
+ 'filesize': s.get('fileSize'),
+ 'url': s.get('url'),
+ 'ext': 'mp4',
+ 'protocol': {
+ 'MP4': 'http',
+ 'F4F': 'f4m',
+ }.get(s.get('type'))
+ } for s in vinfo.get('streams', [])]
+ source_media_url = vinfo.get('sourceMediaURL')
+ if source_media_url:
+ formats.append({
+ 'format_id': 'source_media',
+ 'url': source_media_url,
+ })
+ self._sort_formats(formats)
+ entries.append({
+ 'id': uuid,
+ 'title': vinfo['title'],
+ 'description': vinfo.get('blurb'),
+ 'uploader': uploader,
+ 'formats': formats,
+ 'duration': int_or_none(vinfo.get('videoDuration'), 100),
+ 'timestamp': timestamp,
+ })
+
+ return {
+ '_type': 'playlist',
+ 'entries': entries,
+ 'id': page_id,
+ 'title': title,
+ }
diff --git a/youtube_dl/extractor/wat.py b/youtube_dl/extractor/wat.py
index 4fab6c6..a584e08 100644
--- a/youtube_dl/extractor/wat.py
+++ b/youtube_dl/extractor/wat.py
@@ -1,37 +1,37 @@
# coding: utf-8
+from __future__ import unicode_literals
-import json
import re
from .common import InfoExtractor
-
from ..utils import (
unified_strdate,
)
class WatIE(InfoExtractor):
- _VALID_URL=r'http://www\.wat\.tv/.*-(?P<shortID>.*?)_.*?\.html'
+ _VALID_URL = r'http://www\.wat\.tv/.*-(?P<shortID>.*?)_.*?\.html'
IE_NAME = 'wat.tv'
_TEST = {
- u'url': u'http://www.wat.tv/video/world-war-philadelphia-vost-6bv55_2fjr7_.html',
- u'file': u'10631273.mp4',
- u'md5': u'd8b2231e1e333acd12aad94b80937e19',
- u'info_dict': {
- u'title': u'World War Z - Philadelphia VOST',
- u'description': u'La menace est partout. Que se passe-t-il à Philadelphia ?\r\nWORLD WAR Z, avec Brad Pitt, au cinéma le 3 juillet.\r\nhttp://www.worldwarz.fr',
+ 'url': 'http://www.wat.tv/video/world-war-philadelphia-vost-6bv55_2fjr7_.html',
+ 'info_dict': {
+ 'id': '10631273',
+ 'ext': 'mp4',
+ 'title': 'World War Z - Philadelphia VOST',
+ 'description': 'La menace est partout. Que se passe-t-il à Philadelphia ?\r\nWORLD WAR Z, avec Brad Pitt, au cinéma le 3 juillet.\r\nhttp://www.worldwarz.fr',
+ },
+ 'params': {
+ # Sometimes wat serves the whole file with the --test option
+ 'skip_download': True,
},
- u'skip': u'Sometimes wat serves the whole file with the --test option',
}
-
+
def download_video_info(self, real_id):
# 'contentv4' is used in the website, but it also returns the related
# videos, we don't need them
- info = self._download_webpage('http://www.wat.tv/interface/contentv3/' + real_id, real_id, 'Downloading video info')
- info = json.loads(info)
+ info = self._download_json('http://www.wat.tv/interface/contentv3/' + real_id, real_id)
return info['media']
-
def _real_extract(self, url):
def real_id_for_chapter(chapter):
return chapter['tc_start'].split('-')[0]
@@ -56,17 +56,17 @@ class WatIE(InfoExtractor):
entries = [self.url_result(chapter_url) for chapter_url in chapter_urls]
return self.playlist_result(entries, real_id, video_info['title'])
+ upload_date = None
+ if 'date_diffusion' in first_chapter:
+ upload_date = unified_strdate(first_chapter['date_diffusion'])
# Otherwise we can continue and extract just one part, we have to use
# the short id for getting the video url
- info = {'id': real_id,
- 'url': 'http://wat.tv/get/android5/%s.mp4' % real_id,
- 'ext': 'mp4',
- 'title': first_chapter['title'],
- 'thumbnail': first_chapter['preview'],
- 'description': first_chapter['description'],
- 'view_count': video_info['views'],
- }
- if 'date_diffusion' in first_chapter:
- info['upload_date'] = unified_strdate(first_chapter['date_diffusion'])
-
- return info
+ return {
+ 'id': real_id,
+ 'url': 'http://wat.tv/get/android5/%s.mp4' % real_id,
+ 'title': first_chapter['title'],
+ 'thumbnail': first_chapter['preview'],
+ 'description': first_chapter['description'],
+ 'view_count': video_info['views'],
+ 'upload_date': upload_date,
+ }
diff --git a/youtube_dl/extractor/wdr.py b/youtube_dl/extractor/wdr.py
new file mode 100644
index 0000000..feeb44b
--- /dev/null
+++ b/youtube_dl/extractor/wdr.py
@@ -0,0 +1,224 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ compat_parse_qs,
+ compat_urlparse,
+ determine_ext,
+ unified_strdate,
+)
+
+
+class WDRIE(InfoExtractor):
+ _PLAYER_REGEX = '-(?:video|audio)player(?:_size-[LMS])?'
+ _VALID_URL = r'(?P<url>https?://www\d?\.(?:wdr\d?|funkhauseuropa)\.de/)(?P<id>.+?)(?P<player>%s)?\.html' % _PLAYER_REGEX
+
+ _TESTS = [
+ {
+ 'url': 'http://www1.wdr.de/mediathek/video/sendungen/servicezeit/videoservicezeit560-videoplayer_size-L.html',
+ 'info_dict': {
+ 'id': 'mdb-362427',
+ 'ext': 'flv',
+ 'title': 'Servicezeit',
+ 'description': 'md5:c8f43e5e815eeb54d0b96df2fba906cb',
+ 'upload_date': '20140310',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ },
+ {
+ 'url': 'http://www1.wdr.de/themen/av/videomargaspiegelisttot101-videoplayer.html',
+ 'info_dict': {
+ 'id': 'mdb-363194',
+ 'ext': 'flv',
+ 'title': 'Marga Spiegel ist tot',
+ 'description': 'md5:2309992a6716c347891c045be50992e4',
+ 'upload_date': '20140311',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ },
+ {
+ 'url': 'http://www1.wdr.de/themen/kultur/audioerlebtegeschichtenmargaspiegel100-audioplayer.html',
+ 'md5': '83e9e8fefad36f357278759870805898',
+ 'info_dict': {
+ 'id': 'mdb-194332',
+ 'ext': 'mp3',
+ 'title': 'Erlebte Geschichten: Marga Spiegel (29.11.2009)',
+ 'description': 'md5:2309992a6716c347891c045be50992e4',
+ 'upload_date': '20091129',
+ },
+ },
+ {
+ 'url': 'http://www.funkhauseuropa.de/av/audiogrenzenlosleckerbaklava101-audioplayer.html',
+ 'md5': 'cfff440d4ee64114083ac44676df5d15',
+ 'info_dict': {
+ 'id': 'mdb-363068',
+ 'ext': 'mp3',
+ 'title': 'Grenzenlos lecker - Baklava',
+ 'description': 'md5:7b29e97e10dfb6e265238b32fa35b23a',
+ 'upload_date': '20140311',
+ },
+ },
+ ]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ page_url = mobj.group('url')
+ page_id = mobj.group('id')
+
+ webpage = self._download_webpage(url, page_id)
+
+ if mobj.group('player') is None:
+ entries = [
+ self.url_result(page_url + href, 'WDR')
+ for href in re.findall(r'<a href="/?(.+?%s\.html)" rel="nofollow"' % self._PLAYER_REGEX, webpage)
+ ]
+ return self.playlist_result(entries, page_id)
+
+ flashvars = compat_urlparse.parse_qs(
+ self._html_search_regex(r'<param name="flashvars" value="([^"]+)"', webpage, 'flashvars'))
+
+ page_id = flashvars['trackerClipId'][0]
+ video_url = flashvars['dslSrc'][0]
+ title = flashvars['trackerClipTitle'][0]
+ thumbnail = flashvars['startPicture'][0] if 'startPicture' in flashvars else None
+
+ if 'trackerClipAirTime' in flashvars:
+ upload_date = flashvars['trackerClipAirTime'][0]
+ else:
+ upload_date = self._html_search_meta('DC.Date', webpage, 'upload date')
+
+ if upload_date:
+ upload_date = unified_strdate(upload_date)
+
+ if video_url.endswith('.f4m'):
+ video_url += '?hdcore=3.2.0&plugin=aasp-3.2.0.77.18'
+ ext = 'flv'
+ else:
+ ext = determine_ext(video_url)
+
+ description = self._html_search_meta('Description', webpage, 'description')
+
+ return {
+ 'id': page_id,
+ 'url': video_url,
+ 'ext': ext,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'upload_date': upload_date,
+ }
+
+
+class WDRMobileIE(InfoExtractor):
+ _VALID_URL = r'''(?x)
+ https?://mobile-ondemand\.wdr\.de/
+ .*?/fsk(?P<age_limit>[0-9]+)
+ /[0-9]+/[0-9]+/
+ (?P<id>[0-9]+)_(?P<title>[0-9]+)'''
+ IE_NAME = 'wdr:mobile'
+ _TEST = {
+ 'url': 'http://mobile-ondemand.wdr.de/CMS2010/mdb/ondemand/weltweit/fsk0/42/421735/421735_4283021.mp4',
+ 'info_dict': {
+ 'title': '4283021',
+ 'id': '421735',
+ 'age_limit': 0,
+ },
+ '_skip': 'Will be depublicized shortly'
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ return {
+ 'id': mobj.group('id'),
+ 'title': mobj.group('title'),
+ 'age_limit': int(mobj.group('age_limit')),
+ 'url': url,
+ 'user_agent': 'mobile',
+ }
+
+
+class WDRMausIE(InfoExtractor):
+ _VALID_URL = 'http://(?:www\.)?wdrmaus\.de/(?:[^/]+/){,2}(?P<id>[^/?#]+)(?:/index\.php5|(?<!index)\.php5|/(?:$|[?#]))'
+ IE_DESC = 'Sendung mit der Maus'
+ _TESTS = [{
+ 'url': 'http://www.wdrmaus.de/aktuelle-sendung/index.php5',
+ 'info_dict': {
+ 'id': 'aktuelle-sendung',
+ 'ext': 'mp4',
+ 'thumbnail': 're:^http://.+\.jpg',
+ 'upload_date': 're:^[0-9]{8}$',
+ 'title': 're:^[0-9.]{10} - Aktuelle Sendung$',
+ }
+ }, {
+ 'url': 'http://www.wdrmaus.de/sachgeschichten/sachgeschichten/40_jahre_maus.php5',
+ 'md5': '3b1227ca3ed28d73ec5737c65743b2a3',
+ 'info_dict': {
+ 'id': '40_jahre_maus',
+ 'ext': 'mp4',
+ 'thumbnail': 're:^http://.+\.jpg',
+ 'upload_date': '20131007',
+ 'title': '12.03.2011 - 40 Jahre Maus',
+ }
+ }]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+
+ webpage = self._download_webpage(url, video_id)
+ param_code = self._html_search_regex(
+ r'<a href="\?startVideo=1&amp;([^"]+)"', webpage, 'parameters')
+
+ title_date = self._search_regex(
+ r'<div class="sendedatum"><p>Sendedatum:\s*([0-9\.]+)</p>',
+ webpage, 'air date')
+ title_str = self._html_search_regex(
+ r'<h1>(.*?)</h1>', webpage, 'title')
+ title = '%s - %s' % (title_date, title_str)
+ upload_date = unified_strdate(
+ self._html_search_meta('dc.date', webpage))
+
+ fields = compat_parse_qs(param_code)
+ video_url = fields['firstVideo'][0]
+ thumbnail = compat_urlparse.urljoin(url, fields['startPicture'][0])
+
+ formats = [{
+ 'format_id': 'rtmp',
+ 'url': video_url,
+ }]
+
+ jscode = self._download_webpage(
+ 'http://www.wdrmaus.de/codebase/js/extended-medien.min.js',
+ video_id, fatal=False,
+ note='Downloading URL translation table',
+ errnote='Could not download URL translation table')
+ if jscode:
+ for m in re.finditer(
+ r"stream:\s*'dslSrc=(?P<stream>[^']+)',\s*download:\s*'(?P<dl>[^']+)'\s*\}",
+ jscode):
+ if video_url.startswith(m.group('stream')):
+ http_url = video_url.replace(
+ m.group('stream'), m.group('dl'))
+ formats.append({
+ 'format_id': 'http',
+ 'url': http_url,
+ })
+ break
+
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'formats': formats,
+ 'thumbnail': thumbnail,
+ 'upload_date': upload_date,
+ }
+
+# TODO test _1 \ No newline at end of file
diff --git a/youtube_dl/extractor/weibo.py b/youtube_dl/extractor/weibo.py
index fa784ab..b24297a 100644
--- a/youtube_dl/extractor/weibo.py
+++ b/youtube_dl/extractor/weibo.py
@@ -1,10 +1,11 @@
# coding: utf-8
+from __future__ import unicode_literals
import re
-import json
from .common import InfoExtractor
+
class WeiboIE(InfoExtractor):
"""
The videos in Weibo come from different sites, this IE just finds the link
@@ -13,16 +14,16 @@ class WeiboIE(InfoExtractor):
_VALID_URL = r'https?://video\.weibo\.com/v/weishipin/t_(?P<id>.+?)\.htm'
_TEST = {
- u'add_ie': ['Sina'],
- u'url': u'http://video.weibo.com/v/weishipin/t_zjUw2kZ.htm',
- u'file': u'98322879.flv',
- u'info_dict': {
- u'title': u'魔声耳机最新广告“All Eyes On Us”',
+ 'url': 'http://video.weibo.com/v/weishipin/t_zjUw2kZ.htm',
+ 'info_dict': {
+ 'id': '98322879',
+ 'ext': 'flv',
+ 'title': '魔声耳机最新广告“All Eyes On Us”',
},
- u'note': u'Sina video',
- u'params': {
- u'skip_download': True,
+ 'params': {
+ 'skip_download': True,
},
+ 'add_ie': ['Sina'],
}
# Additional example videos from different sites
@@ -33,17 +34,16 @@ class WeiboIE(InfoExtractor):
mobj = re.match(self._VALID_URL, url, flags=re.VERBOSE)
video_id = mobj.group('id')
info_url = 'http://video.weibo.com/?s=v&a=play_list&format=json&mix_video_id=t_%s' % video_id
- info_page = self._download_webpage(info_url, video_id)
- info = json.loads(info_page)
+ info = self._download_json(info_url, video_id)
videos_urls = map(lambda v: v['play_page_url'], info['result']['data'])
- #Prefer sina video since they have thumbnails
- videos_urls = sorted(videos_urls, key=lambda u: u'video.sina.com' in u)
+ # Prefer sina video since they have thumbnails
+ videos_urls = sorted(videos_urls, key=lambda u: 'video.sina.com' in u)
player_url = videos_urls[-1]
- m_sina = re.match(r'https?://video.sina.com.cn/v/b/(\d+)-\d+.html', player_url)
+ m_sina = re.match(r'https?://video\.sina\.com\.cn/v/b/(\d+)-\d+\.html',
+ player_url)
if m_sina is not None:
self.to_screen('Sina video detected')
sina_id = m_sina.group(1)
player_url = 'http://you.video.sina.com.cn/swf/quotePlayer.swf?vid=%s' % sina_id
return self.url_result(player_url)
-
diff --git a/youtube_dl/extractor/wimp.py b/youtube_dl/extractor/wimp.py
index 9a6bb0c..c27dda9 100644
--- a/youtube_dl/extractor/wimp.py
+++ b/youtube_dl/extractor/wimp.py
@@ -3,19 +3,34 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from .youtube import YoutubeIE
class WimpIE(InfoExtractor):
- _VALID_URL = r'(?:http://)?(?:www\.)?wimp\.com/([^/]+)/'
- _TEST = {
- 'url': 'http://www.wimp.com/deerfence/',
- 'file': 'deerfence.flv',
- 'md5': '8b215e2e0168c6081a1cf84b2846a2b5',
+ _VALID_URL = r'http://(?:www\.)?wimp\.com/([^/]+)/'
+ _TESTS = [{
+ 'url': 'http://www.wimp.com/maruexhausted/',
+ 'md5': 'f1acced123ecb28d9bb79f2479f2b6a1',
'info_dict': {
- "title": "Watch Till End: Herd of deer jump over a fence.",
- "description": "These deer look as fluid as running water when they jump over this fence as a herd. This video is one that needs to be watched until the very end for the true majesty to be witnessed, but once it comes, it's sure to take your breath away.",
+ 'id': 'maruexhausted',
+ 'ext': 'flv',
+ 'title': 'Maru is exhausted.',
+ 'description': 'md5:57e099e857c0a4ea312542b684a869b8',
}
- }
+ }, {
+ # youtube video
+ 'url': 'http://www.wimp.com/clowncar/',
+ 'info_dict': {
+ 'id': 'cG4CEr2aiSg',
+ 'ext': 'mp4',
+ 'title': 'Basset hound clown car...incredible!',
+ 'description': 'md5:8d228485e0719898c017203f900b3a35',
+ 'uploader': 'Gretchen Hoey',
+ 'uploader_id': 'gretchenandjeff1',
+ 'upload_date': '20140303',
+ },
+ 'add_ie': ['Youtube'],
+ }]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
@@ -23,6 +38,13 @@ class WimpIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
video_url = self._search_regex(
r's1\.addVariable\("file",\s*"([^"]+)"\);', webpage, 'video URL')
+ if YoutubeIE.suitable(video_url):
+ self.to_screen('Found YouTube video')
+ return {
+ '_type': 'url',
+ 'url': video_url,
+ 'ie_key': YoutubeIE.ie_key(),
+ }
return {
'id': video_id,
diff --git a/youtube_dl/extractor/worldstarhiphop.py b/youtube_dl/extractor/worldstarhiphop.py
index 3237596..4e89acd 100644
--- a/youtube_dl/extractor/worldstarhiphop.py
+++ b/youtube_dl/extractor/worldstarhiphop.py
@@ -1,3 +1,5 @@
+from __future__ import unicode_literals
+
import re
from .common import InfoExtractor
@@ -7,14 +9,14 @@ class WorldStarHipHopIE(InfoExtractor):
_VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
_TEST = {
"url": "http://www.worldstarhiphop.com/videos/video.php?v=wshh6a7q1ny0G34ZwuIO",
- "file": "wshh6a7q1ny0G34ZwuIO.mp4",
"md5": "9d04de741161603bf7071bbf4e883186",
"info_dict": {
+ "id": "wshh6a7q1ny0G34ZwuIO",
+ "ext": "mp4",
"title": "Video: KO Of The Week: MMA Fighter Gets Knocked Out By Swift Head Kick!"
}
}
-
def _real_extract(self, url):
m = re.match(self._VALID_URL, url)
video_id = m.group('id')
@@ -22,42 +24,33 @@ class WorldStarHipHopIE(InfoExtractor):
webpage_src = self._download_webpage(url, video_id)
m_vevo_id = re.search(r'videoId=(.*?)&amp?',
- webpage_src)
-
+ webpage_src)
if m_vevo_id is not None:
- self.to_screen(u'Vevo video detected:')
return self.url_result('vevo:%s' % m_vevo_id.group(1), ie='Vevo')
- video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
- webpage_src, u'video URL')
+ video_url = self._search_regex(
+ r'so\.addVariable\("file","(.*?)"\)', webpage_src, 'video URL')
if 'youtube' in video_url:
- self.to_screen(u'Youtube video detected:')
return self.url_result(video_url, ie='Youtube')
- if 'mp4' in video_url:
- ext = 'mp4'
- else:
- ext = 'flv'
-
- video_title = self._html_search_regex(r"<title>(.*)</title>",
- webpage_src, u'title')
+ video_title = self._html_search_regex(
+ r"<title>(.*)</title>", webpage_src, 'title')
# Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
- thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
- webpage_src, u'thumbnail', fatal=False)
-
+ thumbnail = self._html_search_regex(
+ r'rel="image_src" href="(.*)" />', webpage_src, 'thumbnail',
+ fatal=False)
if not thumbnail:
_title = r"""candytitles.*>(.*)</span>"""
mobj = re.search(_title, webpage_src)
if mobj is not None:
video_title = mobj.group(1)
- results = [{
- 'id': video_id,
- 'url' : video_url,
- 'title' : video_title,
- 'thumbnail' : thumbnail,
- 'ext' : ext,
- }]
- return results
+ return {
+ 'id': video_id,
+ 'url': video_url,
+ 'title': video_title,
+ 'thumbnail': thumbnail,
+ }
+
diff --git a/youtube_dl/extractor/xbef.py b/youtube_dl/extractor/xbef.py
new file mode 100644
index 0000000..71bd7c4
--- /dev/null
+++ b/youtube_dl/extractor/xbef.py
@@ -0,0 +1,50 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ compat_urllib_parse,
+)
+
+
+class XBefIE(InfoExtractor):
+ _VALID_URL = r'http://(?:www\.)?xbef\.com/video/(?P<id>[0-9]+)'
+ _TEST = {
+ 'url': 'http://xbef.com/video/5119-glamourous-lesbians-smoking-drinking-and-fucking',
+ 'md5': 'a478b565baff61634a98f5e5338be995',
+ 'info_dict': {
+ 'id': '5119',
+ 'ext': 'mp4',
+ 'title': 'md5:7358a9faef8b7b57acda7c04816f170e',
+ 'age_limit': 18,
+ 'thumbnail': 're:^http://.*\.jpg',
+ }
+ }
+
+ def _real_extract(self, url):
+ m = re.match(self._VALID_URL, url)
+ video_id = m.group('id')
+
+ webpage = self._download_webpage(url, video_id)
+ title = self._html_search_regex(
+ r'<h1[^>]*>(.*?)</h1>', webpage, 'title')
+
+ config_url_enc = self._download_webpage(
+ 'http://xbef.com/Main/GetVideoURLEncoded/%s' % video_id, video_id,
+ note='Retrieving config URL')
+ config_url = compat_urllib_parse.unquote(config_url_enc)
+ config = self._download_xml(
+ config_url, video_id, note='Retrieving config')
+
+ video_url = config.find('./file').text
+ thumbnail = config.find('./image').text
+
+ return {
+ 'id': video_id,
+ 'url': video_url,
+ 'title': title,
+ 'thumbnail': thumbnail,
+ 'age_limit': 18,
+ }
+
diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py
index f6c515f..5374495 100644
--- a/youtube_dl/extractor/xhamster.py
+++ b/youtube_dl/extractor/xhamster.py
@@ -4,51 +4,51 @@ import re
from .common import InfoExtractor
from ..utils import (
- compat_urllib_parse,
ExtractorError,
+ unified_strdate,
+ str_to_int,
+ int_or_none,
+ parse_duration,
)
class XHamsterIE(InfoExtractor):
"""Information Extractor for xHamster"""
- _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/(?P<seo>.+?)\.html(?:\?.*)?'
- _TESTS = [{
- 'url': 'http://xhamster.com/movies/1509445/femaleagent_shy_beauty_takes_the_bait.html',
- 'file': '1509445.mp4',
- 'md5': '8281348b8d3c53d39fffb377d24eac4e',
- 'info_dict': {
- "upload_date": "20121014",
- "uploader_id": "Ruseful2011",
- "title": "FemaleAgent Shy beauty takes the bait",
- "age_limit": 18,
- }
- },
- {
- 'url': 'http://xhamster.com/movies/2221348/britney_spears_sexy_booty.html?hd',
- 'file': '2221348.flv',
- 'md5': 'e767b9475de189320f691f49c679c4c7',
- 'info_dict': {
- "upload_date": "20130914",
- "uploader_id": "jojo747400",
- "title": "Britney Spears Sexy Booty",
- "age_limit": 18,
+ _VALID_URL = r'http://(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/(?P<seo>.+?)\.html(?:\?.*)?'
+ _TESTS = [
+ {
+ 'url': 'http://xhamster.com/movies/1509445/femaleagent_shy_beauty_takes_the_bait.html',
+ 'md5': '8281348b8d3c53d39fffb377d24eac4e',
+ 'info_dict': {
+ 'id': '1509445',
+ 'ext': 'mp4',
+ 'title': 'FemaleAgent Shy beauty takes the bait',
+ 'upload_date': '20121014',
+ 'uploader_id': 'Ruseful2011',
+ 'duration': 893,
+ 'age_limit': 18,
+ }
+ },
+ {
+ 'url': 'http://xhamster.com/movies/2221348/britney_spears_sexy_booty.html?hd',
+ 'md5': '4cbd8d56708ecb4fb4124c23e4acb81a',
+ 'info_dict': {
+ 'id': '2221348',
+ 'ext': 'mp4',
+ 'title': 'Britney Spears Sexy Booty',
+ 'upload_date': '20130914',
+ 'uploader_id': 'jojo747400',
+ 'duration': 200,
+ 'age_limit': 18,
+ }
}
- }]
+ ]
def _real_extract(self,url):
def extract_video_url(webpage):
- mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
- if mobj is None:
- raise ExtractorError('Unable to extract media URL')
- if len(mobj.group('server')) == 0:
- return compat_urllib_parse.unquote(mobj.group('file'))
- else:
- return mobj.group('server')+'/key='+mobj.group('file')
-
- def extract_mp4_video_url(webpage):
- mp4 = re.search(r'<a href=\"(.+?)\" class=\"mp4Play\"',webpage)
+ mp4 = re.search(r'<video\s+.*?file="([^"]+)".*?>', webpage)
if mp4 is None:
- return None
+ raise ExtractorError('Unable to extract media URL')
else:
return mp4.group(1)
@@ -62,50 +62,49 @@ class XHamsterIE(InfoExtractor):
mrss_url = 'http://xhamster.com/movies/%s/%s.html' % (video_id, seo)
webpage = self._download_webpage(mrss_url, video_id)
- video_title = self._html_search_regex(
- r'<title>(?P<title>.+?) - xHamster\.com</title>', webpage, 'title')
+ title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>', webpage, 'title')
# Only a few videos have an description
mobj = re.search(r'<span>Description: </span>([^<]+)', webpage)
- video_description = mobj.group(1) if mobj else None
+ description = mobj.group(1) if mobj else None
- mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
- if mobj:
- video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
- else:
- video_upload_date = None
- self._downloader.report_warning('Unable to extract upload date')
+ upload_date = self._html_search_regex(r'hint=\'(\d{4}-\d{2}-\d{2}) \d{2}:\d{2}:\d{2} [A-Z]{3,4}\'',
+ webpage, 'upload date', fatal=False)
+ if upload_date:
+ upload_date = unified_strdate(upload_date)
- video_uploader_id = self._html_search_regex(
- r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
+ uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
webpage, 'uploader id', default='anonymous')
- video_thumbnail = self._search_regex(
- r'\'image\':\'(?P<thumbnail>[^\']+)\'',
- webpage, 'thumbnail', fatal=False)
+ thumbnail = self._html_search_regex(r'<video\s+.*?poster="([^"]+)".*?>', webpage, 'thumbnail', fatal=False)
+
+ duration = parse_duration(self._html_search_regex(r'<span>Runtime:</span> (\d+:\d+)</div>',
+ webpage, 'duration', fatal=False))
+
+ view_count = self._html_search_regex(r'<span>Views:</span> ([^<]+)</div>', webpage, 'view count', fatal=False)
+ if view_count:
+ view_count = str_to_int(view_count)
+
+ mobj = re.search(r"hint='(?P<likecount>\d+) Likes / (?P<dislikecount>\d+) Dislikes'", webpage)
+ (like_count, dislike_count) = (mobj.group('likecount'), mobj.group('dislikecount')) if mobj else (None, None)
+
+ mobj = re.search(r'</label>Comments \((?P<commentcount>\d+)\)</div>', webpage)
+ comment_count = mobj.group('commentcount') if mobj else 0
age_limit = self._rta_search(webpage)
hd = is_hd(webpage)
+
video_url = extract_video_url(webpage)
formats = [{
'url': video_url,
'format_id': 'hd' if hd else 'sd',
- 'preference': 0,
+ 'preference': 1,
}]
- video_mp4_url = extract_mp4_video_url(webpage)
- if video_mp4_url is not None:
- formats.append({
- 'url': video_mp4_url,
- 'ext': 'mp4',
- 'format_id': 'mp4-hd' if hd else 'mp4-sd',
- 'preference': 1,
- })
-
if not hd:
- webpage = self._download_webpage(
- mrss_url + '?hd', video_id, note='Downloading HD webpage')
+ mrss_url = self._search_regex(r'<link rel="canonical" href="([^"]+)', webpage, 'mrss_url')
+ webpage = self._download_webpage(mrss_url + '?hd', video_id, note='Downloading HD webpage')
if is_hd(webpage):
video_url = extract_video_url(webpage)
formats.append({
@@ -118,11 +117,16 @@ class XHamsterIE(InfoExtractor):
return {
'id': video_id,
- 'title': video_title,
- 'formats': formats,
- 'description': video_description,
- 'upload_date': video_upload_date,
- 'uploader_id': video_uploader_id,
- 'thumbnail': video_thumbnail,
+ 'title': title,
+ 'description': description,
+ 'upload_date': upload_date,
+ 'uploader_id': uploader_id,
+ 'thumbnail': thumbnail,
+ 'duration': duration,
+ 'view_count': view_count,
+ 'like_count': int_or_none(like_count),
+ 'dislike_count': int_or_none(dislike_count),
+ 'comment_count': int_or_none(comment_count),
'age_limit': age_limit,
+ 'formats': formats,
}
diff --git a/youtube_dl/extractor/xnxx.py b/youtube_dl/extractor/xnxx.py
index 1177a4b..7a73b24 100644
--- a/youtube_dl/extractor/xnxx.py
+++ b/youtube_dl/extractor/xnxx.py
@@ -1,55 +1,49 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
import re
from .common import InfoExtractor
from ..utils import (
compat_urllib_parse,
-
- ExtractorError,
)
class XNXXIE(InfoExtractor):
- _VALID_URL = r'^(?:https?://)?(?:video|www)\.xnxx\.com/video([0-9]+)/(.*)'
- VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
- VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
- VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'
+ _VALID_URL = r'^https?://(?:video|www)\.xnxx\.com/video(?P<id>[0-9]+)/(.*)'
_TEST = {
- u'url': u'http://video.xnxx.com/video1135332/lida_naked_funny_actress_5_',
- u'file': u'1135332.flv',
- u'md5': u'0831677e2b4761795f68d417e0b7b445',
- u'info_dict': {
- u"title": u"lida \u00bb Naked Funny Actress (5)",
- u"age_limit": 18,
+ 'url': 'http://video.xnxx.com/video1135332/lida_naked_funny_actress_5_',
+ 'md5': '0831677e2b4761795f68d417e0b7b445',
+ 'info_dict': {
+ 'id': '1135332',
+ 'ext': 'flv',
+ 'title': 'lida » Naked Funny Actress (5)',
+ 'age_limit': 18,
}
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- if mobj is None:
- raise ExtractorError(u'Invalid URL: %s' % url)
- video_id = mobj.group(1)
+ video_id = mobj.group('id')
# Get webpage content
webpage = self._download_webpage(url, video_id)
- video_url = self._search_regex(self.VIDEO_URL_RE,
- webpage, u'video URL')
+ video_url = self._search_regex(r'flv_url=(.*?)&amp;',
+ webpage, 'video URL')
video_url = compat_urllib_parse.unquote(video_url)
- video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
- webpage, u'title')
+ video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XNXX.COM',
+ webpage, 'title')
- video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
- webpage, u'thumbnail', fatal=False)
+ video_thumbnail = self._search_regex(r'url_bigthumb=(.*?)&amp;',
+ webpage, 'thumbnail', fatal=False)
- return [{
+ return {
'id': video_id,
'url': video_url,
- 'uploader': None,
- 'upload_date': None,
'title': video_title,
'ext': 'flv',
'thumbnail': video_thumbnail,
- 'description': None,
'age_limit': 18,
- }]
+ }
diff --git a/youtube_dl/extractor/xtube.py b/youtube_dl/extractor/xtube.py
index 9826199..b293e26 100644
--- a/youtube_dl/extractor/xtube.py
+++ b/youtube_dl/extractor/xtube.py
@@ -1,25 +1,29 @@
from __future__ import unicode_literals
-import os
import re
+import json
from .common import InfoExtractor
from ..utils import (
- compat_urllib_parse_urlparse,
compat_urllib_request,
+ parse_duration,
+ str_to_int,
)
+
class XTubeIE(InfoExtractor):
- _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>xtube\.com/watch\.php\?v=(?P<videoid>[^/?&]+))'
+ _VALID_URL = r'https?://(?:www\.)?(?P<url>xtube\.com/watch\.php\?v=(?P<videoid>[^/?&]+))'
_TEST = {
'url': 'http://www.xtube.com/watch.php?v=kVTUy_G222_',
- 'file': 'kVTUy_G222_.mp4',
'md5': '092fbdd3cbe292c920ef6fc6a8a9cdab',
'info_dict': {
- "title": "strange erotica",
- "description": "surreal gay themed erotica...almost an ET kind of thing",
- "uploader": "greenshowers",
- "age_limit": 18,
+ 'id': 'kVTUy_G222_',
+ 'ext': 'mp4',
+ 'title': 'strange erotica',
+ 'description': 'surreal gay themed erotica...almost an ET kind of thing',
+ 'uploader': 'greenshowers',
+ 'duration': 450,
+ 'age_limit': 18,
}
}
@@ -32,25 +36,79 @@ class XTubeIE(InfoExtractor):
req.add_header('Cookie', 'age_verified=1')
webpage = self._download_webpage(req, video_id)
- video_title = self._html_search_regex(r'<div class="p_5px[^>]*>([^<]+)', webpage, 'title')
- video_uploader = self._html_search_regex(r'so_s\.addVariable\("owner_u", "([^"]+)', webpage, 'uploader', fatal=False)
- video_description = self._html_search_regex(r'<p class="video_description">([^<]+)', webpage, 'description', fatal=False)
- video_url= self._html_search_regex(r'var videoMp4 = "([^"]+)', webpage, 'video_url').replace('\\/', '/')
- path = compat_urllib_parse_urlparse(video_url).path
- extension = os.path.splitext(path)[1][1:]
- format = path.split('/')[5].split('_')[:2]
- format[0] += 'p'
- format[1] += 'k'
- format = "-".join(format)
+ video_title = self._html_search_regex(r'<p class="title">([^<]+)', webpage, 'title')
+ video_uploader = self._html_search_regex(
+ r'so_s\.addVariable\("owner_u", "([^"]+)', webpage, 'uploader', fatal=False)
+ video_description = self._html_search_regex(
+ r'<p class="fieldsDesc">([^<]+)', webpage, 'description', fatal=False)
+ duration = parse_duration(self._html_search_regex(
+ r'<span class="bold">Runtime:</span> ([^<]+)</p>', webpage, 'duration', fatal=False))
+ view_count = self._html_search_regex(
+ r'<span class="bold">Views:</span> ([\d,\.]+)</p>', webpage, 'view count', fatal=False)
+ if view_count:
+ view_count = str_to_int(view_count)
+ comment_count = self._html_search_regex(
+ r'<div id="commentBar">([\d,\.]+) Comments</div>', webpage, 'comment count', fatal=False)
+ if comment_count:
+ comment_count = str_to_int(comment_count)
+
+ player_quality_option = json.loads(self._html_search_regex(
+ r'playerQualityOption = ({.+?});', webpage, 'player quality option'))
+
+ QUALITIES = ['3gp', 'mp4_normal', 'mp4_high', 'flv', 'mp4_ultra', 'mp4_720', 'mp4_1080']
+ formats = [
+ {
+ 'url': furl,
+ 'format_id': format_id,
+ 'preference': QUALITIES.index(format_id) if format_id in QUALITIES else -1,
+ } for format_id, furl in player_quality_option.items()
+ ]
+ self._sort_formats(formats)
return {
'id': video_id,
'title': video_title,
'uploader': video_uploader,
'description': video_description,
- 'url': video_url,
- 'ext': extension,
- 'format': format,
- 'format_id': format,
+ 'duration': duration,
+ 'view_count': view_count,
+ 'comment_count': comment_count,
+ 'formats': formats,
'age_limit': 18,
}
+
+class XTubeUserIE(InfoExtractor):
+ IE_DESC = 'XTube user profile'
+ _VALID_URL = r'https?://(?:www\.)?xtube\.com/community/profile\.php\?(.*?)user=(?P<username>[^&#]+)(?:$|[&#])'
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ username = mobj.group('username')
+
+ profile_page = self._download_webpage(
+ url, username, note='Retrieving profile page')
+
+ video_count = int(self._search_regex(
+ r'<strong>%s\'s Videos \(([0-9]+)\)</strong>'%username, profile_page,
+ 'video count'))
+
+ PAGE_SIZE = 25
+ urls = []
+ page_count = (video_count + PAGE_SIZE + 1) // PAGE_SIZE
+ for n in range(1, page_count + 1):
+ lpage_url = 'http://www.xtube.com/user_videos.php?page=%d&u=%s' % (n, username)
+ lpage = self._download_webpage(
+ lpage_url, username,
+ note='Downloading page %d/%d' % (n, page_count))
+ urls.extend(
+ re.findall(r'addthis:url="([^"]+)"', lpage))
+
+ return {
+ '_type': 'playlist',
+ 'id': username,
+ 'entries': [{
+ '_type': 'url',
+ 'url': eurl,
+ 'ie_key': 'XTube',
+ } for eurl in urls]
+ }
diff --git a/youtube_dl/extractor/xvideos.py b/youtube_dl/extractor/xvideos.py
index 85e99e1..7e00448 100644
--- a/youtube_dl/extractor/xvideos.py
+++ b/youtube_dl/extractor/xvideos.py
@@ -5,18 +5,21 @@ import re
from .common import InfoExtractor
from ..utils import (
compat_urllib_parse,
+ ExtractorError,
+ clean_html,
)
class XVideosIE(InfoExtractor):
_VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
_TEST = {
- 'url': 'http://www.xvideos.com/video939581/funny_porns_by_s_-1',
- 'file': '939581.flv',
- 'md5': '1d0c835822f0a71a7bf011855db929d0',
+ 'url': 'http://www.xvideos.com/video4588838/biker_takes_his_girl',
+ 'md5': '4b46ae6ea5e6e9086e714d883313c0c9',
'info_dict': {
- "title": "Funny Porns By >>>>S<<<<<< -1",
- "age_limit": 18,
+ 'id': '4588838',
+ 'ext': 'flv',
+ 'title': 'Biker Takes his Girl',
+ 'age_limit': 18,
}
}
@@ -28,6 +31,10 @@ class XVideosIE(InfoExtractor):
self.report_extraction(video_id)
+ mobj = re.search(r'<h1 class="inlineError">(.+?)</h1>', webpage)
+ if mobj:
+ raise ExtractorError('%s said: %s' % (self.IE_NAME, clean_html(mobj.group(1))), expected=True)
+
# Extract video URL
video_url = compat_urllib_parse.unquote(
self._search_regex(r'flv_url=(.+?)&', webpage, 'video URL'))
diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py
index d92d14f..d84be25 100644
--- a/youtube_dl/extractor/yahoo.py
+++ b/youtube_dl/extractor/yahoo.py
@@ -14,27 +14,39 @@ from ..utils import (
class YahooIE(InfoExtractor):
- IE_DESC = 'Yahoo screen'
- _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'
+ IE_DESC = 'Yahoo screen and movies'
+ _VALID_URL = r'https?://(?:screen|movies)\.yahoo\.com/.*?-(?P<id>[0-9]+)(?:-[a-z]+)?\.html'
_TESTS = [
{
'url': 'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html',
- 'file': '214727115.mp4',
'md5': '4962b075c08be8690a922ee026d05e69',
'info_dict': {
+ 'id': '2d25e626-2378-391f-ada0-ddaf1417e588',
+ 'ext': 'mp4',
'title': 'Julian Smith & Travis Legg Watch Julian Smith',
'description': 'Julian and Travis watch Julian Smith',
},
},
{
'url': 'http://screen.yahoo.com/wired/codefellas-s1-ep12-cougar-lies-103000935.html',
- 'file': '103000935.mp4',
'md5': 'd6e6fc6e1313c608f316ddad7b82b306',
'info_dict': {
+ 'id': 'd1dedf8c-d58c-38c3-8963-e899929ae0a9',
+ 'ext': 'mp4',
'title': 'Codefellas - The Cougar Lies with Spanish Moss',
'description': 'Agent Topple\'s mustache does its dirty work, and Nicole brokers a deal for peace. But why is the NSA collecting millions of Instagram brunch photos? And if your waffles have nothing to hide, what are they so worried about?',
},
},
+ {
+ 'url': 'https://movies.yahoo.com/video/world-loves-spider-man-190819223.html',
+ 'md5': '410b7104aa9893b765bc22787a22f3d9',
+ 'info_dict': {
+ 'id': '516ed8e2-2c4f-339f-a211-7a8b49d30845',
+ 'ext': 'mp4',
+ 'title': 'The World Loves Spider-Man',
+ 'description': '''People all over the world are celebrating the release of \"The Amazing Spider-Man 2.\" We're taking a look at the enthusiastic response Spider-Man has received from viewers all over the world.''',
+ }
+ }
]
def _real_extract(self, url):
@@ -42,16 +54,25 @@ class YahooIE(InfoExtractor):
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
- items_json = self._search_regex(r'mediaItems: ({.*?})$',
- webpage, 'items', flags=re.MULTILINE)
- items = json.loads(items_json)
- info = items['mediaItems']['query']['results']['mediaObj'][0]
- # The 'meta' field is not always in the video webpage, we request it
- # from another page
- long_id = info['id']
- return self._get_info(long_id, video_id)
-
- def _get_info(self, long_id, video_id):
+ items_json = self._search_regex(
+ r'mediaItems: ({.*?})$', webpage, 'items', flags=re.MULTILINE,
+ default=None)
+ if items_json is None:
+ CONTENT_ID_REGEXES = [
+ r'YUI\.namespace\("Media"\)\.CONTENT_ID\s*=\s*"([^"]+)"',
+ r'root\.App\.Cache\.context\.videoCache\.curVideo = \{"([^"]+)"'
+ ]
+ long_id = self._search_regex(CONTENT_ID_REGEXES, webpage, 'content ID')
+ video_id = long_id
+ else:
+ items = json.loads(items_json)
+ info = items['mediaItems']['query']['results']['mediaObj'][0]
+ # The 'meta' field is not always in the video webpage, we request it
+ # from another page
+ long_id = info['id']
+ return self._get_info(long_id, video_id, webpage)
+
+ def _get_info(self, long_id, video_id, webpage):
query = ('SELECT * FROM yahoo.media.video.streams WHERE id="%s"'
' AND plrs="86Gj0vCaSzV_Iuf6hNylf2" AND region="US"'
' AND protocol="http"' % long_id)
@@ -60,10 +81,9 @@ class YahooIE(InfoExtractor):
'env': 'prod',
'format': 'json',
})
- query_result_json = self._download_webpage(
+ query_result = self._download_json(
'http://video.query.yahoo.com/v1/public/yql?' + data,
video_id, 'Downloading video info')
- query_result = json.loads(query_result_json)
info = query_result['query']['results']['mediaObj'][0]
meta = info['meta']
@@ -86,7 +106,6 @@ class YahooIE(InfoExtractor):
else:
format_url = compat_urlparse.urljoin(host, path)
format_info['url'] = format_url
-
formats.append(format_info)
self._sort_formats(formats)
@@ -96,7 +115,7 @@ class YahooIE(InfoExtractor):
'title': meta['title'],
'formats': formats,
'description': clean_html(meta['description']),
- 'thumbnail': meta['thumbnail'],
+ 'thumbnail': meta['thumbnail'] if meta.get('thumbnail') else self._og_search_thumbnail(webpage),
}
@@ -104,7 +123,7 @@ class YahooNewsIE(YahooIE):
IE_NAME = 'yahoo:news'
_VALID_URL = r'http://news\.yahoo\.com/video/.*?-(?P<id>\d*?)\.html'
- _TEST = {
+ _TESTS = [{
'url': 'http://news.yahoo.com/video/china-moses-crazy-blues-104538833.html',
'md5': '67010fdf3a08d290e060a4dd96baa07b',
'info_dict': {
@@ -113,17 +132,14 @@ class YahooNewsIE(YahooIE):
'title': 'China Moses Is Crazy About the Blues',
'description': 'md5:9900ab8cd5808175c7b3fe55b979bed0',
},
- }
-
- # Overwrite YahooIE properties we don't want
- _TESTS = []
+ }]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
long_id = self._search_regex(r'contentId: \'(.+?)\',', webpage, 'long id')
- return self._get_info(long_id, video_id)
+ return self._get_info(long_id, video_id, webpage)
class YahooSearchIE(SearchInfoExtractor):
@@ -134,27 +150,25 @@ class YahooSearchIE(SearchInfoExtractor):
def _get_n_results(self, query, n):
"""Get a specified number of results for a query"""
-
- res = {
- '_type': 'playlist',
- 'id': query,
- 'entries': []
- }
- for pagenum in itertools.count(0):
+ entries = []
+ for pagenum in itertools.count(0):
result_url = 'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
- webpage = self._download_webpage(result_url, query,
- note='Downloading results page '+str(pagenum+1))
- info = json.loads(webpage)
+ info = self._download_json(result_url, query,
+ note='Downloading results page '+str(pagenum+1))
m = info['m']
results = info['results']
for (i, r) in enumerate(results):
- if (pagenum * 30) +i >= n:
+ if (pagenum * 30) + i >= n:
break
mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
- res['entries'].append(e)
- if (pagenum * 30 +i >= n) or (m['last'] >= (m['total'] -1)):
+ entries.append(e)
+ if (pagenum * 30 + i >= n) or (m['last'] >= (m['total'] - 1)):
break
- return res
+ return {
+ '_type': 'playlist',
+ 'id': query,
+ 'entries': entries,
+ }
diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py
index 77ad423..d456c4d 100644
--- a/youtube_dl/extractor/youporn.py
+++ b/youtube_dl/extractor/youporn.py
@@ -1,3 +1,6 @@
+from __future__ import unicode_literals
+
+
import json
import re
import sys
@@ -17,24 +20,25 @@ from ..aes import (
class YouPornIE(InfoExtractor):
- _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+))'
+ _VALID_URL = r'^(?P<proto>https?://)(?:www\.)?(?P<url>youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+))'
_TEST = {
- u'url': u'http://www.youporn.com/watch/505835/sex-ed-is-it-safe-to-masturbate-daily/',
- u'file': u'505835.mp4',
- u'md5': u'71ec5fcfddacf80f495efa8b6a8d9a89',
- u'info_dict': {
- u"upload_date": u"20101221",
- u"description": u"Love & Sex Answers: http://bit.ly/DanAndJenn -- Is It Unhealthy To Masturbate Daily?",
- u"uploader": u"Ask Dan And Jennifer",
- u"title": u"Sex Ed: Is It Safe To Masturbate Daily?",
- u"age_limit": 18,
+ 'url': 'http://www.youporn.com/watch/505835/sex-ed-is-it-safe-to-masturbate-daily/',
+ 'md5': '71ec5fcfddacf80f495efa8b6a8d9a89',
+ 'info_dict': {
+ 'id': '505835',
+ 'ext': 'mp4',
+ 'upload_date': '20101221',
+ 'description': 'Love & Sex Answers: http://bit.ly/DanAndJenn -- Is It Unhealthy To Masturbate Daily?',
+ 'uploader': 'Ask Dan And Jennifer',
+ 'title': 'Sex Ed: Is It Safe To Masturbate Daily?',
+ 'age_limit': 18,
}
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('videoid')
- url = 'http://www.' + mobj.group('url')
+ url = mobj.group('proto') + 'www.' + mobj.group('url')
req = compat_urllib_request.Request(url)
req.add_header('Cookie', 'age_verified=1')
@@ -42,7 +46,7 @@ class YouPornIE(InfoExtractor):
age_limit = self._rta_search(webpage)
# Get JSON parameters
- json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
+ json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, 'JSON parameters')
try:
params = json.loads(json_params)
except:
@@ -61,7 +65,7 @@ class YouPornIE(InfoExtractor):
# Get all of the links from the page
DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
- webpage, u'download list').strip()
+ webpage, 'download list').strip()
LINK_RE = r'<a href="([^"]+)">'
links = re.findall(LINK_RE, download_list_html)
@@ -86,7 +90,7 @@ class YouPornIE(InfoExtractor):
resolution = format_parts[0]
height = int(resolution[:-len('p')])
bitrate = int(format_parts[1][:-len('k')])
- format = u'-'.join(format_parts) + u'-' + dn
+ format = '-'.join(format_parts) + '-' + dn
formats.append({
'url': video_url,
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index a810368..7c50881 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -7,13 +7,13 @@ import itertools
import json
import os.path
import re
-import string
import struct
import traceback
import zlib
from .common import InfoExtractor, SearchInfoExtractor
from .subtitles import SubtitlesInfoExtractor
+from ..jsinterp import JSInterpreter
from ..utils import (
compat_chr,
compat_parse_qs,
@@ -29,7 +29,6 @@ from ..utils import (
ExtractorError,
int_or_none,
PagedList,
- RegexNotFoundError,
unescapeHTML,
unified_strdate,
orderedSet,
@@ -138,19 +137,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
(?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
(?:www\.)?deturl\.com/www\.youtube\.com/|
(?:www\.)?pwnyoutube\.com/|
+ (?:www\.)?yourepeat\.com/|
tube\.majestyc\.net/|
youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
(?:.*?\#/)? # handle anchor (#/) redirect urls
(?: # the various things that can precede the ID:
(?:(?:v|embed|e)/) # v/ or embed/ or e/
|(?: # or the v= param in all its forms
- (?:(?:watch|movie)(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
+ (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
(?:\?|\#!?) # the params delimiter ? or # or #!
(?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
v=
)
))
|youtu\.be/ # just youtu.be/xxxx
+ |https?://(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
)
)? # all until now is optional -> you can pass the naked ID
([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
@@ -176,32 +177,32 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
# 3d videos
- '82': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': '3D', 'preference': -20},
- '83': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': '3D', 'preference': -20},
- '84': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': '3D', 'preference': -20},
- '85': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': '3D', 'preference': -20},
- '100': {'ext': 'webm', 'height': 360, 'resolution': '360p', 'format_note': '3D', 'preference': -20},
- '101': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': '3D', 'preference': -20},
- '102': {'ext': 'webm', 'height': 720, 'resolution': '720p', 'format_note': '3D', 'preference': -20},
+ '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
+ '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
+ '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
+ '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
+ '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
+ '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
+ '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},
# Apple HTTP Live Streaming
- '92': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'HLS', 'preference': -10},
- '93': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': 'HLS', 'preference': -10},
- '94': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': 'HLS', 'preference': -10},
- '95': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': 'HLS', 'preference': -10},
- '96': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'HLS', 'preference': -10},
- '132': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'HLS', 'preference': -10},
- '151': {'ext': 'mp4', 'height': 72, 'resolution': '72p', 'format_note': 'HLS', 'preference': -10},
+ '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
+ '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
+ '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
+ '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
+ '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
+ '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
+ '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},
# DASH mp4 video
- '133': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'DASH video', 'preference': -40},
- '134': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': 'DASH video', 'preference': -40},
- '135': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': 'DASH video', 'preference': -40},
- '136': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': 'DASH video', 'preference': -40},
- '137': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH video', 'preference': -40},
- '138': {'ext': 'mp4', 'height': 1081, 'resolution': '>1080p', 'format_note': 'DASH video', 'preference': -40},
- '160': {'ext': 'mp4', 'height': 192, 'resolution': '192p', 'format_note': 'DASH video', 'preference': -40},
- '264': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH video', 'preference': -40},
+ '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
+ '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
+ '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
+ '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
+ '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
+ '138': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
+ '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
+ '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
# Dash mp4 audio
'139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
@@ -209,23 +210,24 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
'141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},
# Dash webm
- '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
- '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
- '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
- '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
- '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
- '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
- '242': {'ext': 'webm', 'height': 240, 'resolution': '240p', 'format_note': 'DASH webm', 'preference': -40},
- '243': {'ext': 'webm', 'height': 360, 'resolution': '360p', 'format_note': 'DASH webm', 'preference': -40},
- '244': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40},
- '245': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40},
- '246': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40},
- '247': {'ext': 'webm', 'height': 720, 'resolution': '720p', 'format_note': 'DASH webm', 'preference': -40},
- '248': {'ext': 'webm', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH webm', 'preference': -40},
+ '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
+ '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
+ '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
+ '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
+ '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
+ '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
+ '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
+ '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
+ '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
+ '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
+ '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
+ '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
+ '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
+ '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
# Dash webm audio
- '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH webm audio', 'abr': 48, 'preference': -50},
- '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH webm audio', 'abr': 256, 'preference': -50},
+ '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 48, 'preference': -50},
+ '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},
# RTMP (unnamed)
'_rtmp': {'protocol': 'rtmp'},
@@ -241,7 +243,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
u"uploader": u"Philipp Hagemeister",
u"uploader_id": u"phihag",
u"upload_date": u"20121002",
- u"description": u"test chars: \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
+ u"description": u"test chars: \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .",
+ u"categories": [u'Science & Technology'],
}
},
{
@@ -251,7 +254,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
u"info_dict": {
u"upload_date": u"20120506",
u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
- u"description": u"md5:5b292926389560516e384ac437c0ec07",
+ u"description": u"md5:fea86fda2d5a5784273df5c7cc994d9f",
u"uploader": u"Icona Pop",
u"uploader_id": u"IconaPop"
}
@@ -296,6 +299,23 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
u"format": "141",
},
},
+ # DASH manifest with encrypted signature
+ {
+ u'url': u'https://www.youtube.com/watch?v=IB3lcPjvWLA',
+ u'info_dict': {
+ u'id': u'IB3lcPjvWLA',
+ u'ext': u'm4a',
+ u'title': u'Afrojack - The Spark ft. Spree Wilson',
+ u'description': u'md5:9717375db5a9a3992be4668bbf3bc0a8',
+ u'uploader': u'AfrojackVEVO',
+ u'uploader_id': u'AfrojackVEVO',
+ u'upload_date': u'20131011',
+ },
+ u"params": {
+ u'youtube_include_dash_manifest': True,
+ u'format': '141',
+ },
+ },
]
@@ -421,113 +441,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
def _parse_sig_js(self, jscode):
funcname = self._search_regex(
r'signature=([a-zA-Z]+)', jscode,
- u'Initial JS player signature function name')
-
- functions = {}
-
- def argidx(varname):
- return string.lowercase.index(varname)
-
- def interpret_statement(stmt, local_vars, allow_recursion=20):
- if allow_recursion < 0:
- raise ExtractorError(u'Recursion limit reached')
-
- if stmt.startswith(u'var '):
- stmt = stmt[len(u'var '):]
- ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
- r'=(?P<expr>.*)$', stmt)
- if ass_m:
- if ass_m.groupdict().get('index'):
- def assign(val):
- lvar = local_vars[ass_m.group('out')]
- idx = interpret_expression(ass_m.group('index'),
- local_vars, allow_recursion)
- assert isinstance(idx, int)
- lvar[idx] = val
- return val
- expr = ass_m.group('expr')
- else:
- def assign(val):
- local_vars[ass_m.group('out')] = val
- return val
- expr = ass_m.group('expr')
- elif stmt.startswith(u'return '):
- assign = lambda v: v
- expr = stmt[len(u'return '):]
- else:
- raise ExtractorError(
- u'Cannot determine left side of statement in %r' % stmt)
-
- v = interpret_expression(expr, local_vars, allow_recursion)
- return assign(v)
-
- def interpret_expression(expr, local_vars, allow_recursion):
- if expr.isdigit():
- return int(expr)
-
- if expr.isalpha():
- return local_vars[expr]
-
- m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
- if m:
- member = m.group('member')
- val = local_vars[m.group('in')]
- if member == 'split("")':
- return list(val)
- if member == 'join("")':
- return u''.join(val)
- if member == 'length':
- return len(val)
- if member == 'reverse()':
- return val[::-1]
- slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
- if slice_m:
- idx = interpret_expression(
- slice_m.group('idx'), local_vars, allow_recursion-1)
- return val[idx:]
-
- m = re.match(
- r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
- if m:
- val = local_vars[m.group('in')]
- idx = interpret_expression(m.group('idx'), local_vars,
- allow_recursion-1)
- return val[idx]
-
- m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
- if m:
- a = interpret_expression(m.group('a'),
- local_vars, allow_recursion)
- b = interpret_expression(m.group('b'),
- local_vars, allow_recursion)
- return a % b
-
- m = re.match(
- r'^(?P<func>[a-zA-Z$]+)\((?P<args>[a-z0-9,]+)\)$', expr)
- if m:
- fname = m.group('func')
- if fname not in functions:
- functions[fname] = extract_function(fname)
- argvals = [int(v) if v.isdigit() else local_vars[v]
- for v in m.group('args').split(',')]
- return functions[fname](argvals)
- raise ExtractorError(u'Unsupported JS expression %r' % expr)
-
- def extract_function(funcname):
- func_m = re.search(
- r'function ' + re.escape(funcname) +
- r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
- jscode)
- argnames = func_m.group('args').split(',')
-
- def resf(args):
- local_vars = dict(zip(argnames, args))
- for stmt in func_m.group('code').split(';'):
- res = interpret_statement(stmt, local_vars)
- return res
- return resf
-
- initial_function = extract_function(funcname)
+ u'Initial JS player signature function name')
+
+ jsi = JSInterpreter(jscode)
+ initial_function = jsi.extract_function(funcname)
return lambda s: initial_function([s])
def _parse_sig_swf(self, file_contents):
@@ -1113,14 +1030,18 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
return self._download_webpage(url, video_id, note=u'Searching for annotations.', errnote=u'Unable to download video annotations.')
def _real_extract(self, url):
+ proto = (
+ u'http' if self._downloader.params.get('prefer_insecure', False)
+ else u'https')
+
# Extract original video URL from URL with redirection, like age verification, using next_url parameter
mobj = re.search(self._NEXT_URL_RE, url)
if mobj:
- url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
+ url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
video_id = self.extract_id(url)
# Get video webpage
- url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
+ url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
video_webpage = self._download_webpage(url, video_id)
# Attempt to extract SWF player URL
@@ -1145,7 +1066,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
'asv': 3,
'sts':'1588',
})
- video_info_url = 'https://www.youtube.com/get_video_info?' + data
+ video_info_url = proto + '://www.youtube.com/get_video_info?' + data
video_info_webpage = self._download_webpage(video_info_url, video_id,
note=False,
errnote='unable to download video info webpage')
@@ -1153,7 +1074,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
else:
age_gate = False
for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
- video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
+ video_info_url = (proto + '://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
% (video_id, el_type))
video_info_webpage = self._download_webpage(video_info_url, video_id,
note=False,
@@ -1163,9 +1084,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
break
if 'token' not in video_info:
if 'reason' in video_info:
- raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
+ raise ExtractorError(
+ u'YouTube said: %s' % video_info['reason'][0],
+ expected=True, video_id=video_id)
else:
- raise ExtractorError(u'"token" parameter not in video info for unknown reason')
+ raise ExtractorError(
+ u'"token" parameter not in video info for unknown reason',
+ video_id=video_id)
if 'view_count' in video_info:
view_count = int(video_info['view_count'][0])
@@ -1194,7 +1119,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
# title
if 'title' in video_info:
- video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
+ video_title = video_info['title'][0]
else:
self._downloader.report_warning(u'Unable to extract video title')
video_title = u'_'
@@ -1213,11 +1138,24 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
# upload date
upload_date = None
- mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
+ mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage)
+ if mobj is None:
+ mobj = re.search(
+ r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live) on (.*?)</strong>',
+ video_webpage)
if mobj is not None:
upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
upload_date = unified_strdate(upload_date)
+ m_cat_container = get_element_by_id("eow-category", video_webpage)
+ if m_cat_container:
+ category = self._html_search_regex(
+ r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
+ default=None)
+ video_categories = None if category is None else [category]
+ else:
+ video_categories = None
+
# description
video_description = get_element_by_id("eow-description", video_webpage)
if video_description:
@@ -1268,11 +1206,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
# Decide which formats to download
try:
- mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
+ mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
if not mobj:
raise ValueError('Could not find vevo ID')
- info = json.loads(mobj.group(1))
- args = info['args']
+ json_code = uppercase_escape(mobj.group(1))
+ ytplayer_config = json.loads(json_code)
+ args = ytplayer_config['args']
# Easy way to know if the 's' value is in url_encoded_fmt_stream_map
# this signatures are encrypted
if 'url_encoded_fmt_stream_map' not in args:
@@ -1365,12 +1304,24 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
# Look for the DASH manifest
- dash_manifest_url_lst = video_info.get('dashmpd')
- if (dash_manifest_url_lst and dash_manifest_url_lst[0] and
- self._downloader.params.get('youtube_include_dash_manifest', False)):
+ if (self._downloader.params.get('youtube_include_dash_manifest', False)):
try:
+ # The DASH manifest used needs to be the one from the original video_webpage.
+ # The one found in get_video_info seems to be using different signatures.
+ # However, in the case of an age restriction there won't be any embedded dashmpd in the video_webpage.
+ # Luckily, it seems, this case uses some kind of default signature (len == 86), so the
+ # combination of get_video_info and the _static_decrypt_signature() decryption fallback will work here.
+ if age_gate:
+ dash_manifest_url = video_info.get('dashmpd')[0]
+ else:
+ dash_manifest_url = ytplayer_config['args']['dashmpd']
+ def decrypt_sig(mobj):
+ s = mobj.group(1)
+ dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
+ return '/signature/%s' % dec_s
+ dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
dash_doc = self._download_xml(
- dash_manifest_url_lst[0], video_id,
+ dash_manifest_url, video_id,
note=u'Downloading DASH manifest',
errnote=u'Could not download DASH manifest')
for r in dash_doc.findall(u'.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
@@ -1411,11 +1362,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
'title': video_title,
'thumbnail': video_thumbnail,
'description': video_description,
+ 'categories': video_categories,
'subtitles': video_subtitles,
'duration': video_duration,
'age_limit': 18 if age_gate else 0,
'annotations': video_annotations,
- 'webpage_url': 'https://www.youtube.com/watch?v=%s' % video_id,
+ 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
'view_count': view_count,
'like_count': like_count,
'dislike_count': dislike_count,
@@ -1442,9 +1394,9 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
|
((?:PL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
)"""
- _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s&page=%s'
+ _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
_MORE_PAGES_INDICATOR = r'data-link-type="next"'
- _VIDEO_RE = r'href="/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
+ _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
IE_NAME = u'youtube:playlist'
def _real_initialize(self):
@@ -1459,11 +1411,13 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
# the id of the playlist is just 'RD' + video_id
url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix')
- title_span = (get_element_by_attribute('class', 'title long-title', webpage) or
- get_element_by_attribute('class', 'title ', webpage))
+ search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
+ title_span = (search_title('playlist-title') or
+ search_title('title long-title') or search_title('title'))
title = clean_html(title_span)
- video_re = r'data-index="\d+".*?href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s' % re.escape(playlist_id)
- ids = orderedSet(re.findall(video_re, webpage))
+ video_re = r'''(?x)data-video-username=".*?".*?
+ href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id)
+ ids = orderedSet(re.findall(video_re, webpage, flags=re.DOTALL))
url_results = self._ids_to_results(ids)
return self.playlist_result(url_results, playlist_id, title)
@@ -1483,7 +1437,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
return self.url_result(video_id, 'Youtube', video_id=video_id)
else:
- self.to_screen(u'Downloading playlist PL%s - add --no-playlist to just download video %s' % (playlist_id, video_id))
+ self.to_screen(u'Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
if playlist_id.startswith('RD'):
# Mixes require a custom extraction process
@@ -1492,29 +1446,41 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
raise ExtractorError(u'For downloading YouTube.com top lists, use '
u'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)
+ url = self._TEMPLATE_URL % playlist_id
+ page = self._download_webpage(url, playlist_id)
+ more_widget_html = content_html = page
+
+ # Check if the playlist exists or is private
+ if re.search(r'<div class="yt-alert-message">[^<]*?(The|This) playlist (does not exist|is private)[^<]*?</div>', page) is not None:
+ raise ExtractorError(
+ u'The playlist doesn\'t exist or is private, use --username or '
+ '--netrc to access it.',
+ expected=True)
+
# Extract the video ids from the playlist pages
ids = []
for page_num in itertools.count(1):
- url = self._TEMPLATE_URL % (playlist_id, page_num)
- page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
- matches = re.finditer(self._VIDEO_RE, page)
+ matches = re.finditer(self._VIDEO_RE, content_html)
# We remove the duplicates and the link with index 0
# (it's not the first video of the playlist)
new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
ids.extend(new_ids)
- if re.search(self._MORE_PAGES_INDICATOR, page) is None:
+ mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
+ if not mobj:
break
- try:
- playlist_title = self._og_search_title(page)
- except RegexNotFoundError:
- self.report_warning(
- u'Playlist page is missing OpenGraph title, falling back ...',
- playlist_id)
- playlist_title = self._html_search_regex(
- r'<h1 class="pl-header-title">(.*?)</h1>', page, u'title')
+ more = self._download_json(
+ 'https://youtube.com/%s' % mobj.group('more'), playlist_id,
+ 'Downloading page #%s' % page_num,
+ transform_source=uppercase_escape)
+ content_html = more['content_html']
+ more_widget_html = more['load_more_widget_html']
+
+ playlist_title = self._html_search_regex(
+ r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
+ page, u'title')
url_results = self._ids_to_results(ids)
return self.playlist_result(url_results, playlist_id, playlist_title)
@@ -1610,7 +1576,7 @@ class YoutubeChannelIE(InfoExtractor):
class YoutubeUserIE(InfoExtractor):
IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
- _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
+ _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
_TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
_GDATA_PAGE_SIZE = 50
_GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
@@ -1672,7 +1638,7 @@ class YoutubeUserIE(InfoExtractor):
class YoutubeSearchIE(SearchInfoExtractor):
IE_DESC = u'YouTube.com searches'
- _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
+ _API_URL = u'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
_MAX_RESULTS = 1000
IE_NAME = u'youtube:search'
_SEARCH_KEY = 'ytsearch'
@@ -1683,9 +1649,12 @@ class YoutubeSearchIE(SearchInfoExtractor):
video_ids = []
pagenum = 0
limit = n
+ PAGE_SIZE = 50
- while (50 * pagenum) < limit:
- result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
+ while (PAGE_SIZE * pagenum) < limit:
+ result_url = self._API_URL % (
+ compat_urllib_parse.quote_plus(query.encode('utf-8')),
+ (PAGE_SIZE * pagenum) + 1)
data_json = self._download_webpage(
result_url, video_id=u'query "%s"' % query,
note=u'Downloading page %s' % (pagenum + 1),
@@ -1709,12 +1678,50 @@ class YoutubeSearchIE(SearchInfoExtractor):
for video_id in video_ids]
return self.playlist_result(videos, query)
+
class YoutubeSearchDateIE(YoutubeSearchIE):
IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
_API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
_SEARCH_KEY = 'ytsearchdate'
IE_DESC = u'YouTube.com searches, newest videos first'
+
+class YoutubeSearchURLIE(InfoExtractor):
+ IE_DESC = u'YouTube.com search URLs'
+ IE_NAME = u'youtube:search_url'
+ _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ query = compat_urllib_parse.unquote_plus(mobj.group('query'))
+
+ webpage = self._download_webpage(url, query)
+ result_code = self._search_regex(
+ r'(?s)<ol id="search-results"(.*?)</ol>', webpage, u'result HTML')
+
+ part_codes = re.findall(
+ r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code)
+ entries = []
+ for part_code in part_codes:
+ part_title = self._html_search_regex(
+ r'(?s)title="([^"]+)"', part_code, 'item title', fatal=False)
+ part_url_snippet = self._html_search_regex(
+ r'(?s)href="([^"]+)"', part_code, 'item URL')
+ part_url = compat_urlparse.urljoin(
+ 'https://www.youtube.com/', part_url_snippet)
+ entries.append({
+ '_type': 'url',
+ 'url': part_url,
+ 'title': part_title,
+ })
+
+ return {
+ '_type': 'playlist',
+ 'entries': entries,
+ 'title': query,
+ }
+
+
class YoutubeShowIE(InfoExtractor):
IE_DESC = u'YouTube.com (multi-season) shows'
_VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
@@ -1758,23 +1765,25 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
feed_entries = []
paging = 0
for i in itertools.count(1):
- info = self._download_webpage(self._FEED_TEMPLATE % paging,
+ info = self._download_json(self._FEED_TEMPLATE % paging,
u'%s feed' % self._FEED_NAME,
u'Downloading page %s' % i)
- info = json.loads(info)
- feed_html = info['feed_html']
+ feed_html = info.get('feed_html') or info.get('content_html')
m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
ids = orderedSet(m.group(1) for m in m_ids)
feed_entries.extend(
self.url_result(video_id, 'Youtube', video_id=video_id)
for video_id in ids)
- if info['paging'] is None:
+ mobj = re.search(
+ r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)',
+ feed_html)
+ if mobj is None:
break
- paging = info['paging']
+ paging = mobj.group('paging')
return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
- IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword(requires authentication)'
+ IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
_VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
_FEED_NAME = 'subscriptions'
_PLAYLIST_TITLE = u'Youtube Subscriptions'
@@ -1815,7 +1824,7 @@ class YoutubeTruncatedURLIE(InfoExtractor):
IE_NAME = 'youtube:truncated_url'
IE_DESC = False # Do not list
_VALID_URL = r'''(?x)
- (?:https?://)?[^/]+/watch\?feature=[a-z_]+$|
+ (?:https?://)?[^/]+/watch\?(?:feature=[a-z_]+)?$|
(?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$
'''
diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py
index 829f002..3b1ac4e 100644
--- a/youtube_dl/extractor/zdf.py
+++ b/youtube_dl/extractor/zdf.py
@@ -1,4 +1,5 @@
# coding: utf-8
+from __future__ import unicode_literals
import re
@@ -13,52 +14,42 @@ class ZDFIE(InfoExtractor):
_VALID_URL = r'^https?://www\.zdf\.de/ZDFmediathek(?P<hash>#)?/(.*beitrag/(?:video/)?)(?P<video_id>[0-9]+)(?:/[^/?]+)?(?:\?.*)?'
_TEST = {
- u"url": u"http://www.zdf.de/ZDFmediathek/beitrag/video/2037704/ZDFspezial---Ende-des-Machtpokers--?bc=sts;stt",
- u"file": u"2037704.webm",
- u"info_dict": {
- u"upload_date": u"20131127",
- u"description": u"Union und SPD haben sich auf einen Koalitionsvertrag geeinigt. Aber was bedeutet das für die Bürger? Sehen Sie hierzu das ZDFspezial \"Ende des Machtpokers - Große Koalition für Deutschland\".",
- u"uploader": u"spezial",
- u"title": u"ZDFspezial - Ende des Machtpokers"
+ 'url': 'http://www.zdf.de/ZDFmediathek/beitrag/video/2037704/ZDFspezial---Ende-des-Machtpokers--?bc=sts;stt',
+ 'info_dict': {
+ 'id': '2037704',
+ 'ext': 'webm',
+ 'title': 'ZDFspezial - Ende des Machtpokers',
+ 'description': 'Union und SPD haben sich auf einen Koalitionsvertrag geeinigt. Aber was bedeutet das für die Bürger? Sehen Sie hierzu das ZDFspezial "Ende des Machtpokers - Große Koalition für Deutschland".',
+ 'duration': 1022,
+ 'uploader': 'spezial',
+ 'uploader_id': '225948',
+ 'upload_date': '20131127',
},
- u"skip": u"Videos on ZDF.de are depublicised in short order",
+ 'skip': 'Videos on ZDF.de are depublicised in short order',
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('video_id')
- xml_url = u'http://www.zdf.de/ZDFmediathek/xmlservice/web/beitragsDetails?ak=web&id=%s' % video_id
+ xml_url = 'http://www.zdf.de/ZDFmediathek/xmlservice/web/beitragsDetails?ak=web&id=%s' % video_id
doc = self._download_xml(
xml_url, video_id,
- note=u'Downloading video info',
- errnote=u'Failed to download video info')
+ note='Downloading video info',
+ errnote='Failed to download video info')
title = doc.find('.//information/title').text
description = doc.find('.//information/detail').text
+ duration = int(doc.find('.//details/lengthSec').text)
uploader_node = doc.find('.//details/originChannelTitle')
uploader = None if uploader_node is None else uploader_node.text
- duration_str = doc.find('.//details/length').text
- duration_m = re.match(r'''(?x)^
- (?P<hours>[0-9]{2})
- :(?P<minutes>[0-9]{2})
- :(?P<seconds>[0-9]{2})
- (?:\.(?P<ms>[0-9]+)?)
- ''', duration_str)
- duration = (
- (
- (int(duration_m.group('hours')) * 60 * 60) +
- (int(duration_m.group('minutes')) * 60) +
- int(duration_m.group('seconds'))
- )
- if duration_m
- else None
- )
+ uploader_id_node = doc.find('.//details/originChannelId')
+ uploader_id = None if uploader_id_node is None else uploader_id_node.text
upload_date = unified_strdate(doc.find('.//details/airtime').text)
def xml_to_format(fnode):
video_url = fnode.find('url').text
- is_available = u'http://www.metafilegenerator' not in video_url
+ is_available = 'http://www.metafilegenerator' not in video_url
format_id = fnode.attrib['basetype']
format_m = re.match(r'''(?x)
@@ -71,22 +62,28 @@ class ZDFIE(InfoExtractor):
quality = fnode.find('./quality').text
abr = int(fnode.find('./audioBitrate').text) // 1000
- vbr = int(fnode.find('./videoBitrate').text) // 1000
+ vbr_node = fnode.find('./videoBitrate')
+ vbr = None if vbr_node is None else int(vbr_node.text) // 1000
- format_note = u''
+ width_node = fnode.find('./width')
+ width = None if width_node is None else int_or_none(width_node.text)
+ height_node = fnode.find('./height')
+ height = None if height_node is None else int_or_none(height_node.text)
+
+ format_note = ''
if not format_note:
format_note = None
return {
- 'format_id': format_id + u'-' + quality,
+ 'format_id': format_id + '-' + quality,
'url': video_url,
'ext': ext,
'acodec': format_m.group('acodec'),
'vcodec': format_m.group('vcodec'),
'abr': abr,
'vbr': vbr,
- 'width': int_or_none(fnode.find('./width').text),
- 'height': int_or_none(fnode.find('./height').text),
+ 'width': width,
+ 'height': height,
'filesize': int_or_none(fnode.find('./filesize').text),
'format_note': format_note,
'protocol': proto,
@@ -103,9 +100,10 @@ class ZDFIE(InfoExtractor):
return {
'id': video_id,
'title': title,
- 'formats': formats,
'description': description,
- 'uploader': uploader,
'duration': duration,
+ 'uploader': uploader,
+ 'uploader_id': uploader_id,
'upload_date': upload_date,
- }
+ 'formats': formats,
+ } \ No newline at end of file
diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py
new file mode 100644
index 0000000..449482d
--- /dev/null
+++ b/youtube_dl/jsinterp.py
@@ -0,0 +1,116 @@
+from __future__ import unicode_literals
+
+import re
+
+from .utils import (
+ ExtractorError,
+)
+
+
+class JSInterpreter(object):
+ def __init__(self, code):
+ self.code = code
+ self._functions = {}
+
+ def interpret_statement(self, stmt, local_vars, allow_recursion=20):
+ if allow_recursion < 0:
+ raise ExtractorError('Recursion limit reached')
+
+ if stmt.startswith('var '):
+ stmt = stmt[len('var '):]
+ ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
+ r'=(?P<expr>.*)$', stmt)
+ if ass_m:
+ if ass_m.groupdict().get('index'):
+ def assign(val):
+ lvar = local_vars[ass_m.group('out')]
+ idx = self.interpret_expression(
+ ass_m.group('index'), local_vars, allow_recursion)
+ assert isinstance(idx, int)
+ lvar[idx] = val
+ return val
+ expr = ass_m.group('expr')
+ else:
+ def assign(val):
+ local_vars[ass_m.group('out')] = val
+ return val
+ expr = ass_m.group('expr')
+ elif stmt.startswith('return '):
+ assign = lambda v: v
+ expr = stmt[len('return '):]
+ else:
+ raise ExtractorError(
+ 'Cannot determine left side of statement in %r' % stmt)
+
+ v = self.interpret_expression(expr, local_vars, allow_recursion)
+ return assign(v)
+
+ def interpret_expression(self, expr, local_vars, allow_recursion):
+ if expr.isdigit():
+ return int(expr)
+
+ if expr.isalpha():
+ return local_vars[expr]
+
+ m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
+ if m:
+ member = m.group('member')
+ val = local_vars[m.group('in')]
+ if member == 'split("")':
+ return list(val)
+ if member == 'join("")':
+ return u''.join(val)
+ if member == 'length':
+ return len(val)
+ if member == 'reverse()':
+ return val[::-1]
+ slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
+ if slice_m:
+ idx = self.interpret_expression(
+ slice_m.group('idx'), local_vars, allow_recursion - 1)
+ return val[idx:]
+
+ m = re.match(
+ r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
+ if m:
+ val = local_vars[m.group('in')]
+ idx = self.interpret_expression(
+ m.group('idx'), local_vars, allow_recursion - 1)
+ return val[idx]
+
+ m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
+ if m:
+ a = self.interpret_expression(
+ m.group('a'), local_vars, allow_recursion)
+ b = self.interpret_expression(
+ m.group('b'), local_vars, allow_recursion)
+ return a % b
+
+ m = re.match(
+ r'^(?P<func>[a-zA-Z$]+)\((?P<args>[a-z0-9,]+)\)$', expr)
+ if m:
+ fname = m.group('func')
+ if fname not in self._functions:
+ self._functions[fname] = self.extract_function(fname)
+ argvals = [int(v) if v.isdigit() else local_vars[v]
+ for v in m.group('args').split(',')]
+ return self._functions[fname](argvals)
+ raise ExtractorError('Unsupported JS expression %r' % expr)
+
+ def extract_function(self, funcname):
+ func_m = re.search(
+ (r'(?:function %s|%s\s*=\s*function)' % (
+ re.escape(funcname), re.escape(funcname))) +
+ r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
+ self.code)
+ if func_m is None:
+ raise ExtractorError('Could not find JS function %r' % funcname)
+ argnames = func_m.group('args').split(',')
+
+ def resf(args):
+ local_vars = dict(zip(argnames, args))
+ for stmt in func_m.group('code').split(';'):
+ res = self.interpret_statement(stmt, local_vars)
+ return res
+ return resf
+
diff --git a/youtube_dl/postprocessor/__init__.py b/youtube_dl/postprocessor/__init__.py
index 7f19f71..08e6ddd 100644
--- a/youtube_dl/postprocessor/__init__.py
+++ b/youtube_dl/postprocessor/__init__.py
@@ -1,5 +1,7 @@
+from .atomicparsley import AtomicParsleyPP
from .ffmpeg import (
+ FFmpegAudioFixPP,
FFmpegMergerPP,
FFmpegMetadataPP,
FFmpegVideoConvertor,
@@ -9,6 +11,8 @@ from .ffmpeg import (
from .xattrpp import XAttrMetadataPP
__all__ = [
+ 'AtomicParsleyPP',
+ 'FFmpegAudioFixPP',
'FFmpegMergerPP',
'FFmpegMetadataPP',
'FFmpegVideoConvertor',
diff --git a/youtube_dl/postprocessor/atomicparsley.py b/youtube_dl/postprocessor/atomicparsley.py
new file mode 100644
index 0000000..765b2d9
--- /dev/null
+++ b/youtube_dl/postprocessor/atomicparsley.py
@@ -0,0 +1,56 @@
+# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+
+
+import os
+import subprocess
+
+from .common import PostProcessor
+
+from ..utils import (
+ check_executable,
+ compat_urlretrieve,
+ encodeFilename,
+ PostProcessingError,
+ prepend_extension,
+ shell_quote
+)
+
+
+class AtomicParsleyPPError(PostProcessingError):
+ pass
+
+
+class AtomicParsleyPP(PostProcessor):
+ def run(self, info):
+ if not check_executable('AtomicParsley', ['-v']):
+ raise AtomicParsleyPPError('AtomicParsley was not found. Please install.')
+
+ filename = info['filepath']
+ temp_filename = prepend_extension(filename, 'temp')
+ temp_thumbnail = prepend_extension(filename, 'thumb')
+
+ if not info.get('thumbnail'):
+ raise AtomicParsleyPPError('Thumbnail was not found. Nothing to do.')
+
+ compat_urlretrieve(info['thumbnail'], temp_thumbnail)
+
+ cmd = ['AtomicParsley', filename, '--artwork', temp_thumbnail, '-o', temp_filename]
+
+ self._downloader.to_screen('[atomicparsley] Adding thumbnail to "%s"' % filename)
+
+ if self._downloader.params.get('verbose', False):
+ self._downloader.to_screen('[debug] AtomicParsley command line: %s' % shell_quote(cmd))
+
+ p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+ stdout, stderr = p.communicate()
+
+ if p.returncode != 0:
+ msg = stderr.decode('utf-8', 'replace').strip()
+ raise AtomicParsleyPPError(msg)
+
+ os.remove(encodeFilename(filename))
+ os.remove(encodeFilename(temp_thumbnail))
+ os.rename(encodeFilename(temp_filename), encodeFilename(filename))
+
+ return True, info
diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py
index c22f2cd..45328ed 100644
--- a/youtube_dl/postprocessor/ffmpeg.py
+++ b/youtube_dl/postprocessor/ffmpeg.py
@@ -9,6 +9,7 @@ from .common import AudioConversionError, PostProcessor
from ..utils import (
check_executable,
compat_subprocess_get_DEVNULL,
+ encodeArgument,
encodeFilename,
PostProcessingError,
prepend_extension,
@@ -48,13 +49,13 @@ class FFmpegPostProcessor(PostProcessor):
for path in input_paths:
files_cmd.extend(['-i', encodeFilename(path, True)])
cmd = ([self._get_executable(), '-y'] + files_cmd
- + opts +
+ + [encodeArgument(o) for o in opts] +
[encodeFilename(self._ffmpeg_filename_argument(out_path), True)])
if self._downloader.params.get('verbose', False):
self._downloader.to_screen(u'[debug] ffmpeg command line: %s' % shell_quote(cmd))
p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
- stdout,stderr = p.communicate()
+ stdout, stderr = p.communicate()
if p.returncode != 0:
stderr = stderr.decode('utf-8', 'replace')
msg = stderr.strip().split('\n')[-1]
@@ -464,7 +465,11 @@ class FFmpegMetadataPP(FFmpegPostProcessor):
filename = info['filepath']
temp_filename = prepend_extension(filename, 'temp')
- options = ['-c', 'copy']
+ if info['ext'] == u'm4a':
+ options = ['-vn', '-acodec', 'copy']
+ else:
+ options = ['-c', 'copy']
+
for (name, value) in metadata.items():
options.extend(['-metadata', '%s=%s' % (name, value)])
@@ -483,3 +488,17 @@ class FFmpegMergerPP(FFmpegPostProcessor):
self.run_ffmpeg_multiple_files(info['__files_to_merge'], filename, args)
return True, info
+
+class FFmpegAudioFixPP(FFmpegPostProcessor):
+ def run(self, info):
+ filename = info['filepath']
+ temp_filename = prepend_extension(filename, 'temp')
+
+ options = ['-vn', '-acodec', 'copy']
+ self._downloader.to_screen(u'[ffmpeg] Fixing audio file "%s"' % filename)
+ self.run_ffmpeg(filename, temp_filename, options)
+
+ os.remove(encodeFilename(filename))
+ os.rename(encodeFilename(temp_filename), encodeFilename(filename))
+
+ return True, info
diff --git a/youtube_dl/postprocessor/xattrpp.py b/youtube_dl/postprocessor/xattrpp.py
index 1897924..f694094 100644
--- a/youtube_dl/postprocessor/xattrpp.py
+++ b/youtube_dl/postprocessor/xattrpp.py
@@ -6,6 +6,7 @@ from .common import PostProcessor
from ..utils import (
check_executable,
hyphenate_date,
+ subprocess_check_output
)
@@ -57,7 +58,7 @@ class XAttrMetadataPP(PostProcessor):
elif user_has_xattr:
cmd = ['xattr', '-w', key, value, path]
- subprocess.check_output(cmd)
+ subprocess_check_output(cmd)
else:
# On Unix, and can't find pyxattr, setfattr, or xattr.
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index 057cd20..b97e62a 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -1,10 +1,14 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
+import calendar
+import codecs
+import contextlib
import ctypes
import datetime
import email.utils
import errno
+import getpass
import gzip
import itertools
import io
@@ -21,6 +25,7 @@ import struct
import subprocess
import sys
import traceback
+import xml.etree.ElementTree
import zlib
try:
@@ -174,6 +179,11 @@ try:
except NameError:
compat_chr = chr
+try:
+ from xml.etree.ElementTree import ParseError as compat_xml_parse_error
+except ImportError: # Python 2.6
+ from xml.parsers.expat import ExpatError as compat_xml_parse_error
+
def compat_ord(c):
if type(c) is int: return c
else: return ord(c)
@@ -493,13 +503,13 @@ def orderedSet(iterable):
res.append(el)
return res
+
def unescapeHTML(s):
- """
- @param s a string
- """
- assert type(s) == type(u'')
+ if s is None:
+ return None
+ assert type(s) == compat_str
- result = re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
+ result = re.sub(r'(?u)&(.+?);', htmlentity_transform, s)
return result
@@ -531,6 +541,15 @@ def encodeFilename(s, for_subprocess=False):
return s.encode(encoding, 'ignore')
+def encodeArgument(s):
+ if not isinstance(s, compat_str):
+ # Legacy code that uses byte strings
+ # Uncomment the following line after fixing all post processors
+ #assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
+ s = s.decode('ascii')
+ return encodeFilename(s, True)
+
+
def decodeOption(optval):
if optval is None:
return optval
@@ -585,13 +604,15 @@ def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
class ExtractorError(Exception):
"""Error during info extraction."""
- def __init__(self, msg, tb=None, expected=False, cause=None):
+ def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
""" tb, if given, is the original traceback (so that it can be printed out).
If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
"""
if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
expected = True
+ if video_id is not None:
+ msg = video_id + ': ' + msg
if not expected:
msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
super(ExtractorError, self).__init__(msg)
@@ -599,6 +620,7 @@ class ExtractorError(Exception):
self.traceback = tb
self.exc_info = sys.exc_info() # preserve original exception
self.cause = cause
+ self.video_id = video_id
def format_traceback(self):
if self.traceback is None:
@@ -753,8 +775,37 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
https_response = http_response
+def parse_iso8601(date_str):
+ """ Return a UNIX timestamp from the given date """
+
+ if date_str is None:
+ return None
+
+ m = re.search(
+ r'Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$',
+ date_str)
+ if not m:
+ timezone = datetime.timedelta()
+ else:
+ date_str = date_str[:-len(m.group(0))]
+ if not m.group('sign'):
+ timezone = datetime.timedelta()
+ else:
+ sign = 1 if m.group('sign') == '+' else -1
+ timezone = datetime.timedelta(
+ hours=sign * int(m.group('hours')),
+ minutes=sign * int(m.group('minutes')))
+
+ dt = datetime.datetime.strptime(date_str, '%Y-%m-%dT%H:%M:%S') - timezone
+ return calendar.timegm(dt.timetuple())
+
+
def unified_strdate(date_str):
"""Return a string with the date in the format YYYYMMDD"""
+
+ if date_str is None:
+ return None
+
upload_date = None
#Replace commas
date_str = date_str.replace(',', ' ')
@@ -766,14 +817,17 @@ def unified_strdate(date_str):
'%B %d %Y',
'%b %d %Y',
'%Y-%m-%d',
+ '%d.%m.%Y',
'%d/%m/%Y',
'%Y/%m/%d %H:%M:%S',
'%Y-%m-%d %H:%M:%S',
'%d.%m.%Y %H:%M',
+ '%d.%m.%Y %H.%M',
'%Y-%m-%dT%H:%M:%SZ',
'%Y-%m-%dT%H:%M:%S.%fZ',
'%Y-%m-%dT%H:%M:%S.%f0Z',
'%Y-%m-%dT%H:%M:%S',
+ '%Y-%m-%dT%H:%M:%S.%f',
'%Y-%m-%dT%H:%M',
]
for expression in format_expressions:
@@ -869,25 +923,97 @@ def platform_name():
return res
-def write_string(s, out=None):
+def _windows_write_string(s, out):
+ """ Returns True if the string was written using special methods,
+ False if it has yet to be written out."""
+ # Adapted from http://stackoverflow.com/a/3259271/35070
+
+ import ctypes
+ import ctypes.wintypes
+
+ WIN_OUTPUT_IDS = {
+ 1: -11,
+ 2: -12,
+ }
+
+ try:
+ fileno = out.fileno()
+ except AttributeError:
+ # If the output stream doesn't have a fileno, it's virtual
+ return False
+ if fileno not in WIN_OUTPUT_IDS:
+ return False
+
+ GetStdHandle = ctypes.WINFUNCTYPE(
+ ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
+ ("GetStdHandle", ctypes.windll.kernel32))
+ h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
+
+ WriteConsoleW = ctypes.WINFUNCTYPE(
+ ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
+ ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
+ ctypes.wintypes.LPVOID)(("WriteConsoleW", ctypes.windll.kernel32))
+ written = ctypes.wintypes.DWORD(0)
+
+ GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(("GetFileType", ctypes.windll.kernel32))
+ FILE_TYPE_CHAR = 0x0002
+ FILE_TYPE_REMOTE = 0x8000
+ GetConsoleMode = ctypes.WINFUNCTYPE(
+ ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
+ ctypes.POINTER(ctypes.wintypes.DWORD))(
+ ("GetConsoleMode", ctypes.windll.kernel32))
+ INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
+
+ def not_a_console(handle):
+ if handle == INVALID_HANDLE_VALUE or handle is None:
+ return True
+ return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
+ or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
+
+ if not_a_console(h):
+ return False
+
+ def next_nonbmp_pos(s):
+ try:
+ return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
+ except StopIteration:
+ return len(s)
+
+ while s:
+ count = min(next_nonbmp_pos(s), 1024)
+
+ ret = WriteConsoleW(
+ h, s, count if count else 2, ctypes.byref(written), None)
+ if ret == 0:
+ raise OSError('Failed to write string')
+ if not count: # We just wrote a non-BMP character
+ assert written.value == 2
+ s = s[1:]
+ else:
+ assert written.value > 0
+ s = s[written.value:]
+ return True
+
+
+def write_string(s, out=None, encoding=None):
if out is None:
out = sys.stderr
assert type(s) == compat_str
+ if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
+ if _windows_write_string(s, out):
+ return
+
if ('b' in getattr(out, 'mode', '') or
sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
- s = s.encode(preferredencoding(), 'ignore')
- try:
+ byt = s.encode(encoding or preferredencoding(), 'ignore')
+ out.write(byt)
+ elif hasattr(out, 'buffer'):
+ enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
+ byt = s.encode(enc, 'ignore')
+ out.buffer.write(byt)
+ else:
out.write(s)
- except UnicodeEncodeError:
- # In Windows shells, this can fail even when the codec is just charmap!?
- # See https://wiki.python.org/moin/PrintFails#Issue
- if sys.platform == 'win32' and hasattr(out, 'encoding'):
- s = s.encode(out.encoding, 'ignore').decode(out.encoding)
- out.write(s)
- else:
- raise
-
out.flush()
@@ -1111,11 +1237,11 @@ def setproctitle(title):
libc = ctypes.cdll.LoadLibrary("libc.so.6")
except OSError:
return
- title = title
- buf = ctypes.create_string_buffer(len(title) + 1)
- buf.value = title.encode('utf-8')
+ title_bytes = title.encode('utf-8')
+ buf = ctypes.create_string_buffer(len(title_bytes))
+ buf.value = title_bytes
try:
- libc.prctl(15, ctypes.byref(buf), 0, 0, 0)
+ libc.prctl(15, buf, 0, 0, 0)
except AttributeError:
return # Strange libc, just skip this
@@ -1136,8 +1262,15 @@ class HEADRequest(compat_urllib_request.Request):
return "HEAD"
-def int_or_none(v, scale=1):
- return v if v is None else (int(v) // scale)
+def int_or_none(v, scale=1, default=None, get_attr=None):
+ if get_attr:
+ if v is not None:
+ v = getattr(v, get_attr, None)
+ return default if v is None else (int(v) // scale)
+
+
+def float_or_none(v, scale=1, default=None):
+ return default if v is None else (float(v) / scale)
def parse_duration(s):
@@ -1145,7 +1278,7 @@ def parse_duration(s):
return None
m = re.match(
- r'(?:(?:(?P<hours>[0-9]+)[:h])?(?P<mins>[0-9]+)[:m])?(?P<secs>[0-9]+)s?$', s)
+ r'(?:(?:(?P<hours>[0-9]+)[:h])?(?P<mins>[0-9]+)[:m])?(?P<secs>[0-9]+)s?(?::[0-9]+)?$', s)
if not m:
return None
res = int(m.group('secs'))
@@ -1219,9 +1352,11 @@ class PagedList(object):
def uppercase_escape(s):
+ unicode_escape = codecs.getdecoder('unicode_escape')
return re.sub(
- r'\\U([0-9a-fA-F]{8})',
- lambda m: compat_chr(int(m.group(1), base=16)), s)
+ r'\\U[0-9a-fA-F]{8}',
+ lambda m: unicode_escape(m.group(0))[0],
+ s)
try:
struct.pack(u'!I', 0)
@@ -1239,3 +1374,80 @@ except TypeError:
else:
struct_pack = struct.pack
struct_unpack = struct.unpack
+
+
+def read_batch_urls(batch_fd):
+ def fixup(url):
+ if not isinstance(url, compat_str):
+ url = url.decode('utf-8', 'replace')
+ BOM_UTF8 = u'\xef\xbb\xbf'
+ if url.startswith(BOM_UTF8):
+ url = url[len(BOM_UTF8):]
+ url = url.strip()
+ if url.startswith(('#', ';', ']')):
+ return False
+ return url
+
+ with contextlib.closing(batch_fd) as fd:
+ return [url for url in map(fixup, fd) if url]
+
+
+def urlencode_postdata(*args, **kargs):
+ return compat_urllib_parse.urlencode(*args, **kargs).encode('ascii')
+
+
+def parse_xml(s):
+ class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
+ def doctype(self, name, pubid, system):
+ pass # Ignore doctypes
+
+ parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
+ kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
+ return xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
+
+
+if sys.version_info < (3, 0) and sys.platform == 'win32':
+ def compat_getpass(prompt, *args, **kwargs):
+ if isinstance(prompt, compat_str):
+ prompt = prompt.encode(preferredencoding())
+ return getpass.getpass(prompt, *args, **kwargs)
+else:
+ compat_getpass = getpass.getpass
+
+
+US_RATINGS = {
+ 'G': 0,
+ 'PG': 10,
+ 'PG-13': 13,
+ 'R': 16,
+ 'NC': 18,
+}
+
+
+def strip_jsonp(code):
+ return re.sub(r'(?s)^[a-zA-Z_]+\s*\(\s*(.*)\);\s*?\s*$', r'\1', code)
+
+
+def qualities(quality_ids):
+ """ Get a numeric quality value out of a list of possible values """
+ def q(qid):
+ try:
+ return quality_ids.index(qid)
+ except ValueError:
+ return -1
+ return q
+
+
+DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
+
+try:
+ subprocess_check_output = subprocess.check_output
+except AttributeError:
+ def subprocess_check_output(*args, **kwargs):
+ assert 'input' not in kwargs
+ p = subprocess.Popen(*args, stdout=subprocess.PIPE, **kwargs)
+ output, _ = p.communicate()
+ ret = p.poll()
+ if ret:
+ raise subprocess.CalledProcessError(ret, p.args, output=output)
+ return output
diff --git a/youtube_dl/version.py b/youtube_dl/version.py
index a9fead9..6fe7c7b 100644
--- a/youtube_dl/version.py
+++ b/youtube_dl/version.py
@@ -1,2 +1,2 @@
-__version__ = '2014.02.17'
+__version__ = '2014.06.07'