aboutsummaryrefslogtreecommitdiffstats
path: root/youtube_dl
diff options
context:
space:
mode:
authorRogério Brito <rbrito@ime.usp.br>2016-06-24 22:51:49 -0300
committerRogério Brito <rbrito@ime.usp.br>2016-06-24 22:51:49 -0300
commit9dc487f48b50767cf540fa36c3de2c386fd74c04 (patch)
treefe4832e21f6d0a9111aed7f141fcfd957c1481c5 /youtube_dl
parent9f2b33881274af98a9145c533a1d295fad71521a (diff)
downloadyoutube-dl-9dc487f48b50767cf540fa36c3de2c386fd74c04.zip
youtube-dl-9dc487f48b50767cf540fa36c3de2c386fd74c04.tar.gz
youtube-dl-9dc487f48b50767cf540fa36c3de2c386fd74c04.tar.bz2
Imported Upstream version 2016.06.25
Diffstat (limited to 'youtube_dl')
-rwxr-xr-xyoutube_dl/YoutubeDL.py132
-rw-r--r--youtube_dl/__init__.py37
-rw-r--r--youtube_dl/compat.py2365
-rw-r--r--youtube_dl/downloader/__init__.py24
-rw-r--r--youtube_dl/downloader/common.py11
-rw-r--r--youtube_dl/downloader/dash.py42
-rw-r--r--youtube_dl/downloader/external.py137
-rw-r--r--youtube_dl/downloader/f4m.py62
-rw-r--r--youtube_dl/downloader/fragment.py12
-rw-r--r--youtube_dl/downloader/hls.py184
-rw-r--r--youtube_dl/downloader/rtsp.py2
-rw-r--r--youtube_dl/extractor/__init__.py984
-rw-r--r--youtube_dl/extractor/abc.py2
-rw-r--r--youtube_dl/extractor/abc7news.py1
-rw-r--r--youtube_dl/extractor/abcnews.py135
-rw-r--r--youtube_dl/extractor/acast.py38
-rw-r--r--youtube_dl/extractor/addanime.py6
-rw-r--r--youtube_dl/extractor/adobetv.py5
-rw-r--r--youtube_dl/extractor/aenetworks.py39
-rw-r--r--youtube_dl/extractor/afreecatv.py133
-rw-r--r--youtube_dl/extractor/aftonbladet.py6
-rw-r--r--youtube_dl/extractor/aljazeera.py20
-rw-r--r--youtube_dl/extractor/amp.py8
-rw-r--r--youtube_dl/extractor/animeondemand.py168
-rw-r--r--youtube_dl/extractor/anvato.py224
-rw-r--r--youtube_dl/extractor/aol.py145
-rw-r--r--youtube_dl/extractor/appletrailers.py53
-rw-r--r--youtube_dl/extractor/ard.py41
-rw-r--r--youtube_dl/extractor/arte.py279
-rw-r--r--youtube_dl/extractor/atresplayer.py12
-rw-r--r--youtube_dl/extractor/audimedia.py19
-rw-r--r--youtube_dl/extractor/audioboom.py66
-rw-r--r--youtube_dl/extractor/audiomack.py15
-rw-r--r--youtube_dl/extractor/azubu.py22
-rw-r--r--youtube_dl/extractor/baidu.py2
-rw-r--r--youtube_dl/extractor/bambuser.py10
-rw-r--r--youtube_dl/extractor/bandcamp.py8
-rw-r--r--youtube_dl/extractor/bbc.py116
-rw-r--r--youtube_dl/extractor/beeg.py33
-rw-r--r--youtube_dl/extractor/behindkink.py2
-rw-r--r--youtube_dl/extractor/bet.py93
-rw-r--r--youtube_dl/extractor/bilibili.py205
-rw-r--r--youtube_dl/extractor/biobiochiletv.py86
-rw-r--r--youtube_dl/extractor/biqle.py39
-rw-r--r--youtube_dl/extractor/bleacherreport.py10
-rw-r--r--youtube_dl/extractor/bloomberg.py3
-rw-r--r--youtube_dl/extractor/bokecc.py60
-rw-r--r--youtube_dl/extractor/bpb.py2
-rw-r--r--youtube_dl/extractor/br.py9
-rw-r--r--youtube_dl/extractor/bravotv.py31
-rw-r--r--youtube_dl/extractor/breakcom.py2
-rw-r--r--youtube_dl/extractor/brightcove.py191
-rw-r--r--youtube_dl/extractor/byutv.py4
-rw-r--r--youtube_dl/extractor/c56.py22
-rw-r--r--youtube_dl/extractor/camdemy.py8
-rw-r--r--youtube_dl/extractor/camwithher.py87
-rw-r--r--youtube_dl/extractor/canalplus.py80
-rw-r--r--youtube_dl/extractor/carambatv.py88
-rw-r--r--youtube_dl/extractor/cbc.py96
-rw-r--r--youtube_dl/extractor/cbs.py81
-rw-r--r--youtube_dl/extractor/cbsinteractive.py (renamed from youtube_dl/extractor/cnet.py)60
-rw-r--r--youtube_dl/extractor/cbslocal.py84
-rw-r--r--youtube_dl/extractor/cbsnews.py50
-rw-r--r--youtube_dl/extractor/cbssports.py40
-rw-r--r--youtube_dl/extractor/ccc.py111
-rwxr-xr-xyoutube_dl/extractor/cda.py98
-rw-r--r--youtube_dl/extractor/ceskatelevize.py47
-rw-r--r--youtube_dl/extractor/channel9.py125
-rw-r--r--youtube_dl/extractor/chaturbate.py1
-rw-r--r--youtube_dl/extractor/cinemassacre.py107
-rw-r--r--youtube_dl/extractor/cliphunter.py2
-rw-r--r--youtube_dl/extractor/cliprs.py90
-rw-r--r--youtube_dl/extractor/clipsyndicate.py2
-rw-r--r--youtube_dl/extractor/closertotruth.py92
-rw-r--r--youtube_dl/extractor/cloudy.py8
-rw-r--r--youtube_dl/extractor/clubic.py2
-rw-r--r--youtube_dl/extractor/cnbc.py36
-rw-r--r--youtube_dl/extractor/collegehumor.py101
-rw-r--r--youtube_dl/extractor/comcarcoff.py13
-rw-r--r--youtube_dl/extractor/comedycentral.py11
-rw-r--r--youtube_dl/extractor/common.py303
-rw-r--r--youtube_dl/extractor/commonprotocols.py36
-rw-r--r--youtube_dl/extractor/condenast.py6
-rw-r--r--youtube_dl/extractor/coub.py143
-rw-r--r--youtube_dl/extractor/crunchyroll.py56
-rw-r--r--youtube_dl/extractor/cspan.py2
-rw-r--r--youtube_dl/extractor/ctsnews.py2
-rw-r--r--youtube_dl/extractor/cwtv.py6
-rw-r--r--youtube_dl/extractor/dailymail.py61
-rw-r--r--youtube_dl/extractor/daum.py6
-rw-r--r--youtube_dl/extractor/dcn.py56
-rw-r--r--youtube_dl/extractor/dctp.py2
-rw-r--r--youtube_dl/extractor/deezer.py4
-rw-r--r--youtube_dl/extractor/defense.py2
-rw-r--r--youtube_dl/extractor/democracynow.py45
-rw-r--r--youtube_dl/extractor/dfb.py34
-rw-r--r--youtube_dl/extractor/discovery.py45
-rw-r--r--youtube_dl/extractor/dispeak.py114
-rw-r--r--youtube_dl/extractor/douyutv.py57
-rw-r--r--youtube_dl/extractor/dplay.py154
-rw-r--r--youtube_dl/extractor/dramafever.py4
-rw-r--r--youtube_dl/extractor/dreisat.py2
-rw-r--r--youtube_dl/extractor/dump.py39
-rw-r--r--youtube_dl/extractor/dvtv.py2
-rw-r--r--youtube_dl/extractor/dw.py108
-rw-r--r--youtube_dl/extractor/eagleplatform.py35
-rw-r--r--youtube_dl/extractor/ebaumsworld.py4
-rw-r--r--youtube_dl/extractor/echomsk.py2
-rw-r--r--youtube_dl/extractor/elpais.py31
-rw-r--r--youtube_dl/extractor/engadget.py25
-rw-r--r--youtube_dl/extractor/eporner.py12
-rw-r--r--youtube_dl/extractor/eroprofile.py4
-rw-r--r--youtube_dl/extractor/espn.py6
-rw-r--r--youtube_dl/extractor/exfm.py2
-rw-r--r--youtube_dl/extractor/extractors.py1064
-rw-r--r--youtube_dl/extractor/eyedotv.py64
-rw-r--r--youtube_dl/extractor/facebook.py111
-rw-r--r--youtube_dl/extractor/fc2.py7
-rw-r--r--youtube_dl/extractor/fczenit.py33
-rw-r--r--youtube_dl/extractor/firstpost.py2
-rw-r--r--youtube_dl/extractor/firsttv.py139
-rw-r--r--youtube_dl/extractor/fivemin.py53
-rw-r--r--youtube_dl/extractor/fktv.py2
-rw-r--r--youtube_dl/extractor/flickr.py30
-rw-r--r--youtube_dl/extractor/footyroom.py2
-rw-r--r--youtube_dl/extractor/formula1.py26
-rw-r--r--youtube_dl/extractor/fox.py3
-rw-r--r--youtube_dl/extractor/foxgay.py2
-rw-r--r--youtube_dl/extractor/foxnews.py12
-rw-r--r--youtube_dl/extractor/foxsports.py20
-rw-r--r--youtube_dl/extractor/franceinter.py2
-rw-r--r--youtube_dl/extractor/francetv.py56
-rw-r--r--youtube_dl/extractor/freespeech.py2
-rw-r--r--youtube_dl/extractor/freevideo.py2
-rw-r--r--youtube_dl/extractor/funimation.py53
-rw-r--r--youtube_dl/extractor/funnyordie.py4
-rw-r--r--youtube_dl/extractor/gameinformer.py31
-rw-r--r--youtube_dl/extractor/gamekings.py2
-rw-r--r--youtube_dl/extractor/gamespot.py93
-rw-r--r--youtube_dl/extractor/gamestar.py2
-rw-r--r--youtube_dl/extractor/gametrailers.py62
-rw-r--r--youtube_dl/extractor/gazeta.py12
-rw-r--r--youtube_dl/extractor/gdcvault.py110
-rw-r--r--youtube_dl/extractor/generic.py410
-rw-r--r--youtube_dl/extractor/glide.py31
-rw-r--r--youtube_dl/extractor/godtv.py66
-rw-r--r--youtube_dl/extractor/googledrive.py12
-rw-r--r--youtube_dl/extractor/goshgay.py6
-rw-r--r--youtube_dl/extractor/gputechconf.py36
-rw-r--r--youtube_dl/extractor/groupon.py35
-rw-r--r--youtube_dl/extractor/hbo.py122
-rw-r--r--youtube_dl/extractor/hearthisat.py40
-rw-r--r--youtube_dl/extractor/hotnewhiphop.py6
-rw-r--r--youtube_dl/extractor/howcast.py4
-rw-r--r--youtube_dl/extractor/howstuffworks.py20
-rw-r--r--youtube_dl/extractor/huffpost.py36
-rw-r--r--youtube_dl/extractor/hypem.py6
-rw-r--r--youtube_dl/extractor/imdb.py47
-rw-r--r--youtube_dl/extractor/indavideo.py5
-rw-r--r--youtube_dl/extractor/infoq.py31
-rw-r--r--youtube_dl/extractor/instagram.py101
-rw-r--r--youtube_dl/extractor/internetvideoarchive.py118
-rw-r--r--youtube_dl/extractor/iprima.py44
-rw-r--r--youtube_dl/extractor/iqiyi.py85
-rw-r--r--youtube_dl/extractor/ivideon.py4
-rw-r--r--youtube_dl/extractor/izlesene.py7
-rw-r--r--youtube_dl/extractor/jadorecettepub.py47
-rw-r--r--youtube_dl/extractor/jeuxvideo.py4
-rw-r--r--youtube_dl/extractor/jwplatform.py121
-rw-r--r--youtube_dl/extractor/kaltura.py71
-rw-r--r--youtube_dl/extractor/karaoketv.py58
-rw-r--r--youtube_dl/extractor/karrierevideos.py7
-rw-r--r--youtube_dl/extractor/khanacademy.py4
-rw-r--r--youtube_dl/extractor/kontrtube.py2
-rw-r--r--youtube_dl/extractor/ku6.py2
-rw-r--r--youtube_dl/extractor/kusi.py99
-rw-r--r--youtube_dl/extractor/kuwo.py131
-rw-r--r--youtube_dl/extractor/laola1tv.py42
-rw-r--r--youtube_dl/extractor/learnr.py33
-rw-r--r--youtube_dl/extractor/lecture2go.py17
-rw-r--r--youtube_dl/extractor/leeco.py (renamed from youtube_dl/extractor/letv.py)118
-rw-r--r--youtube_dl/extractor/libraryofcongress.py143
-rw-r--r--youtube_dl/extractor/lifenews.py167
-rw-r--r--youtube_dl/extractor/limelight.py68
-rw-r--r--youtube_dl/extractor/litv.py137
-rw-r--r--youtube_dl/extractor/liveleak.py17
-rw-r--r--youtube_dl/extractor/livestream.py15
-rw-r--r--youtube_dl/extractor/localnews8.py47
-rw-r--r--youtube_dl/extractor/lrt.py1
-rw-r--r--youtube_dl/extractor/lynda.py153
-rw-r--r--youtube_dl/extractor/m6.py2
-rw-r--r--youtube_dl/extractor/mailru.py6
-rw-r--r--youtube_dl/extractor/makerschannel.py40
-rw-r--r--youtube_dl/extractor/malemotion.py46
-rw-r--r--youtube_dl/extractor/matchtv.py26
-rw-r--r--youtube_dl/extractor/mdr.py13
-rw-r--r--youtube_dl/extractor/metacafe.py9
-rw-r--r--youtube_dl/extractor/metacritic.py14
-rw-r--r--youtube_dl/extractor/mgtv.py64
-rw-r--r--youtube_dl/extractor/microsoftvirtualacademy.py192
-rw-r--r--youtube_dl/extractor/minhateca.py4
-rw-r--r--youtube_dl/extractor/ministrygrid.py34
-rw-r--r--youtube_dl/extractor/minoto.py56
-rw-r--r--youtube_dl/extractor/mit.py4
-rw-r--r--youtube_dl/extractor/mitele.py55
-rw-r--r--youtube_dl/extractor/mixcloud.py267
-rw-r--r--youtube_dl/extractor/mnet.py81
-rw-r--r--youtube_dl/extractor/moevideo.py4
-rw-r--r--youtube_dl/extractor/moniker.py4
-rw-r--r--youtube_dl/extractor/mooshare.py110
-rw-r--r--youtube_dl/extractor/motherless.py99
-rw-r--r--youtube_dl/extractor/motorsport.py2
-rw-r--r--youtube_dl/extractor/movieclips.py43
-rw-r--r--youtube_dl/extractor/mtv.py21
-rw-r--r--youtube_dl/extractor/musicplayon.py59
-rw-r--r--youtube_dl/extractor/muzu.py65
-rw-r--r--youtube_dl/extractor/mwave.py28
-rw-r--r--youtube_dl/extractor/myspace.py80
-rw-r--r--youtube_dl/extractor/myspass.py2
-rw-r--r--youtube_dl/extractor/myvideo.py6
-rw-r--r--youtube_dl/extractor/myvidster.py2
-rw-r--r--youtube_dl/extractor/nationalgeographic.py89
-rw-r--r--youtube_dl/extractor/naver.py6
-rw-r--r--youtube_dl/extractor/nba.py101
-rw-r--r--youtube_dl/extractor/nbc.py222
-rw-r--r--youtube_dl/extractor/ndtv.py40
-rw-r--r--youtube_dl/extractor/nerdist.py80
-rw-r--r--youtube_dl/extractor/neteasemusic.py22
-rw-r--r--youtube_dl/extractor/newgrounds.py21
-rw-r--r--youtube_dl/extractor/newstube.py43
-rw-r--r--youtube_dl/extractor/nextmedia.py6
-rw-r--r--youtube_dl/extractor/nextmovie.py4
-rw-r--r--youtube_dl/extractor/nfb.py113
-rw-r--r--youtube_dl/extractor/nhl.py104
-rw-r--r--youtube_dl/extractor/nick.py28
-rw-r--r--youtube_dl/extractor/niconico.py8
-rw-r--r--youtube_dl/extractor/noco.py6
-rw-r--r--youtube_dl/extractor/normalboots.py20
-rw-r--r--youtube_dl/extractor/nova.py2
-rw-r--r--youtube_dl/extractor/novamov.py52
-rw-r--r--youtube_dl/extractor/nowness.py11
-rw-r--r--youtube_dl/extractor/noz.py46
-rw-r--r--youtube_dl/extractor/npr.py6
-rw-r--r--youtube_dl/extractor/nrk.py449
-rw-r--r--youtube_dl/extractor/ntvru.py2
-rw-r--r--youtube_dl/extractor/nuvid.py44
-rw-r--r--youtube_dl/extractor/nytimes.py9
-rw-r--r--youtube_dl/extractor/odnoklassniki.py40
-rw-r--r--youtube_dl/extractor/once.py42
-rw-r--r--youtube_dl/extractor/onionstudios.py21
-rw-r--r--youtube_dl/extractor/ooyala.py122
-rw-r--r--youtube_dl/extractor/openload.py130
-rw-r--r--youtube_dl/extractor/ora.py9
-rw-r--r--youtube_dl/extractor/orf.py7
-rw-r--r--youtube_dl/extractor/patreon.py2
-rw-r--r--youtube_dl/extractor/pbs.py63
-rw-r--r--youtube_dl/extractor/people.py32
-rw-r--r--youtube_dl/extractor/periscope.py53
-rw-r--r--youtube_dl/extractor/philharmoniedeparis.py2
-rw-r--r--youtube_dl/extractor/photobucket.py2
-rw-r--r--youtube_dl/extractor/planetaplay.py61
-rw-r--r--youtube_dl/extractor/played.py4
-rw-r--r--youtube_dl/extractor/playtvak.py4
-rw-r--r--youtube_dl/extractor/playwire.py39
-rw-r--r--youtube_dl/extractor/pluralsight.py21
-rw-r--r--youtube_dl/extractor/porn91.py17
-rw-r--r--youtube_dl/extractor/pornhd.py60
-rw-r--r--youtube_dl/extractor/pornhub.py80
-rw-r--r--youtube_dl/extractor/pornovoisines.py2
-rw-r--r--youtube_dl/extractor/presstv.py74
-rw-r--r--youtube_dl/extractor/primesharetv.py4
-rw-r--r--youtube_dl/extractor/promptfile.py4
-rw-r--r--youtube_dl/extractor/prosiebensat1.py10
-rw-r--r--youtube_dl/extractor/puls4.py2
-rw-r--r--youtube_dl/extractor/pyvideo.py8
-rw-r--r--youtube_dl/extractor/qqmusic.py10
-rw-r--r--youtube_dl/extractor/quickvid.py54
-rw-r--r--youtube_dl/extractor/r7.py95
-rw-r--r--youtube_dl/extractor/radiocanada.py130
-rw-r--r--youtube_dl/extractor/radiojavan.py2
-rw-r--r--youtube_dl/extractor/rai.py4
-rw-r--r--youtube_dl/extractor/redtube.py60
-rw-r--r--youtube_dl/extractor/restudy.py1
-rw-r--r--youtube_dl/extractor/reuters.py69
-rw-r--r--youtube_dl/extractor/revision3.py165
-rw-r--r--youtube_dl/extractor/rice.py116
-rw-r--r--youtube_dl/extractor/ringtv.py2
-rw-r--r--youtube_dl/extractor/rockstargames.py69
-rw-r--r--youtube_dl/extractor/rottentomatoes.py24
-rw-r--r--youtube_dl/extractor/rtbf.py62
-rw-r--r--youtube_dl/extractor/rte.py12
-rw-r--r--youtube_dl/extractor/rtlnl.py68
-rw-r--r--youtube_dl/extractor/rtve.py42
-rw-r--r--youtube_dl/extractor/rtvnh.py1
-rw-r--r--youtube_dl/extractor/ruhd.py2
-rw-r--r--youtube_dl/extractor/rutube.py6
-rw-r--r--youtube_dl/extractor/rutv.py8
-rw-r--r--youtube_dl/extractor/safari.py106
-rw-r--r--youtube_dl/extractor/sbs.py35
-rw-r--r--youtube_dl/extractor/scivee.py1
-rw-r--r--youtube_dl/extractor/screencast.py13
-rw-r--r--youtube_dl/extractor/screencastomatic.py36
-rw-r--r--youtube_dl/extractor/screenjunkies.py2
-rw-r--r--youtube_dl/extractor/screenwavemedia.py20
-rw-r--r--youtube_dl/extractor/seeker.py57
-rw-r--r--youtube_dl/extractor/senateisvp.py2
-rw-r--r--youtube_dl/extractor/sendtonews.py86
-rw-r--r--youtube_dl/extractor/sexu.py25
-rw-r--r--youtube_dl/extractor/shahid.py5
-rw-r--r--youtube_dl/extractor/shared.py6
-rw-r--r--youtube_dl/extractor/sharesix.py4
-rw-r--r--youtube_dl/extractor/sina.py124
-rw-r--r--youtube_dl/extractor/smotri.py6
-rw-r--r--youtube_dl/extractor/sohu.py4
-rw-r--r--youtube_dl/extractor/soundcloud.py11
-rw-r--r--youtube_dl/extractor/space.py38
-rw-r--r--youtube_dl/extractor/spankwire.py22
-rw-r--r--youtube_dl/extractor/sport5.py2
-rw-r--r--youtube_dl/extractor/sportbox.py33
-rw-r--r--youtube_dl/extractor/sportschau.py38
-rw-r--r--youtube_dl/extractor/ssa.py2
-rw-r--r--youtube_dl/extractor/streamcloud.py43
-rw-r--r--youtube_dl/extractor/streetvoice.py8
-rw-r--r--youtube_dl/extractor/svt.py120
-rw-r--r--youtube_dl/extractor/sztvhu.py2
-rw-r--r--youtube_dl/extractor/tagesschau.py328
-rw-r--r--youtube_dl/extractor/tdslifeway.py33
-rw-r--r--youtube_dl/extractor/teachingchannel.py3
-rw-r--r--youtube_dl/extractor/teamcoco.py4
-rw-r--r--youtube_dl/extractor/ted.py68
-rw-r--r--youtube_dl/extractor/tele13.py2
-rw-r--r--youtube_dl/extractor/telebruxelles.py14
-rw-r--r--youtube_dl/extractor/telecinco.py5
-rw-r--r--youtube_dl/extractor/telegraaf.py58
-rw-r--r--youtube_dl/extractor/telewebion.py55
-rw-r--r--youtube_dl/extractor/tenplay.py90
-rw-r--r--youtube_dl/extractor/tf1.py2
-rw-r--r--youtube_dl/extractor/theonion.py63
-rw-r--r--youtube_dl/extractor/theplatform.py148
-rw-r--r--youtube_dl/extractor/thescene.py52
-rw-r--r--youtube_dl/extractor/thesixtyone.py6
-rw-r--r--youtube_dl/extractor/thestar.py35
-rw-r--r--youtube_dl/extractor/threeqsdn.py139
-rw-r--r--youtube_dl/extractor/thvideo.py2
-rw-r--r--youtube_dl/extractor/tinypic.py2
-rw-r--r--youtube_dl/extractor/tlc.py35
-rw-r--r--youtube_dl/extractor/tnaflix.py56
-rw-r--r--youtube_dl/extractor/toypics.py2
-rw-r--r--youtube_dl/extractor/traileraddict.py2
-rw-r--r--youtube_dl/extractor/trollvids.py2
-rw-r--r--youtube_dl/extractor/tubitv.py56
-rw-r--r--youtube_dl/extractor/tudou.py41
-rw-r--r--youtube_dl/extractor/tumblr.py30
-rw-r--r--youtube_dl/extractor/tunein.py9
-rw-r--r--youtube_dl/extractor/tv2.py4
-rw-r--r--youtube_dl/extractor/tv3.py34
-rw-r--r--youtube_dl/extractor/tvc.py4
-rw-r--r--youtube_dl/extractor/tvigle.py8
-rw-r--r--youtube_dl/extractor/tvp.py106
-rw-r--r--youtube_dl/extractor/tvplay.py2
-rw-r--r--youtube_dl/extractor/twentyfourvideo.py3
-rw-r--r--youtube_dl/extractor/twentymin.py23
-rw-r--r--youtube_dl/extractor/twitch.py109
-rw-r--r--youtube_dl/extractor/twitter.py219
-rw-r--r--youtube_dl/extractor/ubu.py57
-rw-r--r--youtube_dl/extractor/udemy.py256
-rw-r--r--youtube_dl/extractor/udn.py62
-rw-r--r--youtube_dl/extractor/unistra.py3
-rw-r--r--youtube_dl/extractor/usatoday.py48
-rw-r--r--youtube_dl/extractor/ustream.py16
-rw-r--r--youtube_dl/extractor/ustudio.py125
-rw-r--r--youtube_dl/extractor/varzesh3.py48
-rw-r--r--youtube_dl/extractor/vbox7.py10
-rw-r--r--youtube_dl/extractor/veoh.py3
-rw-r--r--youtube_dl/extractor/vessel.py32
-rw-r--r--youtube_dl/extractor/vesti.py2
-rw-r--r--youtube_dl/extractor/vevo.py200
-rw-r--r--youtube_dl/extractor/vgtv.py21
-rw-r--r--youtube_dl/extractor/vice.py99
-rw-r--r--youtube_dl/extractor/viddler.py4
-rw-r--r--youtube_dl/extractor/videodetective.py9
-rw-r--r--youtube_dl/extractor/videomega.py10
-rw-r--r--youtube_dl/extractor/videomore.py1
-rw-r--r--youtube_dl/extractor/videott.py2
-rw-r--r--youtube_dl/extractor/vidio.py73
-rw-r--r--youtube_dl/extractor/vidzi.py27
-rw-r--r--youtube_dl/extractor/vier.py1
-rw-r--r--youtube_dl/extractor/viewlift.py (renamed from youtube_dl/extractor/snagfilms.py)39
-rw-r--r--youtube_dl/extractor/viewster.py124
-rw-r--r--youtube_dl/extractor/viidea.py3
-rw-r--r--youtube_dl/extractor/viki.py28
-rw-r--r--youtube_dl/extractor/vimeo.py377
-rw-r--r--youtube_dl/extractor/vine.py5
-rw-r--r--youtube_dl/extractor/vk.py109
-rw-r--r--youtube_dl/extractor/vlive.py87
-rw-r--r--youtube_dl/extractor/vodlocker.py4
-rw-r--r--youtube_dl/extractor/voicerepublic.py11
-rw-r--r--youtube_dl/extractor/voxmedia.py136
-rw-r--r--youtube_dl/extractor/vporn.py10
-rw-r--r--youtube_dl/extractor/vrt.py34
-rw-r--r--youtube_dl/extractor/vube.py2
-rw-r--r--youtube_dl/extractor/vuclip.py2
-rw-r--r--youtube_dl/extractor/vulture.py69
-rw-r--r--youtube_dl/extractor/walla.py2
-rw-r--r--youtube_dl/extractor/washingtonpost.py144
-rw-r--r--youtube_dl/extractor/wat.py137
-rw-r--r--youtube_dl/extractor/watchindianporn.py (renamed from youtube_dl/extractor/sexykarma.py)63
-rw-r--r--youtube_dl/extractor/wayofthemaster.py52
-rw-r--r--youtube_dl/extractor/wdr.py403
-rw-r--r--youtube_dl/extractor/webofstories.py64
-rw-r--r--youtube_dl/extractor/weibo.py49
-rw-r--r--youtube_dl/extractor/weiqitv.py2
-rw-r--r--youtube_dl/extractor/wimp.py37
-rw-r--r--youtube_dl/extractor/wistia.py89
-rw-r--r--youtube_dl/extractor/wrzuta.py94
-rw-r--r--youtube_dl/extractor/wsj.py95
-rw-r--r--youtube_dl/extractor/xbef.py2
-rw-r--r--youtube_dl/extractor/xboxclips.py2
-rw-r--r--youtube_dl/extractor/xfileshare.py102
-rw-r--r--youtube_dl/extractor/xhamster.py82
-rw-r--r--youtube_dl/extractor/xiami.py168
-rw-r--r--youtube_dl/extractor/xminus.py45
-rw-r--r--youtube_dl/extractor/xnxx.py22
-rw-r--r--youtube_dl/extractor/xuite.py1
-rw-r--r--youtube_dl/extractor/xvideos.py43
-rw-r--r--youtube_dl/extractor/yahoo.py88
-rw-r--r--youtube_dl/extractor/yam.py2
-rw-r--r--youtube_dl/extractor/yandexmusic.py167
-rw-r--r--youtube_dl/extractor/ynet.py6
-rw-r--r--youtube_dl/extractor/youku.py72
-rw-r--r--youtube_dl/extractor/youporn.py17
-rw-r--r--youtube_dl/extractor/youtube.py385
-rw-r--r--youtube_dl/extractor/zdf.py12
-rw-r--r--youtube_dl/jsinterp.py2
-rw-r--r--youtube_dl/options.py33
-rw-r--r--youtube_dl/postprocessor/__init__.py2
-rw-r--r--youtube_dl/postprocessor/execafterdownload.py4
-rw-r--r--youtube_dl/postprocessor/ffmpeg.py103
-rw-r--r--youtube_dl/postprocessor/xattrpp.py5
-rw-r--r--youtube_dl/socks.py271
-rw-r--r--youtube_dl/swfinterp.py18
-rw-r--r--youtube_dl/update.py9
-rw-r--r--youtube_dl/utils.py523
-rw-r--r--youtube_dl/version.py2
444 files changed, 19553 insertions, 7673 deletions
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py
index f432403..5036289 100755
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@@ -24,9 +24,6 @@ import time
import tokenize
import traceback
-if os.name == 'nt':
- import ctypes
-
from .compat import (
compat_basestring,
compat_cookiejar,
@@ -34,6 +31,7 @@ from .compat import (
compat_get_terminal_size,
compat_http_client,
compat_kwargs,
+ compat_os_name,
compat_str,
compat_tokenize_tokenize,
compat_urllib_error,
@@ -41,6 +39,8 @@ from .compat import (
compat_urllib_request_DataHandler,
)
from .utils import (
+ age_restricted,
+ args_to_str,
ContentTooShortError,
date_from_str,
DateRange,
@@ -60,13 +60,17 @@ from .utils import (
PagedList,
parse_filesize,
PerRequestProxyHandler,
- PostProcessingError,
platform_name,
+ PostProcessingError,
preferredencoding,
+ prepend_extension,
+ register_socks_protocols,
render_table,
+ replace_extension,
SameFileError,
sanitize_filename,
sanitize_path,
+ sanitize_url,
sanitized_Request,
std_headers,
subtitles_filename,
@@ -77,16 +81,13 @@ from .utils import (
write_string,
YoutubeDLCookieProcessor,
YoutubeDLHandler,
- prepend_extension,
- replace_extension,
- args_to_str,
- age_restricted,
)
from .cache import Cache
-from .extractor import get_info_extractor, gen_extractors
+from .extractor import get_info_extractor, gen_extractor_classes, _LAZY_LOADER
from .downloader import get_suitable_downloader
from .downloader.rtmp import rtmpdump_version
from .postprocessor import (
+ FFmpegFixupM3u8PP,
FFmpegFixupM4aPP,
FFmpegFixupStretchedPP,
FFmpegMergerPP,
@@ -95,6 +96,9 @@ from .postprocessor import (
)
from .version import __version__
+if compat_os_name == 'nt':
+ import ctypes
+
class YoutubeDL(object):
"""YoutubeDL class.
@@ -257,7 +261,9 @@ class YoutubeDL(object):
The following options determine which downloader is picked:
external_downloader: Executable of the external downloader to call.
None or unset for standard (built-in) downloader.
- hls_prefer_native: Use the native HLS downloader instead of ffmpeg/avconv.
+ hls_prefer_native: Use the native HLS downloader instead of ffmpeg/avconv
+ if True, otherwise use ffmpeg/avconv if False, otherwise
+ use downloader suggested by extractor if None.
The following parameters are not used by YoutubeDL itself, they are used by
the downloader (see youtube_dl/downloader/common.py):
@@ -320,7 +326,7 @@ class YoutubeDL(object):
['fribidi', '-c', 'UTF-8'] + width_args, **sp_kwargs)
self._output_channel = os.fdopen(master, 'rb')
except OSError as ose:
- if ose.errno == 2:
+ if ose.errno == errno.ENOENT:
self.report_warning('Could not find fribidi executable, ignoring --bidi-workaround . Make sure that fribidi is an executable file in one of the directories in your $PATH.')
else:
raise
@@ -356,6 +362,8 @@ class YoutubeDL(object):
for ph in self.params.get('progress_hooks', []):
self.add_progress_hook(ph)
+ register_socks_protocols()
+
def warn_if_short_id(self, argv):
# short YouTube ID starting with dash?
idxs = [
@@ -375,8 +383,9 @@ class YoutubeDL(object):
def add_info_extractor(self, ie):
"""Add an InfoExtractor object to the end of the list."""
self._ies.append(ie)
- self._ies_instances[ie.ie_key()] = ie
- ie.set_downloader(self)
+ if not isinstance(ie, type):
+ self._ies_instances[ie.ie_key()] = ie
+ ie.set_downloader(self)
def get_info_extractor(self, ie_key):
"""
@@ -394,7 +403,7 @@ class YoutubeDL(object):
"""
Add the InfoExtractors returned by gen_extractors to the end of the list
"""
- for ie in gen_extractors():
+ for ie in gen_extractor_classes():
self.add_info_extractor(ie)
def add_post_processor(self, pp):
@@ -450,7 +459,7 @@ class YoutubeDL(object):
def to_console_title(self, message):
if not self.params.get('consoletitle', False):
return
- if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
+ if compat_os_name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
# c_wchar_p() might not be necessary if `message` is
# already of type unicode()
ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
@@ -521,7 +530,7 @@ class YoutubeDL(object):
else:
if self.params.get('no_warnings'):
return
- if not self.params.get('no_color') and self._err_file.isatty() and os.name != 'nt':
+ if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt':
_msg_header = '\033[0;33mWARNING:\033[0m'
else:
_msg_header = 'WARNING:'
@@ -533,7 +542,7 @@ class YoutubeDL(object):
Do the same as trouble, but prefixes the message with 'ERROR:', colored
in red if stderr is a tty file.
'''
- if not self.params.get('no_color') and self._err_file.isatty() and os.name != 'nt':
+ if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt':
_msg_header = '\033[0;31mERROR:\033[0m'
else:
_msg_header = 'ERROR:'
@@ -566,7 +575,7 @@ class YoutubeDL(object):
elif template_dict.get('height'):
template_dict['resolution'] = '%sp' % template_dict['height']
elif template_dict.get('width'):
- template_dict['resolution'] = '?x%d' % template_dict['width']
+ template_dict['resolution'] = '%dx?' % template_dict['width']
sanitize = lambda k, v: sanitize_filename(
compat_str(v),
@@ -574,7 +583,7 @@ class YoutubeDL(object):
is_id=(k == 'id'))
template_dict = dict((k, sanitize(k, v))
for k, v in template_dict.items()
- if v is not None)
+ if v is not None and not isinstance(v, (list, tuple, dict)))
template_dict = collections.defaultdict(lambda: 'NA', template_dict)
outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL)
@@ -658,6 +667,7 @@ class YoutubeDL(object):
if not ie.suitable(url):
continue
+ ie = self.get_info_extractor(ie.ie_key())
if not ie.working():
self.report_warning('The program functionality for this site has been marked as broken, '
'and will probably not work.')
@@ -710,6 +720,7 @@ class YoutubeDL(object):
result_type = ie_result.get('_type', 'video')
if result_type in ('url', 'url_transparent'):
+ ie_result['url'] = sanitize_url(ie_result['url'])
extract_flat = self.params.get('extract_flat', False)
if ((extract_flat == 'in_playlist' and 'playlist' in extra_info) or
extract_flat is True):
@@ -903,7 +914,7 @@ class YoutubeDL(object):
'*=': lambda attr, value: value in attr,
}
str_operator_rex = re.compile(r'''(?x)
- \s*(?P<key>ext|acodec|vcodec|container|protocol)
+ \s*(?P<key>ext|acodec|vcodec|container|protocol|format_id)
\s*(?P<op>%s)(?P<none_inclusive>\s*\?)?
\s*(?P<value>[a-zA-Z0-9._-]+)
\s*$
@@ -1212,6 +1223,10 @@ class YoutubeDL(object):
if 'title' not in info_dict:
raise ExtractorError('Missing "title" field in extractor result')
+ if not isinstance(info_dict['id'], compat_str):
+ self.report_warning('"id" field is not a string - forcing string conversion')
+ info_dict['id'] = compat_str(info_dict['id'])
+
if 'playlist' not in info_dict:
# It isn't part of a playlist
info_dict['playlist'] = None
@@ -1227,12 +1242,20 @@ class YoutubeDL(object):
t.get('preference'), t.get('width'), t.get('height'),
t.get('id'), t.get('url')))
for i, t in enumerate(thumbnails):
+ t['url'] = sanitize_url(t['url'])
if t.get('width') and t.get('height'):
t['resolution'] = '%dx%d' % (t['width'], t['height'])
if t.get('id') is None:
t['id'] = '%d' % i
- if thumbnails and 'thumbnail' not in info_dict:
+ if self.params.get('list_thumbnails'):
+ self.list_thumbnails(info_dict)
+ return
+
+ thumbnail = info_dict.get('thumbnail')
+ if thumbnail:
+ info_dict['thumbnail'] = sanitize_url(thumbnail)
+ elif thumbnails:
info_dict['thumbnail'] = thumbnails[-1]['url']
if 'display_id' not in info_dict and 'id' in info_dict:
@@ -1257,6 +1280,8 @@ class YoutubeDL(object):
if subtitles:
for _, subtitle in subtitles.items():
for subtitle_format in subtitle:
+ if subtitle_format.get('url'):
+ subtitle_format['url'] = sanitize_url(subtitle_format['url'])
if 'ext' not in subtitle_format:
subtitle_format['ext'] = determine_ext(subtitle_format['url']).lower()
@@ -1286,6 +1311,8 @@ class YoutubeDL(object):
if 'url' not in format:
raise ExtractorError('Missing "url" key in result (index %d)' % i)
+ format['url'] = sanitize_url(format['url'])
+
if format.get('format_id') is None:
format['format_id'] = compat_str(i)
else:
@@ -1333,9 +1360,6 @@ class YoutubeDL(object):
if self.params.get('listformats'):
self.list_formats(info_dict)
return
- if self.params.get('list_thumbnails'):
- self.list_thumbnails(info_dict)
- return
req_format = self.params.get('format')
if req_format is None:
@@ -1623,7 +1647,7 @@ class YoutubeDL(object):
# Just a single file
success = dl(filename, info_dict)
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- self.report_error('unable to download video data: %s' % str(err))
+ self.report_error('unable to download video data: %s' % error_to_compat_str(err))
return
except (OSError, IOError) as err:
raise UnavailableVideoError(err)
@@ -1631,12 +1655,14 @@ class YoutubeDL(object):
self.report_error('content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
return
- if success:
+ if success and filename != '-':
# Fixup content
fixup_policy = self.params.get('fixup')
if fixup_policy is None:
fixup_policy = 'detect_or_warn'
+ INSTALL_FFMPEG_MESSAGE = 'Install ffmpeg or avconv to fix this automatically.'
+
stretched_ratio = info_dict.get('stretched_ratio')
if stretched_ratio is not None and stretched_ratio != 1:
if fixup_policy == 'warn':
@@ -1649,15 +1675,18 @@ class YoutubeDL(object):
info_dict['__postprocessors'].append(stretched_pp)
else:
self.report_warning(
- '%s: Non-uniform pixel ratio (%s). Install ffmpeg or avconv to fix this automatically.' % (
- info_dict['id'], stretched_ratio))
+ '%s: Non-uniform pixel ratio (%s). %s'
+ % (info_dict['id'], stretched_ratio, INSTALL_FFMPEG_MESSAGE))
else:
assert fixup_policy in ('ignore', 'never')
- if info_dict.get('requested_formats') is None and info_dict.get('container') == 'm4a_dash':
+ if (info_dict.get('requested_formats') is None and
+ info_dict.get('container') == 'm4a_dash'):
if fixup_policy == 'warn':
- self.report_warning('%s: writing DASH m4a. Only some players support this container.' % (
- info_dict['id']))
+ self.report_warning(
+ '%s: writing DASH m4a. '
+ 'Only some players support this container.'
+ % info_dict['id'])
elif fixup_policy == 'detect_or_warn':
fixup_pp = FFmpegFixupM4aPP(self)
if fixup_pp.available:
@@ -1665,8 +1694,27 @@ class YoutubeDL(object):
info_dict['__postprocessors'].append(fixup_pp)
else:
self.report_warning(
- '%s: writing DASH m4a. Only some players support this container. Install ffmpeg or avconv to fix this automatically.' % (
- info_dict['id']))
+ '%s: writing DASH m4a. '
+ 'Only some players support this container. %s'
+ % (info_dict['id'], INSTALL_FFMPEG_MESSAGE))
+ else:
+ assert fixup_policy in ('ignore', 'never')
+
+ if (info_dict.get('protocol') == 'm3u8_native' or
+ info_dict.get('protocol') == 'm3u8' and
+ self.params.get('hls_prefer_native')):
+ if fixup_policy == 'warn':
+ self.report_warning('%s: malformated aac bitstream.' % (
+ info_dict['id']))
+ elif fixup_policy == 'detect_or_warn':
+ fixup_pp = FFmpegFixupM3u8PP(self)
+ if fixup_pp.available:
+ info_dict.setdefault('__postprocessors', [])
+ info_dict['__postprocessors'].append(fixup_pp)
+ else:
+ self.report_warning(
+ '%s: malformated aac bitstream. %s'
+ % (info_dict['id'], INSTALL_FFMPEG_MESSAGE))
else:
assert fixup_policy in ('ignore', 'never')
@@ -1809,7 +1857,7 @@ class YoutubeDL(object):
if fdict.get('language'):
if res:
res += ' '
- res += '[%s]' % fdict['language']
+ res += '[%s] ' % fdict['language']
if fdict.get('format_note') is not None:
res += fdict['format_note'] + ' '
if fdict.get('tbr') is not None:
@@ -1830,7 +1878,9 @@ class YoutubeDL(object):
if fdict.get('vbr') is not None:
res += '%4dk' % fdict['vbr']
if fdict.get('fps') is not None:
- res += ', %sfps' % fdict['fps']
+ if res:
+ res += ', '
+ res += '%sfps' % fdict['fps']
if fdict.get('acodec') is not None:
if res:
res += ', '
@@ -1873,13 +1923,8 @@ class YoutubeDL(object):
def list_thumbnails(self, info_dict):
thumbnails = info_dict.get('thumbnails')
if not thumbnails:
- tn_url = info_dict.get('thumbnail')
- if tn_url:
- thumbnails = [{'id': '0', 'url': tn_url}]
- else:
- self.to_screen(
- '[info] No thumbnails present for %s' % info_dict['id'])
- return
+ self.to_screen('[info] No thumbnails present for %s' % info_dict['id'])
+ return
self.to_screen(
'[info] Thumbnails for %s:' % info_dict['id'])
@@ -1924,6 +1969,8 @@ class YoutubeDL(object):
write_string(encoding_str, encoding=None)
self._write_string('[debug] youtube-dl version ' + __version__ + '\n')
+ if _LAZY_LOADER:
+ self._write_string('[debug] Lazy loading extractors enabled' + '\n')
try:
sp = subprocess.Popen(
['git', 'rev-parse', '--short', 'HEAD'],
@@ -1979,6 +2026,7 @@ class YoutubeDL(object):
if opts_cookiefile is None:
self.cookiejar = compat_cookiejar.CookieJar()
else:
+ opts_cookiefile = compat_expanduser(opts_cookiefile)
self.cookiejar = compat_cookiejar.MozillaCookieJar(
opts_cookiefile)
if os.access(opts_cookiefile, os.R_OK):
diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py
index f5f0642..4905674 100644
--- a/youtube_dl/__init__.py
+++ b/youtube_dl/__init__.py
@@ -18,7 +18,6 @@ from .options import (
from .compat import (
compat_expanduser,
compat_getpass,
- compat_print,
compat_shlex_split,
workaround_optparse_bug9161,
)
@@ -67,16 +66,16 @@ def _real_main(argv=None):
# Custom HTTP headers
if opts.headers is not None:
for h in opts.headers:
- if h.find(':', 1) < 0:
+ if ':' not in h:
parser.error('wrong header formatting, it should be key:value, not "%s"' % h)
- key, value = h.split(':', 2)
+ key, value = h.split(':', 1)
if opts.verbose:
write_string('[debug] Adding header from command line option %s:%s\n' % (key, value))
std_headers[key] = value
# Dump user agent
if opts.dump_user_agent:
- compat_print(std_headers['User-Agent'])
+ write_string(std_headers['User-Agent'] + '\n', out=sys.stdout)
sys.exit(0)
# Batch file verification
@@ -86,7 +85,9 @@ def _real_main(argv=None):
if opts.batchfile == '-':
batchfd = sys.stdin
else:
- batchfd = io.open(opts.batchfile, 'r', encoding='utf-8', errors='ignore')
+ batchfd = io.open(
+ compat_expanduser(opts.batchfile),
+ 'r', encoding='utf-8', errors='ignore')
batch_urls = read_batch_urls(batchfd)
if opts.verbose:
write_string('[debug] Batch file urls: ' + repr(batch_urls) + '\n')
@@ -99,10 +100,10 @@ def _real_main(argv=None):
if opts.list_extractors:
for ie in list_extractors(opts.age_limit):
- compat_print(ie.IE_NAME + (' (CURRENTLY BROKEN)' if not ie._WORKING else ''))
+ write_string(ie.IE_NAME + (' (CURRENTLY BROKEN)' if not ie._WORKING else '') + '\n', out=sys.stdout)
matchedUrls = [url for url in all_urls if ie.suitable(url)]
for mu in matchedUrls:
- compat_print(' ' + mu)
+ write_string(' ' + mu + '\n', out=sys.stdout)
sys.exit(0)
if opts.list_extractor_descriptions:
for ie in list_extractors(opts.age_limit):
@@ -115,7 +116,7 @@ def _real_main(argv=None):
_SEARCHES = ('cute kittens', 'slithering pythons', 'falling cat', 'angry poodle', 'purple fish', 'running tortoise', 'sleeping bunny', 'burping cow')
_COUNTS = ('', '5', '10', 'all')
desc += ' (Example: "%s%s:%s" )' % (ie.SEARCH_KEY, random.choice(_COUNTS), random.choice(_SEARCHES))
- compat_print(desc)
+ write_string(desc + '\n', out=sys.stdout)
sys.exit(0)
# Conflicting, missing and erroneous options
@@ -144,14 +145,20 @@ def _real_main(argv=None):
if numeric_limit is None:
parser.error('invalid max_filesize specified')
opts.max_filesize = numeric_limit
- if opts.retries is not None:
- if opts.retries in ('inf', 'infinite'):
- opts_retries = float('inf')
+
+ def parse_retries(retries):
+ if retries in ('inf', 'infinite'):
+ parsed_retries = float('inf')
else:
try:
- opts_retries = int(opts.retries)
+ parsed_retries = int(retries)
except (TypeError, ValueError):
parser.error('invalid retry count specified')
+ return parsed_retries
+ if opts.retries is not None:
+ opts.retries = parse_retries(opts.retries)
+ if opts.fragment_retries is not None:
+ opts.fragment_retries = parse_retries(opts.fragment_retries)
if opts.buffersize is not None:
numeric_buffersize = FileDownloader.parse_bytes(opts.buffersize)
if numeric_buffersize is None:
@@ -299,7 +306,8 @@ def _real_main(argv=None):
'force_generic_extractor': opts.force_generic_extractor,
'ratelimit': opts.ratelimit,
'nooverwrites': opts.nooverwrites,
- 'retries': opts_retries,
+ 'retries': opts.retries,
+ 'fragment_retries': opts.fragment_retries,
'buffersize': opts.buffersize,
'noresizebuffer': opts.noresizebuffer,
'continuedl': opts.continue_dl,
@@ -355,6 +363,7 @@ def _real_main(argv=None):
'youtube_include_dash_manifest': opts.youtube_include_dash_manifest,
'encoding': opts.encoding,
'extract_flat': opts.extract_flat,
+ 'mark_watched': opts.mark_watched,
'merge_output_format': opts.merge_output_format,
'postprocessors': postprocessors,
'fixup': opts.fixup,
@@ -396,7 +405,7 @@ def _real_main(argv=None):
try:
if opts.load_info_filename is not None:
- retcode = ydl.download_with_info_file(opts.load_info_filename)
+ retcode = ydl.download_with_info_file(compat_expanduser(opts.load_info_filename))
else:
retcode = ydl.download(all_urls)
except MaxDownloadsReached:
diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py
index b497da6..67db1c7 100644
--- a/youtube_dl/compat.py
+++ b/youtube_dl/compat.py
@@ -11,6 +11,7 @@ import re
import shlex
import shutil
import socket
+import struct
import subprocess
import sys
import itertools
@@ -62,6 +63,2244 @@ try:
except ImportError: # Python 2
import htmlentitydefs as compat_html_entities
+try: # Python >= 3.3
+ compat_html_entities_html5 = compat_html_entities.html5
+except AttributeError:
+ # Copied from CPython 3.5.1 html/entities.py
+ compat_html_entities_html5 = {
+ 'Aacute': '\xc1',
+ 'aacute': '\xe1',
+ 'Aacute;': '\xc1',
+ 'aacute;': '\xe1',
+ 'Abreve;': '\u0102',
+ 'abreve;': '\u0103',
+ 'ac;': '\u223e',
+ 'acd;': '\u223f',
+ 'acE;': '\u223e\u0333',
+ 'Acirc': '\xc2',
+ 'acirc': '\xe2',
+ 'Acirc;': '\xc2',
+ 'acirc;': '\xe2',
+ 'acute': '\xb4',
+ 'acute;': '\xb4',
+ 'Acy;': '\u0410',
+ 'acy;': '\u0430',
+ 'AElig': '\xc6',
+ 'aelig': '\xe6',
+ 'AElig;': '\xc6',
+ 'aelig;': '\xe6',
+ 'af;': '\u2061',
+ 'Afr;': '\U0001d504',
+ 'afr;': '\U0001d51e',
+ 'Agrave': '\xc0',
+ 'agrave': '\xe0',
+ 'Agrave;': '\xc0',
+ 'agrave;': '\xe0',
+ 'alefsym;': '\u2135',
+ 'aleph;': '\u2135',
+ 'Alpha;': '\u0391',
+ 'alpha;': '\u03b1',
+ 'Amacr;': '\u0100',
+ 'amacr;': '\u0101',
+ 'amalg;': '\u2a3f',
+ 'AMP': '&',
+ 'amp': '&',
+ 'AMP;': '&',
+ 'amp;': '&',
+ 'And;': '\u2a53',
+ 'and;': '\u2227',
+ 'andand;': '\u2a55',
+ 'andd;': '\u2a5c',
+ 'andslope;': '\u2a58',
+ 'andv;': '\u2a5a',
+ 'ang;': '\u2220',
+ 'ange;': '\u29a4',
+ 'angle;': '\u2220',
+ 'angmsd;': '\u2221',
+ 'angmsdaa;': '\u29a8',
+ 'angmsdab;': '\u29a9',
+ 'angmsdac;': '\u29aa',
+ 'angmsdad;': '\u29ab',
+ 'angmsdae;': '\u29ac',
+ 'angmsdaf;': '\u29ad',
+ 'angmsdag;': '\u29ae',
+ 'angmsdah;': '\u29af',
+ 'angrt;': '\u221f',
+ 'angrtvb;': '\u22be',
+ 'angrtvbd;': '\u299d',
+ 'angsph;': '\u2222',
+ 'angst;': '\xc5',
+ 'angzarr;': '\u237c',
+ 'Aogon;': '\u0104',
+ 'aogon;': '\u0105',
+ 'Aopf;': '\U0001d538',
+ 'aopf;': '\U0001d552',
+ 'ap;': '\u2248',
+ 'apacir;': '\u2a6f',
+ 'apE;': '\u2a70',
+ 'ape;': '\u224a',
+ 'apid;': '\u224b',
+ 'apos;': "'",
+ 'ApplyFunction;': '\u2061',
+ 'approx;': '\u2248',
+ 'approxeq;': '\u224a',
+ 'Aring': '\xc5',
+ 'aring': '\xe5',
+ 'Aring;': '\xc5',
+ 'aring;': '\xe5',
+ 'Ascr;': '\U0001d49c',
+ 'ascr;': '\U0001d4b6',
+ 'Assign;': '\u2254',
+ 'ast;': '*',
+ 'asymp;': '\u2248',
+ 'asympeq;': '\u224d',
+ 'Atilde': '\xc3',
+ 'atilde': '\xe3',
+ 'Atilde;': '\xc3',
+ 'atilde;': '\xe3',
+ 'Auml': '\xc4',
+ 'auml': '\xe4',
+ 'Auml;': '\xc4',
+ 'auml;': '\xe4',
+ 'awconint;': '\u2233',
+ 'awint;': '\u2a11',
+ 'backcong;': '\u224c',
+ 'backepsilon;': '\u03f6',
+ 'backprime;': '\u2035',
+ 'backsim;': '\u223d',
+ 'backsimeq;': '\u22cd',
+ 'Backslash;': '\u2216',
+ 'Barv;': '\u2ae7',
+ 'barvee;': '\u22bd',
+ 'Barwed;': '\u2306',
+ 'barwed;': '\u2305',
+ 'barwedge;': '\u2305',
+ 'bbrk;': '\u23b5',
+ 'bbrktbrk;': '\u23b6',
+ 'bcong;': '\u224c',
+ 'Bcy;': '\u0411',
+ 'bcy;': '\u0431',
+ 'bdquo;': '\u201e',
+ 'becaus;': '\u2235',
+ 'Because;': '\u2235',
+ 'because;': '\u2235',
+ 'bemptyv;': '\u29b0',
+ 'bepsi;': '\u03f6',
+ 'bernou;': '\u212c',
+ 'Bernoullis;': '\u212c',
+ 'Beta;': '\u0392',
+ 'beta;': '\u03b2',
+ 'beth;': '\u2136',
+ 'between;': '\u226c',
+ 'Bfr;': '\U0001d505',
+ 'bfr;': '\U0001d51f',
+ 'bigcap;': '\u22c2',
+ 'bigcirc;': '\u25ef',
+ 'bigcup;': '\u22c3',
+ 'bigodot;': '\u2a00',
+ 'bigoplus;': '\u2a01',
+ 'bigotimes;': '\u2a02',
+ 'bigsqcup;': '\u2a06',
+ 'bigstar;': '\u2605',
+ 'bigtriangledown;': '\u25bd',
+ 'bigtriangleup;': '\u25b3',
+ 'biguplus;': '\u2a04',
+ 'bigvee;': '\u22c1',
+ 'bigwedge;': '\u22c0',
+ 'bkarow;': '\u290d',
+ 'blacklozenge;': '\u29eb',
+ 'blacksquare;': '\u25aa',
+ 'blacktriangle;': '\u25b4',
+ 'blacktriangledown;': '\u25be',
+ 'blacktriangleleft;': '\u25c2',
+ 'blacktriangleright;': '\u25b8',
+ 'blank;': '\u2423',
+ 'blk12;': '\u2592',
+ 'blk14;': '\u2591',
+ 'blk34;': '\u2593',
+ 'block;': '\u2588',
+ 'bne;': '=\u20e5',
+ 'bnequiv;': '\u2261\u20e5',
+ 'bNot;': '\u2aed',
+ 'bnot;': '\u2310',
+ 'Bopf;': '\U0001d539',
+ 'bopf;': '\U0001d553',
+ 'bot;': '\u22a5',
+ 'bottom;': '\u22a5',
+ 'bowtie;': '\u22c8',
+ 'boxbox;': '\u29c9',
+ 'boxDL;': '\u2557',
+ 'boxDl;': '\u2556',
+ 'boxdL;': '\u2555',
+ 'boxdl;': '\u2510',
+ 'boxDR;': '\u2554',
+ 'boxDr;': '\u2553',
+ 'boxdR;': '\u2552',
+ 'boxdr;': '\u250c',
+ 'boxH;': '\u2550',
+ 'boxh;': '\u2500',
+ 'boxHD;': '\u2566',
+ 'boxHd;': '\u2564',
+ 'boxhD;': '\u2565',
+ 'boxhd;': '\u252c',
+ 'boxHU;': '\u2569',
+ 'boxHu;': '\u2567',
+ 'boxhU;': '\u2568',
+ 'boxhu;': '\u2534',
+ 'boxminus;': '\u229f',
+ 'boxplus;': '\u229e',
+ 'boxtimes;': '\u22a0',
+ 'boxUL;': '\u255d',
+ 'boxUl;': '\u255c',
+ 'boxuL;': '\u255b',
+ 'boxul;': '\u2518',
+ 'boxUR;': '\u255a',
+ 'boxUr;': '\u2559',
+ 'boxuR;': '\u2558',
+ 'boxur;': '\u2514',
+ 'boxV;': '\u2551',
+ 'boxv;': '\u2502',
+ 'boxVH;': '\u256c',
+ 'boxVh;': '\u256b',
+ 'boxvH;': '\u256a',
+ 'boxvh;': '\u253c',
+ 'boxVL;': '\u2563',
+ 'boxVl;': '\u2562',
+ 'boxvL;': '\u2561',
+ 'boxvl;': '\u2524',
+ 'boxVR;': '\u2560',
+ 'boxVr;': '\u255f',
+ 'boxvR;': '\u255e',
+ 'boxvr;': '\u251c',
+ 'bprime;': '\u2035',
+ 'Breve;': '\u02d8',
+ 'breve;': '\u02d8',
+ 'brvbar': '\xa6',
+ 'brvbar;': '\xa6',
+ 'Bscr;': '\u212c',
+ 'bscr;': '\U0001d4b7',
+ 'bsemi;': '\u204f',
+ 'bsim;': '\u223d',
+ 'bsime;': '\u22cd',
+ 'bsol;': '\\',
+ 'bsolb;': '\u29c5',
+ 'bsolhsub;': '\u27c8',
+ 'bull;': '\u2022',
+ 'bullet;': '\u2022',
+ 'bump;': '\u224e',
+ 'bumpE;': '\u2aae',
+ 'bumpe;': '\u224f',
+ 'Bumpeq;': '\u224e',
+ 'bumpeq;': '\u224f',
+ 'Cacute;': '\u0106',
+ 'cacute;': '\u0107',
+ 'Cap;': '\u22d2',
+ 'cap;': '\u2229',
+ 'capand;': '\u2a44',
+ 'capbrcup;': '\u2a49',
+ 'capcap;': '\u2a4b',
+ 'capcup;': '\u2a47',
+ 'capdot;': '\u2a40',
+ 'CapitalDifferentialD;': '\u2145',
+ 'caps;': '\u2229\ufe00',
+ 'caret;': '\u2041',
+ 'caron;': '\u02c7',
+ 'Cayleys;': '\u212d',
+ 'ccaps;': '\u2a4d',
+ 'Ccaron;': '\u010c',
+ 'ccaron;': '\u010d',
+ 'Ccedil': '\xc7',
+ 'ccedil': '\xe7',
+ 'Ccedil;': '\xc7',
+ 'ccedil;': '\xe7',
+ 'Ccirc;': '\u0108',
+ 'ccirc;': '\u0109',
+ 'Cconint;': '\u2230',
+ 'ccups;': '\u2a4c',
+ 'ccupssm;': '\u2a50',
+ 'Cdot;': '\u010a',
+ 'cdot;': '\u010b',
+ 'cedil': '\xb8',
+ 'cedil;': '\xb8',
+ 'Cedilla;': '\xb8',
+ 'cemptyv;': '\u29b2',
+ 'cent': '\xa2',
+ 'cent;': '\xa2',
+ 'CenterDot;': '\xb7',
+ 'centerdot;': '\xb7',
+ 'Cfr;': '\u212d',
+ 'cfr;': '\U0001d520',
+ 'CHcy;': '\u0427',
+ 'chcy;': '\u0447',
+ 'check;': '\u2713',
+ 'checkmark;': '\u2713',
+ 'Chi;': '\u03a7',
+ 'chi;': '\u03c7',
+ 'cir;': '\u25cb',
+ 'circ;': '\u02c6',
+ 'circeq;': '\u2257',
+ 'circlearrowleft;': '\u21ba',
+ 'circlearrowright;': '\u21bb',
+ 'circledast;': '\u229b',
+ 'circledcirc;': '\u229a',
+ 'circleddash;': '\u229d',
+ 'CircleDot;': '\u2299',
+ 'circledR;': '\xae',
+ 'circledS;': '\u24c8',
+ 'CircleMinus;': '\u2296',
+ 'CirclePlus;': '\u2295',
+ 'CircleTimes;': '\u2297',
+ 'cirE;': '\u29c3',
+ 'cire;': '\u2257',
+ 'cirfnint;': '\u2a10',
+ 'cirmid;': '\u2aef',
+ 'cirscir;': '\u29c2',
+ 'ClockwiseContourIntegral;': '\u2232',
+ 'CloseCurlyDoubleQuote;': '\u201d',
+ 'CloseCurlyQuote;': '\u2019',
+ 'clubs;': '\u2663',
+ 'clubsuit;': '\u2663',
+ 'Colon;': '\u2237',
+ 'colon;': ':',
+ 'Colone;': '\u2a74',
+ 'colone;': '\u2254',
+ 'coloneq;': '\u2254',
+ 'comma;': ',',
+ 'commat;': '@',
+ 'comp;': '\u2201',
+ 'compfn;': '\u2218',
+ 'complement;': '\u2201',
+ 'complexes;': '\u2102',
+ 'cong;': '\u2245',
+ 'congdot;': '\u2a6d',
+ 'Congruent;': '\u2261',
+ 'Conint;': '\u222f',
+ 'conint;': '\u222e',
+ 'ContourIntegral;': '\u222e',
+ 'Copf;': '\u2102',
+ 'copf;': '\U0001d554',
+ 'coprod;': '\u2210',
+ 'Coproduct;': '\u2210',
+ 'COPY': '\xa9',
+ 'copy': '\xa9',
+ 'COPY;': '\xa9',
+ 'copy;': '\xa9',
+ 'copysr;': '\u2117',
+ 'CounterClockwiseContourIntegral;': '\u2233',
+ 'crarr;': '\u21b5',
+ 'Cross;': '\u2a2f',
+ 'cross;': '\u2717',
+ 'Cscr;': '\U0001d49e',
+ 'cscr;': '\U0001d4b8',
+ 'csub;': '\u2acf',
+ 'csube;': '\u2ad1',
+ 'csup;': '\u2ad0',
+ 'csupe;': '\u2ad2',
+ 'ctdot;': '\u22ef',
+ 'cudarrl;': '\u2938',
+ 'cudarrr;': '\u2935',
+ 'cuepr;': '\u22de',
+ 'cuesc;': '\u22df',
+ 'cularr;': '\u21b6',
+ 'cularrp;': '\u293d',
+ 'Cup;': '\u22d3',
+ 'cup;': '\u222a',
+ 'cupbrcap;': '\u2a48',
+ 'CupCap;': '\u224d',
+ 'cupcap;': '\u2a46',
+ 'cupcup;': '\u2a4a',
+ 'cupdot;': '\u228d',
+ 'cupor;': '\u2a45',
+ 'cups;': '\u222a\ufe00',
+ 'curarr;': '\u21b7',
+ 'curarrm;': '\u293c',
+ 'curlyeqprec;': '\u22de',
+ 'curlyeqsucc;': '\u22df',
+ 'curlyvee;': '\u22ce',
+ 'curlywedge;': '\u22cf',
+ 'curren': '\xa4',
+ 'curren;': '\xa4',
+ 'curvearrowleft;': '\u21b6',
+ 'curvearrowright;': '\u21b7',
+ 'cuvee;': '\u22ce',
+ 'cuwed;': '\u22cf',
+ 'cwconint;': '\u2232',
+ 'cwint;': '\u2231',
+ 'cylcty;': '\u232d',
+ 'Dagger;': '\u2021',
+ 'dagger;': '\u2020',
+ 'daleth;': '\u2138',
+ 'Darr;': '\u21a1',
+ 'dArr;': '\u21d3',
+ 'darr;': '\u2193',
+ 'dash;': '\u2010',
+ 'Dashv;': '\u2ae4',
+ 'dashv;': '\u22a3',
+ 'dbkarow;': '\u290f',
+ 'dblac;': '\u02dd',
+ 'Dcaron;': '\u010e',
+ 'dcaron;': '\u010f',
+ 'Dcy;': '\u0414',
+ 'dcy;': '\u0434',
+ 'DD;': '\u2145',
+ 'dd;': '\u2146',
+ 'ddagger;': '\u2021',
+ 'ddarr;': '\u21ca',
+ 'DDotrahd;': '\u2911',
+ 'ddotseq;': '\u2a77',
+ 'deg': '\xb0',
+ 'deg;': '\xb0',
+ 'Del;': '\u2207',
+ 'Delta;': '\u0394',
+ 'delta;': '\u03b4',
+ 'demptyv;': '\u29b1',
+ 'dfisht;': '\u297f',
+ 'Dfr;': '\U0001d507',
+ 'dfr;': '\U0001d521',
+ 'dHar;': '\u2965',
+ 'dharl;': '\u21c3',
+ 'dharr;': '\u21c2',
+ 'DiacriticalAcute;': '\xb4',
+ 'DiacriticalDot;': '\u02d9',
+ 'DiacriticalDoubleAcute;': '\u02dd',
+ 'DiacriticalGrave;': '`',
+ 'DiacriticalTilde;': '\u02dc',
+ 'diam;': '\u22c4',
+ 'Diamond;': '\u22c4',
+ 'diamond;': '\u22c4',
+ 'diamondsuit;': '\u2666',
+ 'diams;': '\u2666',
+ 'die;': '\xa8',
+ 'DifferentialD;': '\u2146',
+ 'digamma;': '\u03dd',
+ 'disin;': '\u22f2',
+ 'div;': '\xf7',
+ 'divide': '\xf7',
+ 'divide;': '\xf7',
+ 'divideontimes;': '\u22c7',
+ 'divonx;': '\u22c7',
+ 'DJcy;': '\u0402',
+ 'djcy;': '\u0452',
+ 'dlcorn;': '\u231e',
+ 'dlcrop;': '\u230d',
+ 'dollar;': '$',
+ 'Dopf;': '\U0001d53b',
+ 'dopf;': '\U0001d555',
+ 'Dot;': '\xa8',
+ 'dot;': '\u02d9',
+ 'DotDot;': '\u20dc',
+ 'doteq;': '\u2250',
+ 'doteqdot;': '\u2251',
+ 'DotEqual;': '\u2250',
+ 'dotminus;': '\u2238',
+ 'dotplus;': '\u2214',
+ 'dotsquare;': '\u22a1',
+ 'doublebarwedge;': '\u2306',
+ 'DoubleContourIntegral;': '\u222f',
+ 'DoubleDot;': '\xa8',
+ 'DoubleDownArrow;': '\u21d3',
+ 'DoubleLeftArrow;': '\u21d0',
+ 'DoubleLeftRightArrow;': '\u21d4',
+ 'DoubleLeftTee;': '\u2ae4',
+ 'DoubleLongLeftArrow;': '\u27f8',
+ 'DoubleLongLeftRightArrow;': '\u27fa',
+ 'DoubleLongRightArrow;': '\u27f9',
+ 'DoubleRightArrow;': '\u21d2',
+ 'DoubleRightTee;': '\u22a8',
+ 'DoubleUpArrow;': '\u21d1',
+ 'DoubleUpDownArrow;': '\u21d5',
+ 'DoubleVerticalBar;': '\u2225',
+ 'DownArrow;': '\u2193',
+ 'Downarrow;': '\u21d3',
+ 'downarrow;': '\u2193',
+ 'DownArrowBar;': '\u2913',
+ 'DownArrowUpArrow;': '\u21f5',
+ 'DownBreve;': '\u0311',
+ 'downdownarrows;': '\u21ca',
+ 'downharpoonleft;': '\u21c3',
+ 'downharpoonright;': '\u21c2',
+ 'DownLeftRightVector;': '\u2950',
+ 'DownLeftTeeVector;': '\u295e',
+ 'DownLeftVector;': '\u21bd',
+ 'DownLeftVectorBar;': '\u2956',
+ 'DownRightTeeVector;': '\u295f',
+ 'DownRightVector;': '\u21c1',
+ 'DownRightVectorBar;': '\u2957',
+ 'DownTee;': '\u22a4',
+ 'DownTeeArrow;': '\u21a7',
+ 'drbkarow;': '\u2910',
+ 'drcorn;': '\u231f',
+ 'drcrop;': '\u230c',
+ 'Dscr;': '\U0001d49f',
+ 'dscr;': '\U0001d4b9',
+ 'DScy;': '\u0405',
+ 'dscy;': '\u0455',
+ 'dsol;': '\u29f6',
+ 'Dstrok;': '\u0110',
+ 'dstrok;': '\u0111',
+ 'dtdot;': '\u22f1',
+ 'dtri;': '\u25bf',
+ 'dtrif;': '\u25be',
+ 'duarr;': '\u21f5',
+ 'duhar;': '\u296f',
+ 'dwangle;': '\u29a6',
+ 'DZcy;': '\u040f',
+ 'dzcy;': '\u045f',
+ 'dzigrarr;': '\u27ff',
+ 'Eacute': '\xc9',
+ 'eacute': '\xe9',
+ 'Eacute;': '\xc9',
+ 'eacute;': '\xe9',
+ 'easter;': '\u2a6e',
+ 'Ecaron;': '\u011a',
+ 'ecaron;': '\u011b',
+ 'ecir;': '\u2256',
+ 'Ecirc': '\xca',
+ 'ecirc': '\xea',
+ 'Ecirc;': '\xca',
+ 'ecirc;': '\xea',
+ 'ecolon;': '\u2255',
+ 'Ecy;': '\u042d',
+ 'ecy;': '\u044d',
+ 'eDDot;': '\u2a77',
+ 'Edot;': '\u0116',
+ 'eDot;': '\u2251',
+ 'edot;': '\u0117',
+ 'ee;': '\u2147',
+ 'efDot;': '\u2252',
+ 'Efr;': '\U0001d508',
+ 'efr;': '\U0001d522',
+ 'eg;': '\u2a9a',
+ 'Egrave': '\xc8',
+ 'egrave': '\xe8',
+ 'Egrave;': '\xc8',
+ 'egrave;': '\xe8',
+ 'egs;': '\u2a96',
+ 'egsdot;': '\u2a98',
+ 'el;': '\u2a99',
+ 'Element;': '\u2208',
+ 'elinters;': '\u23e7',
+ 'ell;': '\u2113',
+ 'els;': '\u2a95',
+ 'elsdot;': '\u2a97',
+ 'Emacr;': '\u0112',
+ 'emacr;': '\u0113',
+ 'empty;': '\u2205',
+ 'emptyset;': '\u2205',
+ 'EmptySmallSquare;': '\u25fb',
+ 'emptyv;': '\u2205',
+ 'EmptyVerySmallSquare;': '\u25ab',
+ 'emsp13;': '\u2004',
+ 'emsp14;': '\u2005',
+ 'emsp;': '\u2003',
+ 'ENG;': '\u014a',
+ 'eng;': '\u014b',
+ 'ensp;': '\u2002',
+ 'Eogon;': '\u0118',
+ 'eogon;': '\u0119',
+ 'Eopf;': '\U0001d53c',
+ 'eopf;': '\U0001d556',
+ 'epar;': '\u22d5',
+ 'eparsl;': '\u29e3',
+ 'eplus;': '\u2a71',
+ 'epsi;': '\u03b5',
+ 'Epsilon;': '\u0395',
+ 'epsilon;': '\u03b5',
+ 'epsiv;': '\u03f5',
+ 'eqcirc;': '\u2256',
+ 'eqcolon;': '\u2255',
+ 'eqsim;': '\u2242',
+ 'eqslantgtr;': '\u2a96',
+ 'eqslantless;': '\u2a95',
+ 'Equal;': '\u2a75',
+ 'equals;': '=',
+ 'EqualTilde;': '\u2242',
+ 'equest;': '\u225f',
+ 'Equilibrium;': '\u21cc',
+ 'equiv;': '\u2261',
+ 'equivDD;': '\u2a78',
+ 'eqvparsl;': '\u29e5',
+ 'erarr;': '\u2971',
+ 'erDot;': '\u2253',
+ 'Escr;': '\u2130',
+ 'escr;': '\u212f',
+ 'esdot;': '\u2250',
+ 'Esim;': '\u2a73',
+ 'esim;': '\u2242',
+ 'Eta;': '\u0397',
+ 'eta;': '\u03b7',
+ 'ETH': '\xd0',
+ 'eth': '\xf0',
+ 'ETH;': '\xd0',
+ 'eth;': '\xf0',
+ 'Euml': '\xcb',
+ 'euml': '\xeb',
+ 'Euml;': '\xcb',
+ 'euml;': '\xeb',
+ 'euro;': '\u20ac',
+ 'excl;': '!',
+ 'exist;': '\u2203',
+ 'Exists;': '\u2203',
+ 'expectation;': '\u2130',
+ 'ExponentialE;': '\u2147',
+ 'exponentiale;': '\u2147',
+ 'fallingdotseq;': '\u2252',
+ 'Fcy;': '\u0424',
+ 'fcy;': '\u0444',
+ 'female;': '\u2640',
+ 'ffilig;': '\ufb03',
+ 'fflig;': '\ufb00',
+ 'ffllig;': '\ufb04',
+ 'Ffr;': '\U0001d509',
+ 'ffr;': '\U0001d523',
+ 'filig;': '\ufb01',
+ 'FilledSmallSquare;': '\u25fc',
+ 'FilledVerySmallSquare;': '\u25aa',
+ 'fjlig;': 'fj',
+ 'flat;': '\u266d',
+ 'fllig;': '\ufb02',
+ 'fltns;': '\u25b1',
+ 'fnof;': '\u0192',
+ 'Fopf;': '\U0001d53d',
+ 'fopf;': '\U0001d557',
+ 'ForAll;': '\u2200',
+ 'forall;': '\u2200',
+ 'fork;': '\u22d4',
+ 'forkv;': '\u2ad9',
+ 'Fouriertrf;': '\u2131',
+ 'fpartint;': '\u2a0d',
+ 'frac12': '\xbd',
+ 'frac12;': '\xbd',
+ 'frac13;': '\u2153',
+ 'frac14': '\xbc',
+ 'frac14;': '\xbc',
+ 'frac15;': '\u2155',
+ 'frac16;': '\u2159',
+ 'frac18;': '\u215b',
+ 'frac23;': '\u2154',
+ 'frac25;': '\u2156',
+ 'frac34': '\xbe',
+ 'frac34;': '\xbe',
+ 'frac35;': '\u2157',
+ 'frac38;': '\u215c',
+ 'frac45;': '\u2158',
+ 'frac56;': '\u215a',
+ 'frac58;': '\u215d',
+ 'frac78;': '\u215e',
+ 'frasl;': '\u2044',
+ 'frown;': '\u2322',
+ 'Fscr;': '\u2131',
+ 'fscr;': '\U0001d4bb',
+ 'gacute;': '\u01f5',
+ 'Gamma;': '\u0393',
+ 'gamma;': '\u03b3',
+ 'Gammad;': '\u03dc',
+ 'gammad;': '\u03dd',
+ 'gap;': '\u2a86',
+ 'Gbreve;': '\u011e',
+ 'gbreve;': '\u011f',
+ 'Gcedil;': '\u0122',
+ 'Gcirc;': '\u011c',
+ 'gcirc;': '\u011d',
+ 'Gcy;': '\u0413',
+ 'gcy;': '\u0433',
+ 'Gdot;': '\u0120',
+ 'gdot;': '\u0121',
+ 'gE;': '\u2267',
+ 'ge;': '\u2265',
+ 'gEl;': '\u2a8c',
+ 'gel;': '\u22db',
+ 'geq;': '\u2265',
+ 'geqq;': '\u2267',
+ 'geqslant;': '\u2a7e',
+ 'ges;': '\u2a7e',
+ 'gescc;': '\u2aa9',
+ 'gesdot;': '\u2a80',
+ 'gesdoto;': '\u2a82',
+ 'gesdotol;': '\u2a84',
+ 'gesl;': '\u22db\ufe00',
+ 'gesles;': '\u2a94',
+ 'Gfr;': '\U0001d50a',
+ 'gfr;': '\U0001d524',
+ 'Gg;': '\u22d9',
+ 'gg;': '\u226b',
+ 'ggg;': '\u22d9',
+ 'gimel;': '\u2137',
+ 'GJcy;': '\u0403',
+ 'gjcy;': '\u0453',
+ 'gl;': '\u2277',
+ 'gla;': '\u2aa5',
+ 'glE;': '\u2a92',
+ 'glj;': '\u2aa4',
+ 'gnap;': '\u2a8a',
+ 'gnapprox;': '\u2a8a',
+ 'gnE;': '\u2269',
+ 'gne;': '\u2a88',
+ 'gneq;': '\u2a88',
+ 'gneqq;': '\u2269',
+ 'gnsim;': '\u22e7',
+ 'Gopf;': '\U0001d53e',
+ 'gopf;': '\U0001d558',
+ 'grave;': '`',
+ 'GreaterEqual;': '\u2265',
+ 'GreaterEqualLess;': '\u22db',
+ 'GreaterFullEqual;': '\u2267',
+ 'GreaterGreater;': '\u2aa2',
+ 'GreaterLess;': '\u2277',
+ 'GreaterSlantEqual;': '\u2a7e',
+ 'GreaterTilde;': '\u2273',
+ 'Gscr;': '\U0001d4a2',
+ 'gscr;': '\u210a',
+ 'gsim;': '\u2273',
+ 'gsime;': '\u2a8e',
+ 'gsiml;': '\u2a90',
+ 'GT': '>',
+ 'gt': '>',
+ 'GT;': '>',
+ 'Gt;': '\u226b',
+ 'gt;': '>',
+ 'gtcc;': '\u2aa7',
+ 'gtcir;': '\u2a7a',
+ 'gtdot;': '\u22d7',
+ 'gtlPar;': '\u2995',
+ 'gtquest;': '\u2a7c',
+ 'gtrapprox;': '\u2a86',
+ 'gtrarr;': '\u2978',
+ 'gtrdot;': '\u22d7',
+ 'gtreqless;': '\u22db',
+ 'gtreqqless;': '\u2a8c',
+ 'gtrless;': '\u2277',
+ 'gtrsim;': '\u2273',
+ 'gvertneqq;': '\u2269\ufe00',
+ 'gvnE;': '\u2269\ufe00',
+ 'Hacek;': '\u02c7',
+ 'hairsp;': '\u200a',
+ 'half;': '\xbd',
+ 'hamilt;': '\u210b',
+ 'HARDcy;': '\u042a',
+ 'hardcy;': '\u044a',
+ 'hArr;': '\u21d4',
+ 'harr;': '\u2194',
+ 'harrcir;': '\u2948',
+ 'harrw;': '\u21ad',
+ 'Hat;': '^',
+ 'hbar;': '\u210f',
+ 'Hcirc;': '\u0124',
+ 'hcirc;': '\u0125',
+ 'hearts;': '\u2665',
+ 'heartsuit;': '\u2665',
+ 'hellip;': '\u2026',
+ 'hercon;': '\u22b9',
+ 'Hfr;': '\u210c',
+ 'hfr;': '\U0001d525',
+ 'HilbertSpace;': '\u210b',
+ 'hksearow;': '\u2925',
+ 'hkswarow;': '\u2926',
+ 'hoarr;': '\u21ff',
+ 'homtht;': '\u223b',
+ 'hookleftarrow;': '\u21a9',
+ 'hookrightarrow;': '\u21aa',
+ 'Hopf;': '\u210d',
+ 'hopf;': '\U0001d559',
+ 'horbar;': '\u2015',
+ 'HorizontalLine;': '\u2500',
+ 'Hscr;': '\u210b',
+ 'hscr;': '\U0001d4bd',
+ 'hslash;': '\u210f',
+ 'Hstrok;': '\u0126',
+ 'hstrok;': '\u0127',
+ 'HumpDownHump;': '\u224e',
+ 'HumpEqual;': '\u224f',
+ 'hybull;': '\u2043',
+ 'hyphen;': '\u2010',
+ 'Iacute': '\xcd',
+ 'iacute': '\xed',
+ 'Iacute;': '\xcd',
+ 'iacute;': '\xed',
+ 'ic;': '\u2063',
+ 'Icirc': '\xce',
+ 'icirc': '\xee',
+ 'Icirc;': '\xce',
+ 'icirc;': '\xee',
+ 'Icy;': '\u0418',
+ 'icy;': '\u0438',
+ 'Idot;': '\u0130',
+ 'IEcy;': '\u0415',
+ 'iecy;': '\u0435',
+ 'iexcl': '\xa1',
+ 'iexcl;': '\xa1',
+ 'iff;': '\u21d4',
+ 'Ifr;': '\u2111',
+ 'ifr;': '\U0001d526',
+ 'Igrave': '\xcc',
+ 'igrave': '\xec',
+ 'Igrave;': '\xcc',
+ 'igrave;': '\xec',
+ 'ii;': '\u2148',
+ 'iiiint;': '\u2a0c',
+ 'iiint;': '\u222d',
+ 'iinfin;': '\u29dc',
+ 'iiota;': '\u2129',
+ 'IJlig;': '\u0132',
+ 'ijlig;': '\u0133',
+ 'Im;': '\u2111',
+ 'Imacr;': '\u012a',
+ 'imacr;': '\u012b',
+ 'image;': '\u2111',
+ 'ImaginaryI;': '\u2148',
+ 'imagline;': '\u2110',
+ 'imagpart;': '\u2111',
+ 'imath;': '\u0131',
+ 'imof;': '\u22b7',
+ 'imped;': '\u01b5',
+ 'Implies;': '\u21d2',
+ 'in;': '\u2208',
+ 'incare;': '\u2105',
+ 'infin;': '\u221e',
+ 'infintie;': '\u29dd',
+ 'inodot;': '\u0131',
+ 'Int;': '\u222c',
+ 'int;': '\u222b',
+ 'intcal;': '\u22ba',
+ 'integers;': '\u2124',
+ 'Integral;': '\u222b',
+ 'intercal;': '\u22ba',
+ 'Intersection;': '\u22c2',
+ 'intlarhk;': '\u2a17',
+ 'intprod;': '\u2a3c',
+ 'InvisibleComma;': '\u2063',
+ 'InvisibleTimes;': '\u2062',
+ 'IOcy;': '\u0401',
+ 'iocy;': '\u0451',
+ 'Iogon;': '\u012e',
+ 'iogon;': '\u012f',
+ 'Iopf;': '\U0001d540',
+ 'iopf;': '\U0001d55a',
+ 'Iota;': '\u0399',
+ 'iota;': '\u03b9',
+ 'iprod;': '\u2a3c',
+ 'iquest': '\xbf',
+ 'iquest;': '\xbf',
+ 'Iscr;': '\u2110',
+ 'iscr;': '\U0001d4be',
+ 'isin;': '\u2208',
+ 'isindot;': '\u22f5',
+ 'isinE;': '\u22f9',
+ 'isins;': '\u22f4',
+ 'isinsv;': '\u22f3',
+ 'isinv;': '\u2208',
+ 'it;': '\u2062',
+ 'Itilde;': '\u0128',
+ 'itilde;': '\u0129',
+ 'Iukcy;': '\u0406',
+ 'iukcy;': '\u0456',
+ 'Iuml': '\xcf',
+ 'iuml': '\xef',
+ 'Iuml;': '\xcf',
+ 'iuml;': '\xef',
+ 'Jcirc;': '\u0134',
+ 'jcirc;': '\u0135',
+ 'Jcy;': '\u0419',
+ 'jcy;': '\u0439',
+ 'Jfr;': '\U0001d50d',
+ 'jfr;': '\U0001d527',
+ 'jmath;': '\u0237',
+ 'Jopf;': '\U0001d541',
+ 'jopf;': '\U0001d55b',
+ 'Jscr;': '\U0001d4a5',
+ 'jscr;': '\U0001d4bf',
+ 'Jsercy;': '\u0408',
+ 'jsercy;': '\u0458',
+ 'Jukcy;': '\u0404',
+ 'jukcy;': '\u0454',
+ 'Kappa;': '\u039a',
+ 'kappa;': '\u03ba',
+ 'kappav;': '\u03f0',
+ 'Kcedil;': '\u0136',
+ 'kcedil;': '\u0137',
+ 'Kcy;': '\u041a',
+ 'kcy;': '\u043a',
+ 'Kfr;': '\U0001d50e',
+ 'kfr;': '\U0001d528',
+ 'kgreen;': '\u0138',
+ 'KHcy;': '\u0425',
+ 'khcy;': '\u0445',
+ 'KJcy;': '\u040c',
+ 'kjcy;': '\u045c',
+ 'Kopf;': '\U0001d542',
+ 'kopf;': '\U0001d55c',
+ 'Kscr;': '\U0001d4a6',
+ 'kscr;': '\U0001d4c0',
+ 'lAarr;': '\u21da',
+ 'Lacute;': '\u0139',
+ 'lacute;': '\u013a',
+ 'laemptyv;': '\u29b4',
+ 'lagran;': '\u2112',
+ 'Lambda;': '\u039b',
+ 'lambda;': '\u03bb',
+ 'Lang;': '\u27ea',
+ 'lang;': '\u27e8',
+ 'langd;': '\u2991',
+ 'langle;': '\u27e8',
+ 'lap;': '\u2a85',
+ 'Laplacetrf;': '\u2112',
+ 'laquo': '\xab',
+ 'laquo;': '\xab',
+ 'Larr;': '\u219e',
+ 'lArr;': '\u21d0',
+ 'larr;': '\u2190',
+ 'larrb;': '\u21e4',
+ 'larrbfs;': '\u291f',
+ 'larrfs;': '\u291d',
+ 'larrhk;': '\u21a9',
+ 'larrlp;': '\u21ab',
+ 'larrpl;': '\u2939',
+ 'larrsim;': '\u2973',
+ 'larrtl;': '\u21a2',
+ 'lat;': '\u2aab',
+ 'lAtail;': '\u291b',
+ 'latail;': '\u2919',
+ 'late;': '\u2aad',
+ 'lates;': '\u2aad\ufe00',
+ 'lBarr;': '\u290e',
+ 'lbarr;': '\u290c',
+ 'lbbrk;': '\u2772',
+ 'lbrace;': '{',
+ 'lbrack;': '[',
+ 'lbrke;': '\u298b',
+ 'lbrksld;': '\u298f',
+ 'lbrkslu;': '\u298d',
+ 'Lcaron;': '\u013d',
+ 'lcaron;': '\u013e',
+ 'Lcedil;': '\u013b',
+ 'lcedil;': '\u013c',
+ 'lceil;': '\u2308',
+ 'lcub;': '{',
+ 'Lcy;': '\u041b',
+ 'lcy;': '\u043b',
+ 'ldca;': '\u2936',
+ 'ldquo;': '\u201c',
+ 'ldquor;': '\u201e',
+ 'ldrdhar;': '\u2967',
+ 'ldrushar;': '\u294b',
+ 'ldsh;': '\u21b2',
+ 'lE;': '\u2266',
+ 'le;': '\u2264',
+ 'LeftAngleBracket;': '\u27e8',
+ 'LeftArrow;': '\u2190',
+ 'Leftarrow;': '\u21d0',
+ 'leftarrow;': '\u2190',
+ 'LeftArrowBar;': '\u21e4',
+ 'LeftArrowRightArrow;': '\u21c6',
+ 'leftarrowtail;': '\u21a2',
+ 'LeftCeiling;': '\u2308',
+ 'LeftDoubleBracket;': '\u27e6',
+ 'LeftDownTeeVector;': '\u2961',
+ 'LeftDownVector;': '\u21c3',
+ 'LeftDownVectorBar;': '\u2959',
+ 'LeftFloor;': '\u230a',
+ 'leftharpoondown;': '\u21bd',
+ 'leftharpoonup;': '\u21bc',
+ 'leftleftarrows;': '\u21c7',
+ 'LeftRightArrow;': '\u2194',
+ 'Leftrightarrow;': '\u21d4',
+ 'leftrightarrow;': '\u2194',
+ 'leftrightarrows;': '\u21c6',
+ 'leftrightharpoons;': '\u21cb',
+ 'leftrightsquigarrow;': '\u21ad',
+ 'LeftRightVector;': '\u294e',
+ 'LeftTee;': '\u22a3',
+ 'LeftTeeArrow;': '\u21a4',
+ 'LeftTeeVector;': '\u295a',
+ 'leftthreetimes;': '\u22cb',
+ 'LeftTriangle;': '\u22b2',
+ 'LeftTriangleBar;': '\u29cf',
+ 'LeftTriangleEqual;': '\u22b4',
+ 'LeftUpDownVector;': '\u2951',
+ 'LeftUpTeeVector;': '\u2960',
+ 'LeftUpVector;': '\u21bf',
+ 'LeftUpVectorBar;': '\u2958',
+ 'LeftVector;': '\u21bc',
+ 'LeftVectorBar;': '\u2952',
+ 'lEg;': '\u2a8b',
+ 'leg;': '\u22da',
+ 'leq;': '\u2264',
+ 'leqq;': '\u2266',
+ 'leqslant;': '\u2a7d',
+ 'les;': '\u2a7d',
+ 'lescc;': '\u2aa8',
+ 'lesdot;': '\u2a7f',
+ 'lesdoto;': '\u2a81',
+ 'lesdotor;': '\u2a83',
+ 'lesg;': '\u22da\ufe00',
+ 'lesges;': '\u2a93',
+ 'lessapprox;': '\u2a85',
+ 'lessdot;': '\u22d6',
+ 'lesseqgtr;': '\u22da',
+ 'lesseqqgtr;': '\u2a8b',
+ 'LessEqualGreater;': '\u22da',
+ 'LessFullEqual;': '\u2266',
+ 'LessGreater;': '\u2276',
+ 'lessgtr;': '\u2276',
+ 'LessLess;': '\u2aa1',
+ 'lesssim;': '\u2272',
+ 'LessSlantEqual;': '\u2a7d',
+ 'LessTilde;': '\u2272',
+ 'lfisht;': '\u297c',
+ 'lfloor;': '\u230a',
+ 'Lfr;': '\U0001d50f',
+ 'lfr;': '\U0001d529',
+ 'lg;': '\u2276',
+ 'lgE;': '\u2a91',
+ 'lHar;': '\u2962',
+ 'lhard;': '\u21bd',
+ 'lharu;': '\u21bc',
+ 'lharul;': '\u296a',
+ 'lhblk;': '\u2584',
+ 'LJcy;': '\u0409',
+ 'ljcy;': '\u0459',
+ 'Ll;': '\u22d8',
+ 'll;': '\u226a',
+ 'llarr;': '\u21c7',
+ 'llcorner;': '\u231e',
+ 'Lleftarrow;': '\u21da',
+ 'llhard;': '\u296b',
+ 'lltri;': '\u25fa',
+ 'Lmidot;': '\u013f',
+ 'lmidot;': '\u0140',
+ 'lmoust;': '\u23b0',
+ 'lmoustache;': '\u23b0',
+ 'lnap;': '\u2a89',
+ 'lnapprox;': '\u2a89',
+ 'lnE;': '\u2268',
+ 'lne;': '\u2a87',
+ 'lneq;': '\u2a87',
+ 'lneqq;': '\u2268',
+ 'lnsim;': '\u22e6',
+ 'loang;': '\u27ec',
+ 'loarr;': '\u21fd',
+ 'lobrk;': '\u27e6',
+ 'LongLeftArrow;': '\u27f5',
+ 'Longleftarrow;': '\u27f8',
+ 'longleftarrow;': '\u27f5',
+ 'LongLeftRightArrow;': '\u27f7',
+ 'Longleftrightarrow;': '\u27fa',
+ 'longleftrightarrow;': '\u27f7',
+ 'longmapsto;': '\u27fc',
+ 'LongRightArrow;': '\u27f6',
+ 'Longrightarrow;': '\u27f9',
+ 'longrightarrow;': '\u27f6',
+ 'looparrowleft;': '\u21ab',
+ 'looparrowright;': '\u21ac',
+ 'lopar;': '\u2985',
+ 'Lopf;': '\U0001d543',
+ 'lopf;': '\U0001d55d',
+ 'loplus;': '\u2a2d',
+ 'lotimes;': '\u2a34',
+ 'lowast;': '\u2217',
+ 'lowbar;': '_',
+ 'LowerLeftArrow;': '\u2199',
+ 'LowerRightArrow;': '\u2198',
+ 'loz;': '\u25ca',
+ 'lozenge;': '\u25ca',
+ 'lozf;': '\u29eb',
+ 'lpar;': '(',
+ 'lparlt;': '\u2993',
+ 'lrarr;': '\u21c6',
+ 'lrcorner;': '\u231f',
+ 'lrhar;': '\u21cb',
+ 'lrhard;': '\u296d',
+ 'lrm;': '\u200e',
+ 'lrtri;': '\u22bf',
+ 'lsaquo;': '\u2039',
+ 'Lscr;': '\u2112',
+ 'lscr;': '\U0001d4c1',
+ 'Lsh;': '\u21b0',
+ 'lsh;': '\u21b0',
+ 'lsim;': '\u2272',
+ 'lsime;': '\u2a8d',
+ 'lsimg;': '\u2a8f',
+ 'lsqb;': '[',
+ 'lsquo;': '\u2018',
+ 'lsquor;': '\u201a',
+ 'Lstrok;': '\u0141',
+ 'lstrok;': '\u0142',
+ 'LT': '<',
+ 'lt': '<',
+ 'LT;': '<',
+ 'Lt;': '\u226a',
+ 'lt;': '<',
+ 'ltcc;': '\u2aa6',
+ 'ltcir;': '\u2a79',
+ 'ltdot;': '\u22d6',
+ 'lthree;': '\u22cb',
+ 'ltimes;': '\u22c9',
+ 'ltlarr;': '\u2976',
+ 'ltquest;': '\u2a7b',
+ 'ltri;': '\u25c3',
+ 'ltrie;': '\u22b4',
+ 'ltrif;': '\u25c2',
+ 'ltrPar;': '\u2996',
+ 'lurdshar;': '\u294a',
+ 'luruhar;': '\u2966',
+ 'lvertneqq;': '\u2268\ufe00',
+ 'lvnE;': '\u2268\ufe00',
+ 'macr': '\xaf',
+ 'macr;': '\xaf',
+ 'male;': '\u2642',
+ 'malt;': '\u2720',
+ 'maltese;': '\u2720',
+ 'Map;': '\u2905',
+ 'map;': '\u21a6',
+ 'mapsto;': '\u21a6',
+ 'mapstodown;': '\u21a7',
+ 'mapstoleft;': '\u21a4',
+ 'mapstoup;': '\u21a5',
+ 'marker;': '\u25ae',
+ 'mcomma;': '\u2a29',
+ 'Mcy;': '\u041c',
+ 'mcy;': '\u043c',
+ 'mdash;': '\u2014',
+ 'mDDot;': '\u223a',
+ 'measuredangle;': '\u2221',
+ 'MediumSpace;': '\u205f',
+ 'Mellintrf;': '\u2133',
+ 'Mfr;': '\U0001d510',
+ 'mfr;': '\U0001d52a',
+ 'mho;': '\u2127',
+ 'micro': '\xb5',
+ 'micro;': '\xb5',
+ 'mid;': '\u2223',
+ 'midast;': '*',
+ 'midcir;': '\u2af0',
+ 'middot': '\xb7',
+ 'middot;': '\xb7',
+ 'minus;': '\u2212',
+ 'minusb;': '\u229f',
+ 'minusd;': '\u2238',
+ 'minusdu;': '\u2a2a',
+ 'MinusPlus;': '\u2213',
+ 'mlcp;': '\u2adb',
+ 'mldr;': '\u2026',
+ 'mnplus;': '\u2213',
+ 'models;': '\u22a7',
+ 'Mopf;': '\U0001d544',
+ 'mopf;': '\U0001d55e',
+ 'mp;': '\u2213',
+ 'Mscr;': '\u2133',
+ 'mscr;': '\U0001d4c2',
+ 'mstpos;': '\u223e',
+ 'Mu;': '\u039c',
+ 'mu;': '\u03bc',
+ 'multimap;': '\u22b8',
+ 'mumap;': '\u22b8',
+ 'nabla;': '\u2207',
+ 'Nacute;': '\u0143',
+ 'nacute;': '\u0144',
+ 'nang;': '\u2220\u20d2',
+ 'nap;': '\u2249',
+ 'napE;': '\u2a70\u0338',
+ 'napid;': '\u224b\u0338',
+ 'napos;': '\u0149',
+ 'napprox;': '\u2249',
+ 'natur;': '\u266e',
+ 'natural;': '\u266e',
+ 'naturals;': '\u2115',
+ 'nbsp': '\xa0',
+ 'nbsp;': '\xa0',
+ 'nbump;': '\u224e\u0338',
+ 'nbumpe;': '\u224f\u0338',
+ 'ncap;': '\u2a43',
+ 'Ncaron;': '\u0147',
+ 'ncaron;': '\u0148',
+ 'Ncedil;': '\u0145',
+ 'ncedil;': '\u0146',
+ 'ncong;': '\u2247',
+ 'ncongdot;': '\u2a6d\u0338',
+ 'ncup;': '\u2a42',
+ 'Ncy;': '\u041d',
+ 'ncy;': '\u043d',
+ 'ndash;': '\u2013',
+ 'ne;': '\u2260',
+ 'nearhk;': '\u2924',
+ 'neArr;': '\u21d7',
+ 'nearr;': '\u2197',
+ 'nearrow;': '\u2197',
+ 'nedot;': '\u2250\u0338',
+ 'NegativeMediumSpace;': '\u200b',
+ 'NegativeThickSpace;': '\u200b',
+ 'NegativeThinSpace;': '\u200b',
+ 'NegativeVeryThinSpace;': '\u200b',
+ 'nequiv;': '\u2262',
+ 'nesear;': '\u2928',
+ 'nesim;': '\u2242\u0338',
+ 'NestedGreaterGreater;': '\u226b',
+ 'NestedLessLess;': '\u226a',
+ 'NewLine;': '\n',
+ 'nexist;': '\u2204',
+ 'nexists;': '\u2204',
+ 'Nfr;': '\U0001d511',
+ 'nfr;': '\U0001d52b',
+ 'ngE;': '\u2267\u0338',
+ 'nge;': '\u2271',
+ 'ngeq;': '\u2271',
+ 'ngeqq;': '\u2267\u0338',
+ 'ngeqslant;': '\u2a7e\u0338',
+ 'nges;': '\u2a7e\u0338',
+ 'nGg;': '\u22d9\u0338',
+ 'ngsim;': '\u2275',
+ 'nGt;': '\u226b\u20d2',
+ 'ngt;': '\u226f',
+ 'ngtr;': '\u226f',
+ 'nGtv;': '\u226b\u0338',
+ 'nhArr;': '\u21ce',
+ 'nharr;': '\u21ae',
+ 'nhpar;': '\u2af2',
+ 'ni;': '\u220b',
+ 'nis;': '\u22fc',
+ 'nisd;': '\u22fa',
+ 'niv;': '\u220b',
+ 'NJcy;': '\u040a',
+ 'njcy;': '\u045a',
+ 'nlArr;': '\u21cd',
+ 'nlarr;': '\u219a',
+ 'nldr;': '\u2025',
+ 'nlE;': '\u2266\u0338',
+ 'nle;': '\u2270',
+ 'nLeftarrow;': '\u21cd',
+ 'nleftarrow;': '\u219a',
+ 'nLeftrightarrow;': '\u21ce',
+ 'nleftrightarrow;': '\u21ae',
+ 'nleq;': '\u2270',
+ 'nleqq;': '\u2266\u0338',
+ 'nleqslant;': '\u2a7d\u0338',
+ 'nles;': '\u2a7d\u0338',
+ 'nless;': '\u226e',
+ 'nLl;': '\u22d8\u0338',
+ 'nlsim;': '\u2274',
+ 'nLt;': '\u226a\u20d2',
+ 'nlt;': '\u226e',
+ 'nltri;': '\u22ea',
+ 'nltrie;': '\u22ec',
+ 'nLtv;': '\u226a\u0338',
+ 'nmid;': '\u2224',
+ 'NoBreak;': '\u2060',
+ 'NonBreakingSpace;': '\xa0',
+ 'Nopf;': '\u2115',
+ 'nopf;': '\U0001d55f',
+ 'not': '\xac',
+ 'Not;': '\u2aec',
+ 'not;': '\xac',
+ 'NotCongruent;': '\u2262',
+ 'NotCupCap;': '\u226d',
+ 'NotDoubleVerticalBar;': '\u2226',
+ 'NotElement;': '\u2209',
+ 'NotEqual;': '\u2260',
+ 'NotEqualTilde;': '\u2242\u0338',
+ 'NotExists;': '\u2204',
+ 'NotGreater;': '\u226f',
+ 'NotGreaterEqual;': '\u2271',
+ 'NotGreaterFullEqual;': '\u2267\u0338',
+ 'NotGreaterGreater;': '\u226b\u0338',
+ 'NotGreaterLess;': '\u2279',
+ 'NotGreaterSlantEqual;': '\u2a7e\u0338',
+ 'NotGreaterTilde;': '\u2275',
+ 'NotHumpDownHump;': '\u224e\u0338',
+ 'NotHumpEqual;': '\u224f\u0338',
+ 'notin;': '\u2209',
+ 'notindot;': '\u22f5\u0338',
+ 'notinE;': '\u22f9\u0338',
+ 'notinva;': '\u2209',
+ 'notinvb;': '\u22f7',
+ 'notinvc;': '\u22f6',
+ 'NotLeftTriangle;': '\u22ea',
+ 'NotLeftTriangleBar;': '\u29cf\u0338',
+ 'NotLeftTriangleEqual;': '\u22ec',
+ 'NotLess;': '\u226e',
+ 'NotLessEqual;': '\u2270',
+ 'NotLessGreater;': '\u2278',
+ 'NotLessLess;': '\u226a\u0338',
+ 'NotLessSlantEqual;': '\u2a7d\u0338',
+ 'NotLessTilde;': '\u2274',
+ 'NotNestedGreaterGreater;': '\u2aa2\u0338',
+ 'NotNestedLessLess;': '\u2aa1\u0338',
+ 'notni;': '\u220c',
+ 'notniva;': '\u220c',
+ 'notnivb;': '\u22fe',
+ 'notnivc;': '\u22fd',
+ 'NotPrecedes;': '\u2280',
+ 'NotPrecedesEqual;': '\u2aaf\u0338',
+ 'NotPrecedesSlantEqual;': '\u22e0',
+ 'NotReverseElement;': '\u220c',
+ 'NotRightTriangle;': '\u22eb',
+ 'NotRightTriangleBar;': '\u29d0\u0338',
+ 'NotRightTriangleEqual;': '\u22ed',
+ 'NotSquareSubset;': '\u228f\u0338',
+ 'NotSquareSubsetEqual;': '\u22e2',
+ 'NotSquareSuperset;': '\u2290\u0338',
+ 'NotSquareSupersetEqual;': '\u22e3',
+ 'NotSubset;': '\u2282\u20d2',
+ 'NotSubsetEqual;': '\u2288',
+ 'NotSucceeds;': '\u2281',
+ 'NotSucceedsEqual;': '\u2ab0\u0338',
+ 'NotSucceedsSlantEqual;': '\u22e1',
+ 'NotSucceedsTilde;': '\u227f\u0338',
+ 'NotSuperset;': '\u2283\u20d2',
+ 'NotSupersetEqual;': '\u2289',
+ 'NotTilde;': '\u2241',
+ 'NotTildeEqual;': '\u2244',
+ 'NotTildeFullEqual;': '\u2247',
+ 'NotTildeTilde;': '\u2249',
+ 'NotVerticalBar;': '\u2224',
+ 'npar;': '\u2226',
+ 'nparallel;': '\u2226',
+ 'nparsl;': '\u2afd\u20e5',
+ 'npart;': '\u2202\u0338',
+ 'npolint;': '\u2a14',
+ 'npr;': '\u2280',
+ 'nprcue;': '\u22e0',
+ 'npre;': '\u2aaf\u0338',
+ 'nprec;': '\u2280',
+ 'npreceq;': '\u2aaf\u0338',
+ 'nrArr;': '\u21cf',
+ 'nrarr;': '\u219b',
+ 'nrarrc;': '\u2933\u0338',
+ 'nrarrw;': '\u219d\u0338',
+ 'nRightarrow;': '\u21cf',
+ 'nrightarrow;': '\u219b',
+ 'nrtri;': '\u22eb',
+ 'nrtrie;': '\u22ed',
+ 'nsc;': '\u2281',
+ 'nsccue;': '\u22e1',
+ 'nsce;': '\u2ab0\u0338',
+ 'Nscr;': '\U0001d4a9',
+ 'nscr;': '\U0001d4c3',
+ 'nshortmid;': '\u2224',
+ 'nshortparallel;': '\u2226',
+ 'nsim;': '\u2241',
+ 'nsime;': '\u2244',
+ 'nsimeq;': '\u2244',
+ 'nsmid;': '\u2224',
+ 'nspar;': '\u2226',
+ 'nsqsube;': '\u22e2',
+ 'nsqsupe;': '\u22e3',
+ 'nsub;': '\u2284',
+ 'nsubE;': '\u2ac5\u0338',
+ 'nsube;': '\u2288',
+ 'nsubset;': '\u2282\u20d2',
+ 'nsubseteq;': '\u2288',
+ 'nsubseteqq;': '\u2ac5\u0338',
+ 'nsucc;': '\u2281',
+ 'nsucceq;': '\u2ab0\u0338',
+ 'nsup;': '\u2285',
+ 'nsupE;': '\u2ac6\u0338',
+ 'nsupe;': '\u2289',
+ 'nsupset;': '\u2283\u20d2',
+ 'nsupseteq;': '\u2289',
+ 'nsupseteqq;': '\u2ac6\u0338',
+ 'ntgl;': '\u2279',
+ 'Ntilde': '\xd1',
+ 'ntilde': '\xf1',
+ 'Ntilde;': '\xd1',
+ 'ntilde;': '\xf1',
+ 'ntlg;': '\u2278',
+ 'ntriangleleft;': '\u22ea',
+ 'ntrianglelefteq;': '\u22ec',
+ 'ntriangleright;': '\u22eb',
+ 'ntrianglerighteq;': '\u22ed',
+ 'Nu;': '\u039d',
+ 'nu;': '\u03bd',
+ 'num;': '#',
+ 'numero;': '\u2116',
+ 'numsp;': '\u2007',
+ 'nvap;': '\u224d\u20d2',
+ 'nVDash;': '\u22af',
+ 'nVdash;': '\u22ae',
+ 'nvDash;': '\u22ad',
+ 'nvdash;': '\u22ac',
+ 'nvge;': '\u2265\u20d2',
+ 'nvgt;': '>\u20d2',
+ 'nvHarr;': '\u2904',
+ 'nvinfin;': '\u29de',
+ 'nvlArr;': '\u2902',
+ 'nvle;': '\u2264\u20d2',
+ 'nvlt;': '<\u20d2',
+ 'nvltrie;': '\u22b4\u20d2',
+ 'nvrArr;': '\u2903',
+ 'nvrtrie;': '\u22b5\u20d2',
+ 'nvsim;': '\u223c\u20d2',
+ 'nwarhk;': '\u2923',
+ 'nwArr;': '\u21d6',
+ 'nwarr;': '\u2196',
+ 'nwarrow;': '\u2196',
+ 'nwnear;': '\u2927',
+ 'Oacute': '\xd3',
+ 'oacute': '\xf3',
+ 'Oacute;': '\xd3',
+ 'oacute;': '\xf3',
+ 'oast;': '\u229b',
+ 'ocir;': '\u229a',
+ 'Ocirc': '\xd4',
+ 'ocirc': '\xf4',
+ 'Ocirc;': '\xd4',
+ 'ocirc;': '\xf4',
+ 'Ocy;': '\u041e',
+ 'ocy;': '\u043e',
+ 'odash;': '\u229d',
+ 'Odblac;': '\u0150',
+ 'odblac;': '\u0151',
+ 'odiv;': '\u2a38',
+ 'odot;': '\u2299',
+ 'odsold;': '\u29bc',
+ 'OElig;': '\u0152',
+ 'oelig;': '\u0153',
+ 'ofcir;': '\u29bf',
+ 'Ofr;': '\U0001d512',
+ 'ofr;': '\U0001d52c',
+ 'ogon;': '\u02db',
+ 'Ograve': '\xd2',
+ 'ograve': '\xf2',
+ 'Ograve;': '\xd2',
+ 'ograve;': '\xf2',
+ 'ogt;': '\u29c1',
+ 'ohbar;': '\u29b5',
+ 'ohm;': '\u03a9',
+ 'oint;': '\u222e',
+ 'olarr;': '\u21ba',
+ 'olcir;': '\u29be',
+ 'olcross;': '\u29bb',
+ 'oline;': '\u203e',
+ 'olt;': '\u29c0',
+ 'Omacr;': '\u014c',
+ 'omacr;': '\u014d',
+ 'Omega;': '\u03a9',
+ 'omega;': '\u03c9',
+ 'Omicron;': '\u039f',
+ 'omicron;': '\u03bf',
+ 'omid;': '\u29b6',
+ 'ominus;': '\u2296',
+ 'Oopf;': '\U0001d546',
+ 'oopf;': '\U0001d560',
+ 'opar;': '\u29b7',
+ 'OpenCurlyDoubleQuote;': '\u201c',
+ 'OpenCurlyQuote;': '\u2018',
+ 'operp;': '\u29b9',
+ 'oplus;': '\u2295',
+ 'Or;': '\u2a54',
+ 'or;': '\u2228',
+ 'orarr;': '\u21bb',
+ 'ord;': '\u2a5d',
+ 'order;': '\u2134',
+ 'orderof;': '\u2134',
+ 'ordf': '\xaa',
+ 'ordf;': '\xaa',
+ 'ordm': '\xba',
+ 'ordm;': '\xba',
+ 'origof;': '\u22b6',
+ 'oror;': '\u2a56',
+ 'orslope;': '\u2a57',
+ 'orv;': '\u2a5b',
+ 'oS;': '\u24c8',
+ 'Oscr;': '\U0001d4aa',
+ 'oscr;': '\u2134',
+ 'Oslash': '\xd8',
+ 'oslash': '\xf8',
+ 'Oslash;': '\xd8',
+ 'oslash;': '\xf8',
+ 'osol;': '\u2298',
+ 'Otilde': '\xd5',
+ 'otilde': '\xf5',
+ 'Otilde;': '\xd5',
+ 'otilde;': '\xf5',
+ 'Otimes;': '\u2a37',
+ 'otimes;': '\u2297',
+ 'otimesas;': '\u2a36',
+ 'Ouml': '\xd6',
+ 'ouml': '\xf6',
+ 'Ouml;': '\xd6',
+ 'ouml;': '\xf6',
+ 'ovbar;': '\u233d',
+ 'OverBar;': '\u203e',
+ 'OverBrace;': '\u23de',
+ 'OverBracket;': '\u23b4',
+ 'OverParenthesis;': '\u23dc',
+ 'par;': '\u2225',
+ 'para': '\xb6',
+ 'para;': '\xb6',
+ 'parallel;': '\u2225',
+ 'parsim;': '\u2af3',
+ 'parsl;': '\u2afd',
+ 'part;': '\u2202',
+ 'PartialD;': '\u2202',
+ 'Pcy;': '\u041f',
+ 'pcy;': '\u043f',
+ 'percnt;': '%',
+ 'period;': '.',
+ 'permil;': '\u2030',
+ 'perp;': '\u22a5',
+ 'pertenk;': '\u2031',
+ 'Pfr;': '\U0001d513',
+ 'pfr;': '\U0001d52d',
+ 'Phi;': '\u03a6',
+ 'phi;': '\u03c6',
+ 'phiv;': '\u03d5',
+ 'phmmat;': '\u2133',
+ 'phone;': '\u260e',
+ 'Pi;': '\u03a0',
+ 'pi;': '\u03c0',
+ 'pitchfork;': '\u22d4',
+ 'piv;': '\u03d6',
+ 'planck;': '\u210f',
+ 'planckh;': '\u210e',
+ 'plankv;': '\u210f',
+ 'plus;': '+',
+ 'plusacir;': '\u2a23',
+ 'plusb;': '\u229e',
+ 'pluscir;': '\u2a22',
+ 'plusdo;': '\u2214',
+ 'plusdu;': '\u2a25',
+ 'pluse;': '\u2a72',
+ 'PlusMinus;': '\xb1',
+ 'plusmn': '\xb1',
+ 'plusmn;': '\xb1',
+ 'plussim;': '\u2a26',
+ 'plustwo;': '\u2a27',
+ 'pm;': '\xb1',
+ 'Poincareplane;': '\u210c',
+ 'pointint;': '\u2a15',
+ 'Popf;': '\u2119',
+ 'popf;': '\U0001d561',
+ 'pound': '\xa3',
+ 'pound;': '\xa3',
+ 'Pr;': '\u2abb',
+ 'pr;': '\u227a',
+ 'prap;': '\u2ab7',
+ 'prcue;': '\u227c',
+ 'prE;': '\u2ab3',
+ 'pre;': '\u2aaf',
+ 'prec;': '\u227a',
+ 'precapprox;': '\u2ab7',
+ 'preccurlyeq;': '\u227c',
+ 'Precedes;': '\u227a',
+ 'PrecedesEqual;': '\u2aaf',
+ 'PrecedesSlantEqual;': '\u227c',
+ 'PrecedesTilde;': '\u227e',
+ 'preceq;': '\u2aaf',
+ 'precnapprox;': '\u2ab9',
+ 'precneqq;': '\u2ab5',
+ 'precnsim;': '\u22e8',
+ 'precsim;': '\u227e',
+ 'Prime;': '\u2033',
+ 'prime;': '\u2032',
+ 'primes;': '\u2119',
+ 'prnap;': '\u2ab9',
+ 'prnE;': '\u2ab5',
+ 'prnsim;': '\u22e8',
+ 'prod;': '\u220f',
+ 'Product;': '\u220f',
+ 'profalar;': '\u232e',
+ 'profline;': '\u2312',
+ 'profsurf;': '\u2313',
+ 'prop;': '\u221d',
+ 'Proportion;': '\u2237',
+ 'Proportional;': '\u221d',
+ 'propto;': '\u221d',
+ 'prsim;': '\u227e',
+ 'prurel;': '\u22b0',
+ 'Pscr;': '\U0001d4ab',
+ 'pscr;': '\U0001d4c5',
+ 'Psi;': '\u03a8',
+ 'psi;': '\u03c8',
+ 'puncsp;': '\u2008',
+ 'Qfr;': '\U0001d514',
+ 'qfr;': '\U0001d52e',
+ 'qint;': '\u2a0c',
+ 'Qopf;': '\u211a',
+ 'qopf;': '\U0001d562',
+ 'qprime;': '\u2057',
+ 'Qscr;': '\U0001d4ac',
+ 'qscr;': '\U0001d4c6',
+ 'quaternions;': '\u210d',
+ 'quatint;': '\u2a16',
+ 'quest;': '?',
+ 'questeq;': '\u225f',
+ 'QUOT': '"',
+ 'quot': '"',
+ 'QUOT;': '"',
+ 'quot;': '"',
+ 'rAarr;': '\u21db',
+ 'race;': '\u223d\u0331',
+ 'Racute;': '\u0154',
+ 'racute;': '\u0155',
+ 'radic;': '\u221a',
+ 'raemptyv;': '\u29b3',
+ 'Rang;': '\u27eb',
+ 'rang;': '\u27e9',
+ 'rangd;': '\u2992',
+ 'range;': '\u29a5',
+ 'rangle;': '\u27e9',
+ 'raquo': '\xbb',
+ 'raquo;': '\xbb',
+ 'Rarr;': '\u21a0',
+ 'rArr;': '\u21d2',
+ 'rarr;': '\u2192',
+ 'rarrap;': '\u2975',
+ 'rarrb;': '\u21e5',
+ 'rarrbfs;': '\u2920',
+ 'rarrc;': '\u2933',
+ 'rarrfs;': '\u291e',
+ 'rarrhk;': '\u21aa',
+ 'rarrlp;': '\u21ac',
+ 'rarrpl;': '\u2945',
+ 'rarrsim;': '\u2974',
+ 'Rarrtl;': '\u2916',
+ 'rarrtl;': '\u21a3',
+ 'rarrw;': '\u219d',
+ 'rAtail;': '\u291c',
+ 'ratail;': '\u291a',
+ 'ratio;': '\u2236',
+ 'rationals;': '\u211a',
+ 'RBarr;': '\u2910',
+ 'rBarr;': '\u290f',
+ 'rbarr;': '\u290d',
+ 'rbbrk;': '\u2773',
+ 'rbrace;': '}',
+ 'rbrack;': ']',
+ 'rbrke;': '\u298c',
+ 'rbrksld;': '\u298e',
+ 'rbrkslu;': '\u2990',
+ 'Rcaron;': '\u0158',
+ 'rcaron;': '\u0159',
+ 'Rcedil;': '\u0156',
+ 'rcedil;': '\u0157',
+ 'rceil;': '\u2309',
+ 'rcub;': '}',
+ 'Rcy;': '\u0420',
+ 'rcy;': '\u0440',
+ 'rdca;': '\u2937',
+ 'rdldhar;': '\u2969',
+ 'rdquo;': '\u201d',
+ 'rdquor;': '\u201d',
+ 'rdsh;': '\u21b3',
+ 'Re;': '\u211c',
+ 'real;': '\u211c',
+ 'realine;': '\u211b',
+ 'realpart;': '\u211c',
+ 'reals;': '\u211d',
+ 'rect;': '\u25ad',
+ 'REG': '\xae',
+ 'reg': '\xae',
+ 'REG;': '\xae',
+ 'reg;': '\xae',
+ 'ReverseElement;': '\u220b',
+ 'ReverseEquilibrium;': '\u21cb',
+ 'ReverseUpEquilibrium;': '\u296f',
+ 'rfisht;': '\u297d',
+ 'rfloor;': '\u230b',
+ 'Rfr;': '\u211c',
+ 'rfr;': '\U0001d52f',
+ 'rHar;': '\u2964',
+ 'rhard;': '\u21c1',
+ 'rharu;': '\u21c0',
+ 'rharul;': '\u296c',
+ 'Rho;': '\u03a1',
+ 'rho;': '\u03c1',
+ 'rhov;': '\u03f1',
+ 'RightAngleBracket;': '\u27e9',
+ 'RightArrow;': '\u2192',
+ 'Rightarrow;': '\u21d2',
+ 'rightarrow;': '\u2192',
+ 'RightArrowBar;': '\u21e5',
+ 'RightArrowLeftArrow;': '\u21c4',
+ 'rightarrowtail;': '\u21a3',
+ 'RightCeiling;': '\u2309',
+ 'RightDoubleBracket;': '\u27e7',
+ 'RightDownTeeVector;': '\u295d',
+ 'RightDownVector;': '\u21c2',
+ 'RightDownVectorBar;': '\u2955',
+ 'RightFloor;': '\u230b',
+ 'rightharpoondown;': '\u21c1',
+ 'rightharpoonup;': '\u21c0',
+ 'rightleftarrows;': '\u21c4',
+ 'rightleftharpoons;': '\u21cc',
+ 'rightrightarrows;': '\u21c9',
+ 'rightsquigarrow;': '\u219d',
+ 'RightTee;': '\u22a2',
+ 'RightTeeArrow;': '\u21a6',
+ 'RightTeeVector;': '\u295b',
+ 'rightthreetimes;': '\u22cc',
+ 'RightTriangle;': '\u22b3',
+ 'RightTriangleBar;': '\u29d0',
+ 'RightTriangleEqual;': '\u22b5',
+ 'RightUpDownVector;': '\u294f',
+ 'RightUpTeeVector;': '\u295c',
+ 'RightUpVector;': '\u21be',
+ 'RightUpVectorBar;': '\u2954',
+ 'RightVector;': '\u21c0',
+ 'RightVectorBar;': '\u2953',
+ 'ring;': '\u02da',
+ 'risingdotseq;': '\u2253',
+ 'rlarr;': '\u21c4',
+ 'rlhar;': '\u21cc',
+ 'rlm;': '\u200f',
+ 'rmoust;': '\u23b1',
+ 'rmoustache;': '\u23b1',
+ 'rnmid;': '\u2aee',
+ 'roang;': '\u27ed',
+ 'roarr;': '\u21fe',
+ 'robrk;': '\u27e7',
+ 'ropar;': '\u2986',
+ 'Ropf;': '\u211d',
+ 'ropf;': '\U0001d563',
+ 'roplus;': '\u2a2e',
+ 'rotimes;': '\u2a35',
+ 'RoundImplies;': '\u2970',
+ 'rpar;': ')',
+ 'rpargt;': '\u2994',
+ 'rppolint;': '\u2a12',
+ 'rrarr;': '\u21c9',
+ 'Rrightarrow;': '\u21db',
+ 'rsaquo;': '\u203a',
+ 'Rscr;': '\u211b',
+ 'rscr;': '\U0001d4c7',
+ 'Rsh;': '\u21b1',
+ 'rsh;': '\u21b1',
+ 'rsqb;': ']',
+ 'rsquo;': '\u2019',
+ 'rsquor;': '\u2019',
+ 'rthree;': '\u22cc',
+ 'rtimes;': '\u22ca',
+ 'rtri;': '\u25b9',
+ 'rtrie;': '\u22b5',
+ 'rtrif;': '\u25b8',
+ 'rtriltri;': '\u29ce',
+ 'RuleDelayed;': '\u29f4',
+ 'ruluhar;': '\u2968',
+ 'rx;': '\u211e',
+ 'Sacute;': '\u015a',
+ 'sacute;': '\u015b',
+ 'sbquo;': '\u201a',
+ 'Sc;': '\u2abc',
+ 'sc;': '\u227b',
+ 'scap;': '\u2ab8',
+ 'Scaron;': '\u0160',
+ 'scaron;': '\u0161',
+ 'sccue;': '\u227d',
+ 'scE;': '\u2ab4',
+ 'sce;': '\u2ab0',
+ 'Scedil;': '\u015e',
+ 'scedil;': '\u015f',
+ 'Scirc;': '\u015c',
+ 'scirc;': '\u015d',
+ 'scnap;': '\u2aba',
+ 'scnE;': '\u2ab6',
+ 'scnsim;': '\u22e9',
+ 'scpolint;': '\u2a13',
+ 'scsim;': '\u227f',
+ 'Scy;': '\u0421',
+ 'scy;': '\u0441',
+ 'sdot;': '\u22c5',
+ 'sdotb;': '\u22a1',
+ 'sdote;': '\u2a66',
+ 'searhk;': '\u2925',
+ 'seArr;': '\u21d8',
+ 'searr;': '\u2198',
+ 'searrow;': '\u2198',
+ 'sect': '\xa7',
+ 'sect;': '\xa7',
+ 'semi;': ';',
+ 'seswar;': '\u2929',
+ 'setminus;': '\u2216',
+ 'setmn;': '\u2216',
+ 'sext;': '\u2736',
+ 'Sfr;': '\U0001d516',
+ 'sfr;': '\U0001d530',
+ 'sfrown;': '\u2322',
+ 'sharp;': '\u266f',
+ 'SHCHcy;': '\u0429',
+ 'shchcy;': '\u0449',
+ 'SHcy;': '\u0428',
+ 'shcy;': '\u0448',
+ 'ShortDownArrow;': '\u2193',
+ 'ShortLeftArrow;': '\u2190',
+ 'shortmid;': '\u2223',
+ 'shortparallel;': '\u2225',
+ 'ShortRightArrow;': '\u2192',
+ 'ShortUpArrow;': '\u2191',
+ 'shy': '\xad',
+ 'shy;': '\xad',
+ 'Sigma;': '\u03a3',
+ 'sigma;': '\u03c3',
+ 'sigmaf;': '\u03c2',
+ 'sigmav;': '\u03c2',
+ 'sim;': '\u223c',
+ 'simdot;': '\u2a6a',
+ 'sime;': '\u2243',
+ 'simeq;': '\u2243',
+ 'simg;': '\u2a9e',
+ 'simgE;': '\u2aa0',
+ 'siml;': '\u2a9d',
+ 'simlE;': '\u2a9f',
+ 'simne;': '\u2246',
+ 'simplus;': '\u2a24',
+ 'simrarr;': '\u2972',
+ 'slarr;': '\u2190',
+ 'SmallCircle;': '\u2218',
+ 'smallsetminus;': '\u2216',
+ 'smashp;': '\u2a33',
+ 'smeparsl;': '\u29e4',
+ 'smid;': '\u2223',
+ 'smile;': '\u2323',
+ 'smt;': '\u2aaa',
+ 'smte;': '\u2aac',
+ 'smtes;': '\u2aac\ufe00',
+ 'SOFTcy;': '\u042c',
+ 'softcy;': '\u044c',
+ 'sol;': '/',
+ 'solb;': '\u29c4',
+ 'solbar;': '\u233f',
+ 'Sopf;': '\U0001d54a',
+ 'sopf;': '\U0001d564',
+ 'spades;': '\u2660',
+ 'spadesuit;': '\u2660',
+ 'spar;': '\u2225',
+ 'sqcap;': '\u2293',
+ 'sqcaps;': '\u2293\ufe00',
+ 'sqcup;': '\u2294',
+ 'sqcups;': '\u2294\ufe00',
+ 'Sqrt;': '\u221a',
+ 'sqsub;': '\u228f',
+ 'sqsube;': '\u2291',
+ 'sqsubset;': '\u228f',
+ 'sqsubseteq;': '\u2291',
+ 'sqsup;': '\u2290',
+ 'sqsupe;': '\u2292',
+ 'sqsupset;': '\u2290',
+ 'sqsupseteq;': '\u2292',
+ 'squ;': '\u25a1',
+ 'Square;': '\u25a1',
+ 'square;': '\u25a1',
+ 'SquareIntersection;': '\u2293',
+ 'SquareSubset;': '\u228f',
+ 'SquareSubsetEqual;': '\u2291',
+ 'SquareSuperset;': '\u2290',
+ 'SquareSupersetEqual;': '\u2292',
+ 'SquareUnion;': '\u2294',
+ 'squarf;': '\u25aa',
+ 'squf;': '\u25aa',
+ 'srarr;': '\u2192',
+ 'Sscr;': '\U0001d4ae',
+ 'sscr;': '\U0001d4c8',
+ 'ssetmn;': '\u2216',
+ 'ssmile;': '\u2323',
+ 'sstarf;': '\u22c6',
+ 'Star;': '\u22c6',
+ 'star;': '\u2606',
+ 'starf;': '\u2605',
+ 'straightepsilon;': '\u03f5',
+ 'straightphi;': '\u03d5',
+ 'strns;': '\xaf',
+ 'Sub;': '\u22d0',
+ 'sub;': '\u2282',
+ 'subdot;': '\u2abd',
+ 'subE;': '\u2ac5',
+ 'sube;': '\u2286',
+ 'subedot;': '\u2ac3',
+ 'submult;': '\u2ac1',
+ 'subnE;': '\u2acb',
+ 'subne;': '\u228a',
+ 'subplus;': '\u2abf',
+ 'subrarr;': '\u2979',
+ 'Subset;': '\u22d0',
+ 'subset;': '\u2282',
+ 'subseteq;': '\u2286',
+ 'subseteqq;': '\u2ac5',
+ 'SubsetEqual;': '\u2286',
+ 'subsetneq;': '\u228a',
+ 'subsetneqq;': '\u2acb',
+ 'subsim;': '\u2ac7',
+ 'subsub;': '\u2ad5',
+ 'subsup;': '\u2ad3',
+ 'succ;': '\u227b',
+ 'succapprox;': '\u2ab8',
+ 'succcurlyeq;': '\u227d',
+ 'Succeeds;': '\u227b',
+ 'SucceedsEqual;': '\u2ab0',
+ 'SucceedsSlantEqual;': '\u227d',
+ 'SucceedsTilde;': '\u227f',
+ 'succeq;': '\u2ab0',
+ 'succnapprox;': '\u2aba',
+ 'succneqq;': '\u2ab6',
+ 'succnsim;': '\u22e9',
+ 'succsim;': '\u227f',
+ 'SuchThat;': '\u220b',
+ 'Sum;': '\u2211',
+ 'sum;': '\u2211',
+ 'sung;': '\u266a',
+ 'sup1': '\xb9',
+ 'sup1;': '\xb9',
+ 'sup2': '\xb2',
+ 'sup2;': '\xb2',
+ 'sup3': '\xb3',
+ 'sup3;': '\xb3',
+ 'Sup;': '\u22d1',
+ 'sup;': '\u2283',
+ 'supdot;': '\u2abe',
+ 'supdsub;': '\u2ad8',
+ 'supE;': '\u2ac6',
+ 'supe;': '\u2287',
+ 'supedot;': '\u2ac4',
+ 'Superset;': '\u2283',
+ 'SupersetEqual;': '\u2287',
+ 'suphsol;': '\u27c9',
+ 'suphsub;': '\u2ad7',
+ 'suplarr;': '\u297b',
+ 'supmult;': '\u2ac2',
+ 'supnE;': '\u2acc',
+ 'supne;': '\u228b',
+ 'supplus;': '\u2ac0',
+ 'Supset;': '\u22d1',
+ 'supset;': '\u2283',
+ 'supseteq;': '\u2287',
+ 'supseteqq;': '\u2ac6',
+ 'supsetneq;': '\u228b',
+ 'supsetneqq;': '\u2acc',
+ 'supsim;': '\u2ac8',
+ 'supsub;': '\u2ad4',
+ 'supsup;': '\u2ad6',
+ 'swarhk;': '\u2926',
+ 'swArr;': '\u21d9',
+ 'swarr;': '\u2199',
+ 'swarrow;': '\u2199',
+ 'swnwar;': '\u292a',
+ 'szlig': '\xdf',
+ 'szlig;': '\xdf',
+ 'Tab;': '\t',
+ 'target;': '\u2316',
+ 'Tau;': '\u03a4',
+ 'tau;': '\u03c4',
+ 'tbrk;': '\u23b4',
+ 'Tcaron;': '\u0164',
+ 'tcaron;': '\u0165',
+ 'Tcedil;': '\u0162',
+ 'tcedil;': '\u0163',
+ 'Tcy;': '\u0422',
+ 'tcy;': '\u0442',
+ 'tdot;': '\u20db',
+ 'telrec;': '\u2315',
+ 'Tfr;': '\U0001d517',
+ 'tfr;': '\U0001d531',
+ 'there4;': '\u2234',
+ 'Therefore;': '\u2234',
+ 'therefore;': '\u2234',
+ 'Theta;': '\u0398',
+ 'theta;': '\u03b8',
+ 'thetasym;': '\u03d1',
+ 'thetav;': '\u03d1',
+ 'thickapprox;': '\u2248',
+ 'thicksim;': '\u223c',
+ 'ThickSpace;': '\u205f\u200a',
+ 'thinsp;': '\u2009',
+ 'ThinSpace;': '\u2009',
+ 'thkap;': '\u2248',
+ 'thksim;': '\u223c',
+ 'THORN': '\xde',
+ 'thorn': '\xfe',
+ 'THORN;': '\xde',
+ 'thorn;': '\xfe',
+ 'Tilde;': '\u223c',
+ 'tilde;': '\u02dc',
+ 'TildeEqual;': '\u2243',
+ 'TildeFullEqual;': '\u2245',
+ 'TildeTilde;': '\u2248',
+ 'times': '\xd7',
+ 'times;': '\xd7',
+ 'timesb;': '\u22a0',
+ 'timesbar;': '\u2a31',
+ 'timesd;': '\u2a30',
+ 'tint;': '\u222d',
+ 'toea;': '\u2928',
+ 'top;': '\u22a4',
+ 'topbot;': '\u2336',
+ 'topcir;': '\u2af1',
+ 'Topf;': '\U0001d54b',
+ 'topf;': '\U0001d565',
+ 'topfork;': '\u2ada',
+ 'tosa;': '\u2929',
+ 'tprime;': '\u2034',
+ 'TRADE;': '\u2122',
+ 'trade;': '\u2122',
+ 'triangle;': '\u25b5',
+ 'triangledown;': '\u25bf',
+ 'triangleleft;': '\u25c3',
+ 'trianglelefteq;': '\u22b4',
+ 'triangleq;': '\u225c',
+ 'triangleright;': '\u25b9',
+ 'trianglerighteq;': '\u22b5',
+ 'tridot;': '\u25ec',
+ 'trie;': '\u225c',
+ 'triminus;': '\u2a3a',
+ 'TripleDot;': '\u20db',
+ 'triplus;': '\u2a39',
+ 'trisb;': '\u29cd',
+ 'tritime;': '\u2a3b',
+ 'trpezium;': '\u23e2',
+ 'Tscr;': '\U0001d4af',
+ 'tscr;': '\U0001d4c9',
+ 'TScy;': '\u0426',
+ 'tscy;': '\u0446',
+ 'TSHcy;': '\u040b',
+ 'tshcy;': '\u045b',
+ 'Tstrok;': '\u0166',
+ 'tstrok;': '\u0167',
+ 'twixt;': '\u226c',
+ 'twoheadleftarrow;': '\u219e',
+ 'twoheadrightarrow;': '\u21a0',
+ 'Uacute': '\xda',
+ 'uacute': '\xfa',
+ 'Uacute;': '\xda',
+ 'uacute;': '\xfa',
+ 'Uarr;': '\u219f',
+ 'uArr;': '\u21d1',
+ 'uarr;': '\u2191',
+ 'Uarrocir;': '\u2949',
+ 'Ubrcy;': '\u040e',
+ 'ubrcy;': '\u045e',
+ 'Ubreve;': '\u016c',
+ 'ubreve;': '\u016d',
+ 'Ucirc': '\xdb',
+ 'ucirc': '\xfb',
+ 'Ucirc;': '\xdb',
+ 'ucirc;': '\xfb',
+ 'Ucy;': '\u0423',
+ 'ucy;': '\u0443',
+ 'udarr;': '\u21c5',
+ 'Udblac;': '\u0170',
+ 'udblac;': '\u0171',
+ 'udhar;': '\u296e',
+ 'ufisht;': '\u297e',
+ 'Ufr;': '\U0001d518',
+ 'ufr;': '\U0001d532',
+ 'Ugrave': '\xd9',
+ 'ugrave': '\xf9',
+ 'Ugrave;': '\xd9',
+ 'ugrave;': '\xf9',
+ 'uHar;': '\u2963',
+ 'uharl;': '\u21bf',
+ 'uharr;': '\u21be',
+ 'uhblk;': '\u2580',
+ 'ulcorn;': '\u231c',
+ 'ulcorner;': '\u231c',
+ 'ulcrop;': '\u230f',
+ 'ultri;': '\u25f8',
+ 'Umacr;': '\u016a',
+ 'umacr;': '\u016b',
+ 'uml': '\xa8',
+ 'uml;': '\xa8',
+ 'UnderBar;': '_',
+ 'UnderBrace;': '\u23df',
+ 'UnderBracket;': '\u23b5',
+ 'UnderParenthesis;': '\u23dd',
+ 'Union;': '\u22c3',
+ 'UnionPlus;': '\u228e',
+ 'Uogon;': '\u0172',
+ 'uogon;': '\u0173',
+ 'Uopf;': '\U0001d54c',
+ 'uopf;': '\U0001d566',
+ 'UpArrow;': '\u2191',
+ 'Uparrow;': '\u21d1',
+ 'uparrow;': '\u2191',
+ 'UpArrowBar;': '\u2912',
+ 'UpArrowDownArrow;': '\u21c5',
+ 'UpDownArrow;': '\u2195',
+ 'Updownarrow;': '\u21d5',
+ 'updownarrow;': '\u2195',
+ 'UpEquilibrium;': '\u296e',
+ 'upharpoonleft;': '\u21bf',
+ 'upharpoonright;': '\u21be',
+ 'uplus;': '\u228e',
+ 'UpperLeftArrow;': '\u2196',
+ 'UpperRightArrow;': '\u2197',
+ 'Upsi;': '\u03d2',
+ 'upsi;': '\u03c5',
+ 'upsih;': '\u03d2',
+ 'Upsilon;': '\u03a5',
+ 'upsilon;': '\u03c5',
+ 'UpTee;': '\u22a5',
+ 'UpTeeArrow;': '\u21a5',
+ 'upuparrows;': '\u21c8',
+ 'urcorn;': '\u231d',
+ 'urcorner;': '\u231d',
+ 'urcrop;': '\u230e',
+ 'Uring;': '\u016e',
+ 'uring;': '\u016f',
+ 'urtri;': '\u25f9',
+ 'Uscr;': '\U0001d4b0',
+ 'uscr;': '\U0001d4ca',
+ 'utdot;': '\u22f0',
+ 'Utilde;': '\u0168',
+ 'utilde;': '\u0169',
+ 'utri;': '\u25b5',
+ 'utrif;': '\u25b4',
+ 'uuarr;': '\u21c8',
+ 'Uuml': '\xdc',
+ 'uuml': '\xfc',
+ 'Uuml;': '\xdc',
+ 'uuml;': '\xfc',
+ 'uwangle;': '\u29a7',
+ 'vangrt;': '\u299c',
+ 'varepsilon;': '\u03f5',
+ 'varkappa;': '\u03f0',
+ 'varnothing;': '\u2205',
+ 'varphi;': '\u03d5',
+ 'varpi;': '\u03d6',
+ 'varpropto;': '\u221d',
+ 'vArr;': '\u21d5',
+ 'varr;': '\u2195',
+ 'varrho;': '\u03f1',
+ 'varsigma;': '\u03c2',
+ 'varsubsetneq;': '\u228a\ufe00',
+ 'varsubsetneqq;': '\u2acb\ufe00',
+ 'varsupsetneq;': '\u228b\ufe00',
+ 'varsupsetneqq;': '\u2acc\ufe00',
+ 'vartheta;': '\u03d1',
+ 'vartriangleleft;': '\u22b2',
+ 'vartriangleright;': '\u22b3',
+ 'Vbar;': '\u2aeb',
+ 'vBar;': '\u2ae8',
+ 'vBarv;': '\u2ae9',
+ 'Vcy;': '\u0412',
+ 'vcy;': '\u0432',
+ 'VDash;': '\u22ab',
+ 'Vdash;': '\u22a9',
+ 'vDash;': '\u22a8',
+ 'vdash;': '\u22a2',
+ 'Vdashl;': '\u2ae6',
+ 'Vee;': '\u22c1',
+ 'vee;': '\u2228',
+ 'veebar;': '\u22bb',
+ 'veeeq;': '\u225a',
+ 'vellip;': '\u22ee',
+ 'Verbar;': '\u2016',
+ 'verbar;': '|',
+ 'Vert;': '\u2016',
+ 'vert;': '|',
+ 'VerticalBar;': '\u2223',
+ 'VerticalLine;': '|',
+ 'VerticalSeparator;': '\u2758',
+ 'VerticalTilde;': '\u2240',
+ 'VeryThinSpace;': '\u200a',
+ 'Vfr;': '\U0001d519',
+ 'vfr;': '\U0001d533',
+ 'vltri;': '\u22b2',
+ 'vnsub;': '\u2282\u20d2',
+ 'vnsup;': '\u2283\u20d2',
+ 'Vopf;': '\U0001d54d',
+ 'vopf;': '\U0001d567',
+ 'vprop;': '\u221d',
+ 'vrtri;': '\u22b3',
+ 'Vscr;': '\U0001d4b1',
+ 'vscr;': '\U0001d4cb',
+ 'vsubnE;': '\u2acb\ufe00',
+ 'vsubne;': '\u228a\ufe00',
+ 'vsupnE;': '\u2acc\ufe00',
+ 'vsupne;': '\u228b\ufe00',
+ 'Vvdash;': '\u22aa',
+ 'vzigzag;': '\u299a',
+ 'Wcirc;': '\u0174',
+ 'wcirc;': '\u0175',
+ 'wedbar;': '\u2a5f',
+ 'Wedge;': '\u22c0',
+ 'wedge;': '\u2227',
+ 'wedgeq;': '\u2259',
+ 'weierp;': '\u2118',
+ 'Wfr;': '\U0001d51a',
+ 'wfr;': '\U0001d534',
+ 'Wopf;': '\U0001d54e',
+ 'wopf;': '\U0001d568',
+ 'wp;': '\u2118',
+ 'wr;': '\u2240',
+ 'wreath;': '\u2240',
+ 'Wscr;': '\U0001d4b2',
+ 'wscr;': '\U0001d4cc',
+ 'xcap;': '\u22c2',
+ 'xcirc;': '\u25ef',
+ 'xcup;': '\u22c3',
+ 'xdtri;': '\u25bd',
+ 'Xfr;': '\U0001d51b',
+ 'xfr;': '\U0001d535',
+ 'xhArr;': '\u27fa',
+ 'xharr;': '\u27f7',
+ 'Xi;': '\u039e',
+ 'xi;': '\u03be',
+ 'xlArr;': '\u27f8',
+ 'xlarr;': '\u27f5',
+ 'xmap;': '\u27fc',
+ 'xnis;': '\u22fb',
+ 'xodot;': '\u2a00',
+ 'Xopf;': '\U0001d54f',
+ 'xopf;': '\U0001d569',
+ 'xoplus;': '\u2a01',
+ 'xotime;': '\u2a02',
+ 'xrArr;': '\u27f9',
+ 'xrarr;': '\u27f6',
+ 'Xscr;': '\U0001d4b3',
+ 'xscr;': '\U0001d4cd',
+ 'xsqcup;': '\u2a06',
+ 'xuplus;': '\u2a04',
+ 'xutri;': '\u25b3',
+ 'xvee;': '\u22c1',
+ 'xwedge;': '\u22c0',
+ 'Yacute': '\xdd',
+ 'yacute': '\xfd',
+ 'Yacute;': '\xdd',
+ 'yacute;': '\xfd',
+ 'YAcy;': '\u042f',
+ 'yacy;': '\u044f',
+ 'Ycirc;': '\u0176',
+ 'ycirc;': '\u0177',
+ 'Ycy;': '\u042b',
+ 'ycy;': '\u044b',
+ 'yen': '\xa5',
+ 'yen;': '\xa5',
+ 'Yfr;': '\U0001d51c',
+ 'yfr;': '\U0001d536',
+ 'YIcy;': '\u0407',
+ 'yicy;': '\u0457',
+ 'Yopf;': '\U0001d550',
+ 'yopf;': '\U0001d56a',
+ 'Yscr;': '\U0001d4b4',
+ 'yscr;': '\U0001d4ce',
+ 'YUcy;': '\u042e',
+ 'yucy;': '\u044e',
+ 'yuml': '\xff',
+ 'Yuml;': '\u0178',
+ 'yuml;': '\xff',
+ 'Zacute;': '\u0179',
+ 'zacute;': '\u017a',
+ 'Zcaron;': '\u017d',
+ 'zcaron;': '\u017e',
+ 'Zcy;': '\u0417',
+ 'zcy;': '\u0437',
+ 'Zdot;': '\u017b',
+ 'zdot;': '\u017c',
+ 'zeetrf;': '\u2128',
+ 'ZeroWidthSpace;': '\u200b',
+ 'Zeta;': '\u0396',
+ 'zeta;': '\u03b6',
+ 'Zfr;': '\u2128',
+ 'zfr;': '\U0001d537',
+ 'ZHcy;': '\u0416',
+ 'zhcy;': '\u0436',
+ 'zigrarr;': '\u21dd',
+ 'Zopf;': '\u2124',
+ 'zopf;': '\U0001d56b',
+ 'Zscr;': '\U0001d4b5',
+ 'zscr;': '\U0001d4cf',
+ 'zwj;': '\u200d',
+ 'zwnj;': '\u200c',
+ }
+
try:
import http.client as compat_http_client
except ImportError: # Python 2
@@ -77,6 +2316,10 @@ try:
except ImportError: # Python 2
from urllib import urlretrieve as compat_urlretrieve
+try:
+ from html.parser import HTMLParser as compat_HTMLParser
+except ImportError: # Python 2
+ from HTMLParser import HTMLParser as compat_HTMLParser
try:
from subprocess import DEVNULL
@@ -165,6 +2408,32 @@ except ImportError: # Python 2
return compat_urllib_parse_unquote(string, encoding, errors)
try:
+ from urllib.parse import urlencode as compat_urllib_parse_urlencode
+except ImportError: # Python 2
+ # Python 2 will choke in urlencode on mixture of byte and unicode strings.
+ # Possible solutions are to either port it from python 3 with all
+ # the friends or manually ensure input query contains only byte strings.
+ # We will stick with latter thus recursively encoding the whole query.
+ def compat_urllib_parse_urlencode(query, doseq=0, encoding='utf-8'):
+ def encode_elem(e):
+ if isinstance(e, dict):
+ e = encode_dict(e)
+ elif isinstance(e, (list, tuple,)):
+ list_e = encode_list(e)
+ e = tuple(list_e) if isinstance(e, tuple) else list_e
+ elif isinstance(e, compat_str):
+ e = e.encode(encoding)
+ return e
+
+ def encode_dict(d):
+ return dict((encode_elem(k), encode_elem(v)) for k, v in d.items())
+
+ def encode_list(l):
+ return [encode_elem(e) for e in l]
+
+ return compat_urllib_parse.urlencode(encode_elem(query), doseq=doseq)
+
+try:
from urllib.request import DataHandler as compat_urllib_request_DataHandler
except ImportError: # Python < 3.4
# Ported from CPython 98774:1733b3bd46db, Lib/urllib/request.py
@@ -213,13 +2482,20 @@ try:
except ImportError: # Python 2.6
from xml.parsers.expat import ExpatError as compat_xml_parse_error
+
+etree = xml.etree.ElementTree
+
+
+class _TreeBuilder(etree.TreeBuilder):
+ def doctype(self, name, pubid, system):
+ pass
+
if sys.version_info[0] >= 3:
- compat_etree_fromstring = xml.etree.ElementTree.fromstring
+ def compat_etree_fromstring(text):
+ return etree.XML(text, parser=etree.XMLParser(target=_TreeBuilder()))
else:
# python 2.x tries to encode unicode strings with ascii (see the
# XMLParser._fixtext method)
- etree = xml.etree.ElementTree
-
try:
_etree_iter = etree.Element.iter
except AttributeError: # Python <=2.6
@@ -233,7 +2509,7 @@ else:
# 2.7 source
def _XML(text, parser=None):
if not parser:
- parser = etree.XMLParser(target=etree.TreeBuilder())
+ parser = etree.XMLParser(target=_TreeBuilder())
parser.feed(text)
return parser.close()
@@ -245,12 +2521,22 @@ else:
return el
def compat_etree_fromstring(text):
- doc = _XML(text, parser=etree.XMLParser(target=etree.TreeBuilder(element_factory=_element_factory)))
+ doc = _XML(text, parser=etree.XMLParser(target=_TreeBuilder(element_factory=_element_factory)))
for el in _etree_iter(doc):
if el.text is not None and isinstance(el.text, bytes):
el.text = el.text.decode('utf-8')
return doc
+if sys.version_info < (2, 7):
+ # Here comes the crazy part: In 2.6, if the xpath is a unicode,
+ # .//node does not match if a node is a direct child of . !
+ def compat_xpath(xpath):
+ if isinstance(xpath, compat_str):
+ xpath = xpath.encode('ascii')
+ return xpath
+else:
+ compat_xpath = lambda xpath: xpath
+
try:
from urllib.parse import parse_qs as compat_parse_qs
except ImportError: # Python 2
@@ -299,9 +2585,9 @@ except ImportError: # Python 2
return parsed_result
try:
- from shlex import quote as shlex_quote
+ from shlex import quote as compat_shlex_quote
except ImportError: # Python < 3.3
- def shlex_quote(s):
+ def compat_shlex_quote(s):
if re.match(r'^[-_\w./]+$', s):
return s
else:
@@ -326,9 +2612,15 @@ def compat_ord(c):
return ord(c)
+compat_os_name = os._name if os.name == 'java' else os.name
+
+
if sys.version_info >= (3, 0):
compat_getenv = os.getenv
compat_expanduser = os.path.expanduser
+
+ def compat_setenv(key, value, env=os.environ):
+ env[key] = value
else:
# Environment variables should be decoded with filesystem encoding.
# Otherwise it will fail if any non-ASCII characters present (see #3854 #3217 #2918)
@@ -340,13 +2632,19 @@ else:
env = env.decode(get_filesystem_encoding())
return env
+ def compat_setenv(key, value, env=os.environ):
+ def encode(v):
+ from .utils import get_filesystem_encoding
+ return v.encode(get_filesystem_encoding()) if isinstance(v, compat_str) else v
+ env[encode(key)] = encode(value)
+
# HACK: The default implementations of os.path.expanduser from cpython do not decode
# environment variables with filesystem encoding. We will work around this by
# providing adjusted implementations.
# The following are os.path.expanduser implementations from cpython 2.7.8 stdlib
# for different platforms with correct environment variables decoding.
- if os.name == 'posix':
+ if compat_os_name == 'posix':
def compat_expanduser(path):
"""Expand ~ and ~user constructions. If user or $HOME is unknown,
do nothing."""
@@ -370,7 +2668,7 @@ else:
userhome = pwent.pw_dir
userhome = userhome.rstrip('/')
return (userhome + path[i:]) or '/'
- elif os.name == 'nt' or os.name == 'ce':
+ elif compat_os_name == 'nt' or compat_os_name == 'ce':
def compat_expanduser(path):
"""Expand ~ and ~user constructs.
@@ -412,18 +2710,6 @@ else:
print(s)
-try:
- subprocess_check_output = subprocess.check_output
-except AttributeError:
- def subprocess_check_output(*args, **kwargs):
- assert 'input' not in kwargs
- p = subprocess.Popen(*args, stdout=subprocess.PIPE, **kwargs)
- output, _ = p.communicate()
- ret = p.poll()
- if ret:
- raise subprocess.CalledProcessError(ret, p.args, output=output)
- return output
-
if sys.version_info < (3, 0) and sys.platform == 'win32':
def compat_getpass(prompt, *args, **kwargs):
if isinstance(prompt, compat_str):
@@ -433,6 +2719,11 @@ if sys.version_info < (3, 0) and sys.platform == 'win32':
else:
compat_getpass = getpass.getpass
+try:
+ compat_input = raw_input
+except NameError: # Python 3
+ compat_input = input
+
# Python < 2.6.5 require kwargs to be bytes
try:
def _testfunc(x):
@@ -539,7 +2830,28 @@ if sys.version_info >= (3, 0):
else:
from tokenize import generate_tokens as compat_tokenize_tokenize
+
+try:
+ struct.pack('!I', 0)
+except TypeError:
+ # In Python 2.6 and 2.7.x < 2.7.7, struct requires a bytes argument
+ # See https://bugs.python.org/issue19099
+ def compat_struct_pack(spec, *args):
+ if isinstance(spec, compat_str):
+ spec = spec.encode('ascii')
+ return struct.pack(spec, *args)
+
+ def compat_struct_unpack(spec, *args):
+ if isinstance(spec, compat_str):
+ spec = spec.encode('ascii')
+ return struct.unpack(spec, *args)
+else:
+ compat_struct_pack = struct.pack
+ compat_struct_unpack = struct.unpack
+
+
__all__ = [
+ 'compat_HTMLParser',
'compat_HTTPError',
'compat_basestring',
'compat_chr',
@@ -551,16 +2863,23 @@ __all__ = [
'compat_getenv',
'compat_getpass',
'compat_html_entities',
+ 'compat_html_entities_html5',
'compat_http_client',
'compat_http_server',
+ 'compat_input',
'compat_itertools_count',
'compat_kwargs',
'compat_ord',
+ 'compat_os_name',
'compat_parse_qs',
'compat_print',
+ 'compat_setenv',
+ 'compat_shlex_quote',
'compat_shlex_split',
'compat_socket_create_connection',
'compat_str',
+ 'compat_struct_pack',
+ 'compat_struct_unpack',
'compat_subprocess_get_DEVNULL',
'compat_tokenize_tokenize',
'compat_urllib_error',
@@ -568,6 +2887,7 @@ __all__ = [
'compat_urllib_parse_unquote',
'compat_urllib_parse_unquote_plus',
'compat_urllib_parse_unquote_to_bytes',
+ 'compat_urllib_parse_urlencode',
'compat_urllib_parse_urlparse',
'compat_urllib_request',
'compat_urllib_request_DataHandler',
@@ -575,7 +2895,6 @@ __all__ = [
'compat_urlparse',
'compat_urlretrieve',
'compat_xml_parse_error',
- 'shlex_quote',
- 'subprocess_check_output',
+ 'compat_xpath',
'workaround_optparse_bug9161',
]
diff --git a/youtube_dl/downloader/__init__.py b/youtube_dl/downloader/__init__.py
index dccc592..817591d 100644
--- a/youtube_dl/downloader/__init__.py
+++ b/youtube_dl/downloader/__init__.py
@@ -1,14 +1,16 @@
from __future__ import unicode_literals
from .common import FileDownloader
-from .external import get_external_downloader
from .f4m import F4mFD
from .hls import HlsFD
-from .hls import NativeHlsFD
from .http import HttpFD
-from .rtsp import RtspFD
from .rtmp import RtmpFD
from .dash import DashSegmentsFD
+from .rtsp import RtspFD
+from .external import (
+ get_external_downloader,
+ FFmpegFD,
+)
from ..utils import (
determine_protocol,
@@ -16,8 +18,8 @@ from ..utils import (
PROTOCOL_MAP = {
'rtmp': RtmpFD,
- 'm3u8_native': NativeHlsFD,
- 'm3u8': HlsFD,
+ 'm3u8_native': HlsFD,
+ 'm3u8': FFmpegFD,
'mms': RtspFD,
'rtsp': RtspFD,
'f4m': F4mFD,
@@ -30,14 +32,20 @@ def get_suitable_downloader(info_dict, params={}):
protocol = determine_protocol(info_dict)
info_dict['protocol'] = protocol
+ # if (info_dict.get('start_time') or info_dict.get('end_time')) and not info_dict.get('requested_formats') and FFmpegFD.can_download(info_dict):
+ # return FFmpegFD
+
external_downloader = params.get('external_downloader')
if external_downloader is not None:
ed = get_external_downloader(external_downloader)
- if ed.supports(info_dict):
+ if ed.can_download(info_dict):
return ed
- if protocol == 'm3u8' and params.get('hls_prefer_native'):
- return NativeHlsFD
+ if protocol == 'm3u8' and params.get('hls_prefer_native') is True:
+ return HlsFD
+
+ if protocol == 'm3u8_native' and params.get('hls_prefer_native') is False:
+ return FFmpegFD
return PROTOCOL_MAP.get(protocol, HttpFD)
diff --git a/youtube_dl/downloader/common.py b/youtube_dl/downloader/common.py
index 2d51540..1dba9f4 100644
--- a/youtube_dl/downloader/common.py
+++ b/youtube_dl/downloader/common.py
@@ -5,6 +5,7 @@ import re
import sys
import time
+from ..compat import compat_os_name
from ..utils import (
encodeFilename,
error_to_compat_str,
@@ -115,6 +116,10 @@ class FileDownloader(object):
return '%10s' % ('%s/s' % format_bytes(speed))
@staticmethod
+ def format_retries(retries):
+ return 'inf' if retries == float('inf') else '%.0f' % retries
+
+ @staticmethod
def best_block_size(elapsed_time, bytes):
new_min = max(bytes / 2.0, 1.0)
new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
@@ -219,7 +224,7 @@ class FileDownloader(object):
if self.params.get('progress_with_newline', False):
self.to_screen(fullmsg)
else:
- if os.name == 'nt':
+ if compat_os_name == 'nt':
prev_len = getattr(self, '_report_progress_prev_line_length',
0)
if prev_len > len(fullmsg):
@@ -296,7 +301,9 @@ class FileDownloader(object):
def report_retry(self, count, retries):
"""Report retry in case of HTTP error 5xx"""
- self.to_screen('[download] Got server HTTP error. Retrying (attempt %d of %.0f)...' % (count, retries))
+ self.to_screen(
+ '[download] Got server HTTP error. Retrying (attempt %d of %s)...'
+ % (count, self.format_retries(retries)))
def report_file_already_downloaded(self, file_name):
"""Report file has already been fully downloaded."""
diff --git a/youtube_dl/downloader/dash.py b/youtube_dl/downloader/dash.py
index 8b1b17c..8bbab9d 100644
--- a/youtube_dl/downloader/dash.py
+++ b/youtube_dl/downloader/dash.py
@@ -4,6 +4,7 @@ import os
import re
from .fragment import FragmentFD
+from ..compat import compat_urllib_error
from ..utils import (
sanitize_open,
encodeFilename,
@@ -36,20 +37,41 @@ class DashSegmentsFD(FragmentFD):
segments_filenames = []
- def append_url_to_file(target_url, target_filename):
- success = ctx['dl'].download(target_filename, {'url': combine_url(base_url, target_url)})
- if not success:
+ fragment_retries = self.params.get('fragment_retries', 0)
+
+ def append_url_to_file(target_url, tmp_filename, segment_name):
+ target_filename = '%s-%s' % (tmp_filename, segment_name)
+ count = 0
+ while count <= fragment_retries:
+ try:
+ success = ctx['dl'].download(target_filename, {'url': combine_url(base_url, target_url)})
+ if not success:
+ return False
+ down, target_sanitized = sanitize_open(target_filename, 'rb')
+ ctx['dest_stream'].write(down.read())
+ down.close()
+ segments_filenames.append(target_sanitized)
+ break
+ except (compat_urllib_error.HTTPError, ) as err:
+ # YouTube may often return 404 HTTP error for a fragment causing the
+ # whole download to fail. However if the same fragment is immediately
+ # retried with the same request data this usually succeeds (1-2 attemps
+ # is usually enough) thus allowing to download the whole file successfully.
+ # So, we will retry all fragments that fail with 404 HTTP error for now.
+ if err.code != 404:
+ raise
+ # Retry fragment
+ count += 1
+ if count <= fragment_retries:
+ self.report_retry_fragment(segment_name, count, fragment_retries)
+ if count > fragment_retries:
+ self.report_error('giving up after %s fragment retries' % fragment_retries)
return False
- down, target_sanitized = sanitize_open(target_filename, 'rb')
- ctx['dest_stream'].write(down.read())
- down.close()
- segments_filenames.append(target_sanitized)
if initialization_url:
- append_url_to_file(initialization_url, ctx['tmpfilename'] + '-Init')
+ append_url_to_file(initialization_url, ctx['tmpfilename'], 'Init')
for i, segment_url in enumerate(segment_urls):
- segment_filename = '%s-Seg%d' % (ctx['tmpfilename'], i)
- append_url_to_file(segment_url, segment_filename)
+ append_url_to_file(segment_url, ctx['tmpfilename'], 'Seg%d' % i)
self._finish_frag_download(ctx)
diff --git a/youtube_dl/downloader/external.py b/youtube_dl/downloader/external.py
index 2bc0112..fae2450 100644
--- a/youtube_dl/downloader/external.py
+++ b/youtube_dl/downloader/external.py
@@ -2,8 +2,12 @@ from __future__ import unicode_literals
import os.path
import subprocess
+import sys
+import re
from .common import FileDownloader
+from ..compat import compat_setenv
+from ..postprocessor.ffmpeg import FFmpegPostProcessor, EXT_TO_OUT_FORMATS
from ..utils import (
cli_option,
cli_valueless_option,
@@ -11,6 +15,8 @@ from ..utils import (
cli_configuration_args,
encodeFilename,
encodeArgument,
+ handle_youtubedl_headers,
+ check_executable,
)
@@ -46,9 +52,17 @@ class ExternalFD(FileDownloader):
return self.params.get('external_downloader')
@classmethod
+ def available(cls):
+ return check_executable(cls.get_basename(), [cls.AVAILABLE_OPT])
+
+ @classmethod
def supports(cls, info_dict):
return info_dict['protocol'] in ('http', 'https', 'ftp', 'ftps')
+ @classmethod
+ def can_download(cls, info_dict):
+ return cls.available() and cls.supports(info_dict)
+
def _option(self, command_option, param):
return cli_option(self.params, command_option, param)
@@ -71,11 +85,13 @@ class ExternalFD(FileDownloader):
cmd, stderr=subprocess.PIPE)
_, stderr = p.communicate()
if p.returncode != 0:
- self.to_stderr(stderr)
+ self.to_stderr(stderr.decode('utf-8', 'replace'))
return p.returncode
class CurlFD(ExternalFD):
+ AVAILABLE_OPT = '-V'
+
def _make_cmd(self, tmpfilename, info_dict):
cmd = [self.exe, '--location', '-o', tmpfilename]
for key, val in info_dict['http_headers'].items():
@@ -89,6 +105,8 @@ class CurlFD(ExternalFD):
class AxelFD(ExternalFD):
+ AVAILABLE_OPT = '-V'
+
def _make_cmd(self, tmpfilename, info_dict):
cmd = [self.exe, '-o', tmpfilename]
for key, val in info_dict['http_headers'].items():
@@ -99,6 +117,8 @@ class AxelFD(ExternalFD):
class WgetFD(ExternalFD):
+ AVAILABLE_OPT = '--version'
+
def _make_cmd(self, tmpfilename, info_dict):
cmd = [self.exe, '-O', tmpfilename, '-nv', '--no-cookies']
for key, val in info_dict['http_headers'].items():
@@ -112,6 +132,8 @@ class WgetFD(ExternalFD):
class Aria2cFD(ExternalFD):
+ AVAILABLE_OPT = '-v'
+
def _make_cmd(self, tmpfilename, info_dict):
cmd = [self.exe, '-c']
cmd += self._configuration_args([
@@ -130,12 +152,125 @@ class Aria2cFD(ExternalFD):
class HttpieFD(ExternalFD):
+ @classmethod
+ def available(cls):
+ return check_executable('http', ['--version'])
+
def _make_cmd(self, tmpfilename, info_dict):
cmd = ['http', '--download', '--output', tmpfilename, info_dict['url']]
for key, val in info_dict['http_headers'].items():
cmd += ['%s:%s' % (key, val)]
return cmd
+
+class FFmpegFD(ExternalFD):
+ @classmethod
+ def supports(cls, info_dict):
+ return info_dict['protocol'] in ('http', 'https', 'ftp', 'ftps', 'm3u8', 'rtsp', 'rtmp', 'mms')
+
+ @classmethod
+ def available(cls):
+ return FFmpegPostProcessor().available
+
+ def _call_downloader(self, tmpfilename, info_dict):
+ url = info_dict['url']
+ ffpp = FFmpegPostProcessor(downloader=self)
+ if not ffpp.available:
+ self.report_error('m3u8 download detected but ffmpeg or avconv could not be found. Please install one.')
+ return False
+ ffpp.check_version()
+
+ args = [ffpp.executable, '-y']
+
+ args += self._configuration_args()
+
+ # start_time = info_dict.get('start_time') or 0
+ # if start_time:
+ # args += ['-ss', compat_str(start_time)]
+ # end_time = info_dict.get('end_time')
+ # if end_time:
+ # args += ['-t', compat_str(end_time - start_time)]
+
+ if info_dict['http_headers'] and re.match(r'^https?://', url):
+ # Trailing \r\n after each HTTP header is important to prevent warning from ffmpeg/avconv:
+ # [http @ 00000000003d2fa0] No trailing CRLF found in HTTP header.
+ headers = handle_youtubedl_headers(info_dict['http_headers'])
+ args += [
+ '-headers',
+ ''.join('%s: %s\r\n' % (key, val) for key, val in headers.items())]
+
+ env = None
+ proxy = self.params.get('proxy')
+ if proxy:
+ if not re.match(r'^[\da-zA-Z]+://', proxy):
+ proxy = 'http://%s' % proxy
+ # Since December 2015 ffmpeg supports -http_proxy option (see
+ # http://git.videolan.org/?p=ffmpeg.git;a=commit;h=b4eb1f29ebddd60c41a2eb39f5af701e38e0d3fd)
+ # We could switch to the following code if we are able to detect version properly
+ # args += ['-http_proxy', proxy]
+ env = os.environ.copy()
+ compat_setenv('HTTP_PROXY', proxy, env=env)
+ compat_setenv('http_proxy', proxy, env=env)
+
+ protocol = info_dict.get('protocol')
+
+ if protocol == 'rtmp':
+ player_url = info_dict.get('player_url')
+ page_url = info_dict.get('page_url')
+ app = info_dict.get('app')
+ play_path = info_dict.get('play_path')
+ tc_url = info_dict.get('tc_url')
+ flash_version = info_dict.get('flash_version')
+ live = info_dict.get('rtmp_live', False)
+ if player_url is not None:
+ args += ['-rtmp_swfverify', player_url]
+ if page_url is not None:
+ args += ['-rtmp_pageurl', page_url]
+ if app is not None:
+ args += ['-rtmp_app', app]
+ if play_path is not None:
+ args += ['-rtmp_playpath', play_path]
+ if tc_url is not None:
+ args += ['-rtmp_tcurl', tc_url]
+ if flash_version is not None:
+ args += ['-rtmp_flashver', flash_version]
+ if live:
+ args += ['-rtmp_live', 'live']
+
+ args += ['-i', url, '-c', 'copy']
+ if protocol in ('m3u8', 'm3u8_native'):
+ if self.params.get('hls_use_mpegts', False) or tmpfilename == '-':
+ args += ['-f', 'mpegts']
+ else:
+ args += ['-f', 'mp4', '-bsf:a', 'aac_adtstoasc']
+ elif protocol == 'rtmp':
+ args += ['-f', 'flv']
+ else:
+ args += ['-f', EXT_TO_OUT_FORMATS.get(info_dict['ext'], info_dict['ext'])]
+
+ args = [encodeArgument(opt) for opt in args]
+ args.append(encodeFilename(ffpp._ffmpeg_filename_argument(tmpfilename), True))
+
+ self._debug_cmd(args)
+
+ proc = subprocess.Popen(args, stdin=subprocess.PIPE, env=env)
+ try:
+ retval = proc.wait()
+ except KeyboardInterrupt:
+ # subprocces.run would send the SIGKILL signal to ffmpeg and the
+ # mp4 file couldn't be played, but if we ask ffmpeg to quit it
+ # produces a file that is playable (this is mostly useful for live
+ # streams). Note that Windows is not affected and produces playable
+ # files (see https://github.com/rg3/youtube-dl/issues/8300).
+ if sys.platform != 'win32':
+ proc.communicate(b'q')
+ raise
+ return retval
+
+
+class AVconvFD(FFmpegFD):
+ pass
+
_BY_NAME = dict(
(klass.get_basename(), klass)
for name, klass in globals().items()
diff --git a/youtube_dl/downloader/f4m.py b/youtube_dl/downloader/f4m.py
index fc96429..8f88b02 100644
--- a/youtube_dl/downloader/f4m.py
+++ b/youtube_dl/downloader/f4m.py
@@ -12,37 +12,49 @@ from ..compat import (
compat_urlparse,
compat_urllib_error,
compat_urllib_parse_urlparse,
+ compat_struct_pack,
+ compat_struct_unpack,
)
from ..utils import (
encodeFilename,
fix_xml_ampersands,
sanitize_open,
- struct_pack,
- struct_unpack,
xpath_text,
)
+class DataTruncatedError(Exception):
+ pass
+
+
class FlvReader(io.BytesIO):
"""
Reader for Flv files
The file format is documented in https://www.adobe.com/devnet/f4v.html
"""
+ def read_bytes(self, n):
+ data = self.read(n)
+ if len(data) < n:
+ raise DataTruncatedError(
+ 'FlvReader error: need %d bytes while only %d bytes got' % (
+ n, len(data)))
+ return data
+
# Utility functions for reading numbers and strings
def read_unsigned_long_long(self):
- return struct_unpack('!Q', self.read(8))[0]
+ return compat_struct_unpack('!Q', self.read_bytes(8))[0]
def read_unsigned_int(self):
- return struct_unpack('!I', self.read(4))[0]
+ return compat_struct_unpack('!I', self.read_bytes(4))[0]
def read_unsigned_char(self):
- return struct_unpack('!B', self.read(1))[0]
+ return compat_struct_unpack('!B', self.read_bytes(1))[0]
def read_string(self):
res = b''
while True:
- char = self.read(1)
+ char = self.read_bytes(1)
if char == b'\x00':
break
res += char
@@ -53,18 +65,18 @@ class FlvReader(io.BytesIO):
Read a box and return the info as a tuple: (box_size, box_type, box_data)
"""
real_size = size = self.read_unsigned_int()
- box_type = self.read(4)
+ box_type = self.read_bytes(4)
header_end = 8
if size == 1:
real_size = self.read_unsigned_long_long()
header_end = 16
- return real_size, box_type, self.read(real_size - header_end)
+ return real_size, box_type, self.read_bytes(real_size - header_end)
def read_asrt(self):
# version
self.read_unsigned_char()
# flags
- self.read(3)
+ self.read_bytes(3)
quality_entry_count = self.read_unsigned_char()
# QualityEntryCount
for i in range(quality_entry_count):
@@ -85,7 +97,7 @@ class FlvReader(io.BytesIO):
# version
self.read_unsigned_char()
# flags
- self.read(3)
+ self.read_bytes(3)
# time scale
self.read_unsigned_int()
@@ -119,7 +131,7 @@ class FlvReader(io.BytesIO):
# version
self.read_unsigned_char()
# flags
- self.read(3)
+ self.read_bytes(3)
self.read_unsigned_int() # BootstrapinfoVersion
# Profile,Live,Update,Reserved
@@ -194,11 +206,11 @@ def build_fragments_list(boot_info):
def write_unsigned_int(stream, val):
- stream.write(struct_pack('!I', val))
+ stream.write(compat_struct_pack('!I', val))
def write_unsigned_int_24(stream, val):
- stream.write(struct_pack('!I', val)[1:])
+ stream.write(compat_struct_pack('!I', val)[1:])
def write_flv_header(stream):
@@ -223,6 +235,12 @@ def write_metadata_tag(stream, metadata):
write_unsigned_int(stream, FLV_TAG_HEADER_LEN + len(metadata))
+def remove_encrypted_media(media):
+ return list(filter(lambda e: 'drmAdditionalHeaderId' not in e.attrib and
+ 'drmAdditionalHeaderSetId' not in e.attrib,
+ media))
+
+
def _add_ns(prop):
return '{http://ns.adobe.com/f4m/1.0}%s' % prop
@@ -244,9 +262,7 @@ class F4mFD(FragmentFD):
# without drmAdditionalHeaderId or drmAdditionalHeaderSetId attribute
if 'id' not in e.attrib:
self.report_error('Missing ID in f4m DRM')
- media = list(filter(lambda e: 'drmAdditionalHeaderId' not in e.attrib and
- 'drmAdditionalHeaderSetId' not in e.attrib,
- media))
+ media = remove_encrypted_media(media)
if not media:
self.report_error('Unsupported DRM')
return media
@@ -303,7 +319,7 @@ class F4mFD(FragmentFD):
doc = compat_etree_fromstring(manifest)
formats = [(int(f.attrib.get('bitrate', -1)), f)
for f in self._get_unencrypted_media(doc)]
- if requested_bitrate is None:
+ if requested_bitrate is None or len(formats) == 1:
# get the best format
formats = sorted(formats, key=lambda f: f[0])
rate, media = formats[-1]
@@ -370,7 +386,17 @@ class F4mFD(FragmentFD):
down.close()
reader = FlvReader(down_data)
while True:
- _, box_type, box_data = reader.read_box_info()
+ try:
+ _, box_type, box_data = reader.read_box_info()
+ except DataTruncatedError:
+ if test:
+ # In tests, segments may be truncated, and thus
+ # FlvReader may not be able to parse the whole
+ # chunk. If so, write the segment as is
+ # See https://github.com/rg3/youtube-dl/issues/9214
+ dest_stream.write(down_data)
+ break
+ raise
if box_type == b'mdat':
dest_stream.write(box_data)
break
diff --git a/youtube_dl/downloader/fragment.py b/youtube_dl/downloader/fragment.py
index 5bc9949..ba903ae 100644
--- a/youtube_dl/downloader/fragment.py
+++ b/youtube_dl/downloader/fragment.py
@@ -19,8 +19,17 @@ class HttpQuietDownloader(HttpFD):
class FragmentFD(FileDownloader):
"""
A base file downloader class for fragmented media (e.g. f4m/m3u8 manifests).
+
+ Available options:
+
+ fragment_retries: Number of times to retry a fragment for HTTP error (DASH only)
"""
+ def report_retry_fragment(self, fragment_name, count, retries):
+ self.to_screen(
+ '[download] Got server HTTP error. Retrying fragment %s (attempt %d of %s)...'
+ % (fragment_name, count, self.format_retries(retries)))
+
def _prepare_and_start_frag_download(self, ctx):
self._prepare_frag_download(ctx)
self._start_frag_download(ctx)
@@ -99,7 +108,8 @@ class FragmentFD(FileDownloader):
state['eta'] = self.calc_eta(
start, time_now, estimated_size,
state['downloaded_bytes'])
- state['speed'] = s.get('speed')
+ state['speed'] = s.get('speed') or ctx.get('speed')
+ ctx['speed'] = state['speed']
ctx['prev_frag_downloaded_bytes'] = frag_downloaded_bytes
self._hook_progress(state)
diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py
index 2a775bf..3b7bb35 100644
--- a/youtube_dl/downloader/hls.py
+++ b/youtube_dl/downloader/hls.py
@@ -1,126 +1,128 @@
from __future__ import unicode_literals
-import os
+import os.path
import re
-import subprocess
-import sys
+import binascii
+try:
+ from Crypto.Cipher import AES
+ can_decrypt_frag = True
+except ImportError:
+ can_decrypt_frag = False
-from .common import FileDownloader
from .fragment import FragmentFD
+from .external import FFmpegFD
-from ..compat import compat_urlparse
-from ..postprocessor.ffmpeg import FFmpegPostProcessor
+from ..compat import (
+ compat_urlparse,
+ compat_struct_pack,
+)
from ..utils import (
- encodeArgument,
encodeFilename,
sanitize_open,
- handle_youtubedl_headers,
+ parse_m3u8_attributes,
)
-class HlsFD(FileDownloader):
- def real_download(self, filename, info_dict):
- url = info_dict['url']
- self.report_destination(filename)
- tmpfilename = self.temp_name(filename)
-
- ffpp = FFmpegPostProcessor(downloader=self)
- if not ffpp.available:
- self.report_error('m3u8 download detected but ffmpeg or avconv could not be found. Please install one.')
- return False
- ffpp.check_version()
-
- args = [ffpp.executable, '-y']
-
- if info_dict['http_headers'] and re.match(r'^https?://', url):
- # Trailing \r\n after each HTTP header is important to prevent warning from ffmpeg/avconv:
- # [http @ 00000000003d2fa0] No trailing CRLF found in HTTP header.
- headers = handle_youtubedl_headers(info_dict['http_headers'])
- args += [
- '-headers',
- ''.join('%s: %s\r\n' % (key, val) for key, val in headers.items())]
-
- args += ['-i', url, '-c', 'copy']
- if self.params.get('hls_use_mpegts', False):
- args += ['-f', 'mpegts']
- else:
- args += ['-f', 'mp4', '-bsf:a', 'aac_adtstoasc']
-
- args = [encodeArgument(opt) for opt in args]
- args.append(encodeFilename(ffpp._ffmpeg_filename_argument(tmpfilename), True))
-
- self._debug_cmd(args)
-
- proc = subprocess.Popen(args, stdin=subprocess.PIPE)
- try:
- retval = proc.wait()
- except KeyboardInterrupt:
- # subprocces.run would send the SIGKILL signal to ffmpeg and the
- # mp4 file couldn't be played, but if we ask ffmpeg to quit it
- # produces a file that is playable (this is mostly useful for live
- # streams). Note that Windows is not affected and produces playable
- # files (see https://github.com/rg3/youtube-dl/issues/8300).
- if sys.platform != 'win32':
- proc.communicate(b'q')
- raise
- if retval == 0:
- fsize = os.path.getsize(encodeFilename(tmpfilename))
- self.to_screen('\r[%s] %s bytes' % (args[0], fsize))
- self.try_rename(tmpfilename, filename)
- self._hook_progress({
- 'downloaded_bytes': fsize,
- 'total_bytes': fsize,
- 'filename': filename,
- 'status': 'finished',
- })
- return True
- else:
- self.to_stderr('\n')
- self.report_error('%s exited with code %d' % (ffpp.basename, retval))
- return False
-
-
-class NativeHlsFD(FragmentFD):
- """ A more limited implementation that does not require ffmpeg """
+class HlsFD(FragmentFD):
+ """ A limited implementation that does not require ffmpeg """
FD_NAME = 'hlsnative'
+ @staticmethod
+ def can_download(manifest):
+ UNSUPPORTED_FEATURES = (
+ r'#EXT-X-KEY:METHOD=(?!NONE|AES-128)', # encrypted streams [1]
+ r'#EXT-X-BYTERANGE', # playlists composed of byte ranges of media files [2]
+
+ # Live streams heuristic does not always work (e.g. geo restricted to Germany
+ # http://hls-geo.daserste.de/i/videoportal/Film/c_620000/622873/format,716451,716457,716450,716458,716459,.mp4.csmil/index_4_av.m3u8?null=0)
+ # r'#EXT-X-MEDIA-SEQUENCE:(?!0$)', # live streams [3]
+
+ # This heuristic also is not correct since segments may not be appended as well.
+ # Twitch vods of finished streams have EXT-X-PLAYLIST-TYPE:EVENT despite
+ # no segments will definitely be appended to the end of the playlist.
+ # r'#EXT-X-PLAYLIST-TYPE:EVENT', # media segments may be appended to the end of
+ # # event media playlists [4]
+
+ # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.2.4
+ # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.2.2
+ # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.2
+ # 4. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.5
+ )
+ check_results = [not re.search(feature, manifest) for feature in UNSUPPORTED_FEATURES]
+ check_results.append(can_decrypt_frag or '#EXT-X-KEY:METHOD=AES-128' not in manifest)
+ return all(check_results)
+
def real_download(self, filename, info_dict):
man_url = info_dict['url']
self.to_screen('[%s] Downloading m3u8 manifest' % self.FD_NAME)
manifest = self.ydl.urlopen(man_url).read()
s = manifest.decode('utf-8', 'ignore')
- fragment_urls = []
+
+ if not self.can_download(s):
+ self.report_warning(
+ 'hlsnative has detected features it does not support, '
+ 'extraction will be delegated to ffmpeg')
+ fd = FFmpegFD(self.ydl, self.params)
+ for ph in self._progress_hooks:
+ fd.add_progress_hook(ph)
+ return fd.real_download(filename, info_dict)
+
+ total_frags = 0
for line in s.splitlines():
line = line.strip()
if line and not line.startswith('#'):
- segment_url = (
- line
- if re.match(r'^https?://', line)
- else compat_urlparse.urljoin(man_url, line))
- fragment_urls.append(segment_url)
- # We only download the first fragment during the test
- if self.params.get('test', False):
- break
+ total_frags += 1
ctx = {
'filename': filename,
- 'total_frags': len(fragment_urls),
+ 'total_frags': total_frags,
}
self._prepare_and_start_frag_download(ctx)
+ i = 0
+ media_sequence = 0
+ decrypt_info = {'METHOD': 'NONE'}
frags_filenames = []
- for i, frag_url in enumerate(fragment_urls):
- frag_filename = '%s-Frag%d' % (ctx['tmpfilename'], i)
- success = ctx['dl'].download(frag_filename, {'url': frag_url})
- if not success:
- return False
- down, frag_sanitized = sanitize_open(frag_filename, 'rb')
- ctx['dest_stream'].write(down.read())
- down.close()
- frags_filenames.append(frag_sanitized)
+ for line in s.splitlines():
+ line = line.strip()
+ if line:
+ if not line.startswith('#'):
+ frag_url = (
+ line
+ if re.match(r'^https?://', line)
+ else compat_urlparse.urljoin(man_url, line))
+ frag_filename = '%s-Frag%d' % (ctx['tmpfilename'], i)
+ success = ctx['dl'].download(frag_filename, {'url': frag_url})
+ if not success:
+ return False
+ down, frag_sanitized = sanitize_open(frag_filename, 'rb')
+ frag_content = down.read()
+ down.close()
+ if decrypt_info['METHOD'] == 'AES-128':
+ iv = decrypt_info.get('IV') or compat_struct_pack('>8xq', media_sequence)
+ frag_content = AES.new(
+ decrypt_info['KEY'], AES.MODE_CBC, iv).decrypt(frag_content)
+ ctx['dest_stream'].write(frag_content)
+ frags_filenames.append(frag_sanitized)
+ # We only download the first fragment during the test
+ if self.params.get('test', False):
+ break
+ i += 1
+ media_sequence += 1
+ elif line.startswith('#EXT-X-KEY'):
+ decrypt_info = parse_m3u8_attributes(line[11:])
+ if decrypt_info['METHOD'] == 'AES-128':
+ if 'IV' in decrypt_info:
+ decrypt_info['IV'] = binascii.unhexlify(decrypt_info['IV'][2:])
+ if not re.match(r'^https?://', decrypt_info['URI']):
+ decrypt_info['URI'] = compat_urlparse.urljoin(
+ man_url, decrypt_info['URI'])
+ decrypt_info['KEY'] = self.ydl.urlopen(decrypt_info['URI']).read()
+ elif line.startswith('#EXT-X-MEDIA-SEQUENCE'):
+ media_sequence = int(line[22:])
self._finish_frag_download(ctx)
diff --git a/youtube_dl/downloader/rtsp.py b/youtube_dl/downloader/rtsp.py
index 3eb2952..939358b 100644
--- a/youtube_dl/downloader/rtsp.py
+++ b/youtube_dl/downloader/rtsp.py
@@ -27,6 +27,8 @@ class RtspFD(FileDownloader):
self.report_error('MMS or RTSP download detected but neither "mplayer" nor "mpv" could be run. Please install any.')
return False
+ self._debug_cmd(args)
+
retval = subprocess.call(args)
if retval == 0:
fsize = os.path.getsize(encodeFilename(tmpfilename))
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index 1ae606f..18d8dbc 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -1,973 +1,33 @@
from __future__ import unicode_literals
-from .abc import ABCIE
-from .abc7news import Abc7NewsIE
-from .academicearth import AcademicEarthCourseIE
-from .acast import (
- ACastIE,
- ACastChannelIE,
-)
-from .addanime import AddAnimeIE
-from .adobetv import (
- AdobeTVIE,
- AdobeTVShowIE,
- AdobeTVChannelIE,
- AdobeTVVideoIE,
-)
-from .adultswim import AdultSwimIE
-from .aenetworks import AENetworksIE
-from .aftonbladet import AftonbladetIE
-from .airmozilla import AirMozillaIE
-from .aljazeera import AlJazeeraIE
-from .alphaporno import AlphaPornoIE
-from .animeondemand import AnimeOnDemandIE
-from .anitube import AnitubeIE
-from .anysex import AnySexIE
-from .aol import AolIE
-from .allocine import AllocineIE
-from .aparat import AparatIE
-from .appleconnect import AppleConnectIE
-from .appletrailers import (
- AppleTrailersIE,
- AppleTrailersSectionIE,
-)
-from .archiveorg import ArchiveOrgIE
-from .ard import (
- ARDIE,
- ARDMediathekIE,
- SportschauIE,
-)
-from .arte import (
- ArteTvIE,
- ArteTVPlus7IE,
- ArteTVCreativeIE,
- ArteTVConcertIE,
- ArteTVFutureIE,
- ArteTVCinemaIE,
- ArteTVDDCIE,
- ArteTVMagazineIE,
- ArteTVEmbedIE,
-)
-from .atresplayer import AtresPlayerIE
-from .atttechchannel import ATTTechChannelIE
-from .audimedia import AudiMediaIE
-from .audiomack import AudiomackIE, AudiomackAlbumIE
-from .azubu import AzubuIE, AzubuLiveIE
-from .baidu import BaiduVideoIE
-from .bambuser import BambuserIE, BambuserChannelIE
-from .bandcamp import BandcampIE, BandcampAlbumIE
-from .bbc import (
- BBCCoUkIE,
- BBCCoUkArticleIE,
- BBCIE,
-)
-from .beeg import BeegIE
-from .behindkink import BehindKinkIE
-from .beatportpro import BeatportProIE
-from .bet import BetIE
-from .bigflix import BigflixIE
-from .bild import BildIE
-from .bilibili import BiliBiliIE
-from .bleacherreport import (
- BleacherReportIE,
- BleacherReportCMSIE,
-)
-from .blinkx import BlinkxIE
-from .bloomberg import BloombergIE
-from .bpb import BpbIE
-from .br import BRIE
-from .breakcom import BreakIE
-from .brightcove import (
- BrightcoveLegacyIE,
- BrightcoveNewIE,
-)
-from .buzzfeed import BuzzFeedIE
-from .byutv import BYUtvIE
-from .c56 import C56IE
-from .camdemy import (
- CamdemyIE,
- CamdemyFolderIE
-)
-from .canalplus import CanalplusIE
-from .canalc2 import Canalc2IE
-from .canvas import CanvasIE
-from .cbc import (
- CBCIE,
- CBCPlayerIE,
-)
-from .cbs import CBSIE
-from .cbsnews import (
- CBSNewsIE,
- CBSNewsLiveVideoIE,
-)
-from .cbssports import CBSSportsIE
-from .ccc import CCCIE
-from .ceskatelevize import CeskaTelevizeIE
-from .channel9 import Channel9IE
-from .chaturbate import ChaturbateIE
-from .chilloutzone import ChilloutzoneIE
-from .chirbit import (
- ChirbitIE,
- ChirbitProfileIE,
-)
-from .cinchcast import CinchcastIE
-from .cinemassacre import CinemassacreIE
-from .clipfish import ClipfishIE
-from .cliphunter import CliphunterIE
-from .clipsyndicate import ClipsyndicateIE
-from .cloudy import CloudyIE
-from .clubic import ClubicIE
-from .clyp import ClypIE
-from .cmt import CMTIE
-from .cnet import CNETIE
-from .cnn import (
- CNNIE,
- CNNBlogsIE,
- CNNArticleIE,
-)
-from .collegehumor import CollegeHumorIE
-from .collegerama import CollegeRamaIE
-from .comedycentral import ComedyCentralIE, ComedyCentralShowsIE
-from .comcarcoff import ComCarCoffIE
-from .commonmistakes import CommonMistakesIE, UnicodeBOMIE
-from .condenast import CondeNastIE
-from .cracked import CrackedIE
-from .crackle import CrackleIE
-from .criterion import CriterionIE
-from .crooksandliars import CrooksAndLiarsIE
-from .crunchyroll import (
- CrunchyrollIE,
- CrunchyrollShowPlaylistIE
-)
-from .cspan import CSpanIE
-from .ctsnews import CtsNewsIE
-from .cultureunplugged import CultureUnpluggedIE
-from .cwtv import CWTVIE
-from .dailymotion import (
- DailymotionIE,
- DailymotionPlaylistIE,
- DailymotionUserIE,
- DailymotionCloudIE,
-)
-from .daum import (
- DaumIE,
- DaumClipIE,
- DaumPlaylistIE,
- DaumUserIE,
-)
-from .dbtv import DBTVIE
-from .dcn import (
- DCNIE,
- DCNVideoIE,
- DCNLiveIE,
- DCNSeasonIE,
-)
-from .dctp import DctpTvIE
-from .deezer import DeezerPlaylistIE
-from .democracynow import DemocracynowIE
-from .dfb import DFBIE
-from .dhm import DHMIE
-from .dotsub import DotsubIE
-from .douyutv import DouyuTVIE
-from .dplay import DPlayIE
-from .dramafever import (
- DramaFeverIE,
- DramaFeverSeriesIE,
-)
-from .dreisat import DreiSatIE
-from .drbonanza import DRBonanzaIE
-from .drtuber import DrTuberIE
-from .drtv import DRTVIE
-from .dvtv import DVTVIE
-from .dump import DumpIE
-from .dumpert import DumpertIE
-from .defense import DefenseGouvFrIE
-from .discovery import DiscoveryIE
-from .dropbox import DropboxIE
-from .eagleplatform import EaglePlatformIE
-from .ebaumsworld import EbaumsWorldIE
-from .echomsk import EchoMskIE
-from .ehow import EHowIE
-from .eighttracks import EightTracksIE
-from .einthusan import EinthusanIE
-from .eitb import EitbIE
-from .ellentv import (
- EllenTVIE,
- EllenTVClipsIE,
-)
-from .elpais import ElPaisIE
-from .embedly import EmbedlyIE
-from .engadget import EngadgetIE
-from .eporner import EpornerIE
-from .eroprofile import EroProfileIE
-from .escapist import EscapistIE
-from .espn import ESPNIE
-from .esri import EsriVideoIE
-from .europa import EuropaIE
-from .everyonesmixtape import EveryonesMixtapeIE
-from .exfm import ExfmIE
-from .expotv import ExpoTVIE
-from .extremetube import ExtremeTubeIE
-from .facebook import (
- FacebookIE,
- FacebookPostIE,
-)
-from .faz import FazIE
-from .fc2 import FC2IE
-from .fczenit import FczenitIE
-from .firstpost import FirstpostIE
-from .firsttv import FirstTVIE
-from .fivemin import FiveMinIE
-from .fivetv import FiveTVIE
-from .fktv import FKTVIE
-from .flickr import FlickrIE
-from .folketinget import FolketingetIE
-from .footyroom import FootyRoomIE
-from .fourtube import FourTubeIE
-from .fox import FOXIE
-from .foxgay import FoxgayIE
-from .foxnews import FoxNewsIE
-from .foxsports import FoxSportsIE
-from .franceculture import (
- FranceCultureIE,
- FranceCultureEmissionIE,
-)
-from .franceinter import FranceInterIE
-from .francetv import (
- PluzzIE,
- FranceTvInfoIE,
- FranceTVIE,
- GenerationQuoiIE,
- CultureboxIE,
-)
-from .freesound import FreesoundIE
-from .freespeech import FreespeechIE
-from .freevideo import FreeVideoIE
-from .funimation import FunimationIE
-from .funnyordie import FunnyOrDieIE
-from .gameinformer import GameInformerIE
-from .gamekings import GamekingsIE
-from .gameone import (
- GameOneIE,
- GameOnePlaylistIE,
-)
-from .gamersyde import GamersydeIE
-from .gamespot import GameSpotIE
-from .gamestar import GameStarIE
-from .gametrailers import GametrailersIE
-from .gazeta import GazetaIE
-from .gdcvault import GDCVaultIE
-from .generic import GenericIE
-from .gfycat import GfycatIE
-from .giantbomb import GiantBombIE
-from .giga import GigaIE
-from .glide import GlideIE
-from .globo import (
- GloboIE,
- GloboArticleIE,
-)
-from .godtube import GodTubeIE
-from .goldenmoustache import GoldenMoustacheIE
-from .golem import GolemIE
-from .googledrive import GoogleDriveIE
-from .googleplus import GooglePlusIE
-from .googlesearch import GoogleSearchIE
-from .goshgay import GoshgayIE
-from .gputechconf import GPUTechConfIE
-from .groupon import GrouponIE
-from .hark import HarkIE
-from .hearthisat import HearThisAtIE
-from .heise import HeiseIE
-from .hellporno import HellPornoIE
-from .helsinki import HelsinkiIE
-from .hentaistigma import HentaiStigmaIE
-from .historicfilms import HistoricFilmsIE
-from .hitbox import HitboxIE, HitboxLiveIE
-from .hornbunny import HornBunnyIE
-from .hotnewhiphop import HotNewHipHopIE
-from .hotstar import HotStarIE
-from .howcast import HowcastIE
-from .howstuffworks import HowStuffWorksIE
-from .huffpost import HuffPostIE
-from .hypem import HypemIE
-from .iconosquare import IconosquareIE
-from .ign import (
- IGNIE,
- OneUPIE,
- PCMagIE,
-)
-from .imdb import (
- ImdbIE,
- ImdbListIE
-)
-from .imgur import (
- ImgurIE,
- ImgurAlbumIE,
-)
-from .ina import InaIE
-from .indavideo import (
- IndavideoIE,
- IndavideoEmbedIE,
-)
-from .infoq import InfoQIE
-from .instagram import InstagramIE, InstagramUserIE
-from .internetvideoarchive import InternetVideoArchiveIE
-from .iprima import IPrimaIE
-from .iqiyi import IqiyiIE
-from .ir90tv import Ir90TvIE
-from .ivi import (
- IviIE,
- IviCompilationIE
-)
-from .ivideon import IvideonIE
-from .izlesene import IzleseneIE
-from .jadorecettepub import JadoreCettePubIE
-from .jeuxvideo import JeuxVideoIE
-from .jove import JoveIE
-from .jwplatform import JWPlatformIE
-from .jpopsukitv import JpopsukiIE
-from .kaltura import KalturaIE
-from .kanalplay import KanalPlayIE
-from .kankan import KankanIE
-from .karaoketv import KaraoketvIE
-from .karrierevideos import KarriereVideosIE
-from .keezmovies import KeezMoviesIE
-from .khanacademy import KhanAcademyIE
-from .kickstarter import KickStarterIE
-from .keek import KeekIE
-from .konserthusetplay import KonserthusetPlayIE
-from .kontrtube import KontrTubeIE
-from .krasview import KrasViewIE
-from .ku6 import Ku6IE
-from .kuwo import (
- KuwoIE,
- KuwoAlbumIE,
- KuwoChartIE,
- KuwoSingerIE,
- KuwoCategoryIE,
- KuwoMvIE,
-)
-from .la7 import LA7IE
-from .laola1tv import Laola1TvIE
-from .lecture2go import Lecture2GoIE
-from .lemonde import LemondeIE
-from .letv import (
- LetvIE,
- LetvTvIE,
- LetvPlaylistIE,
- LetvCloudIE,
-)
-from .libsyn import LibsynIE
-from .lifenews import (
- LifeNewsIE,
- LifeEmbedIE,
-)
-from .limelight import (
- LimelightMediaIE,
- LimelightChannelIE,
- LimelightChannelListIE,
-)
-from .liveleak import LiveLeakIE
-from .livestream import (
- LivestreamIE,
- LivestreamOriginalIE,
- LivestreamShortenerIE,
-)
-from .lnkgo import LnkGoIE
-from .lovehomeporn import LoveHomePornIE
-from .lrt import LRTIE
-from .lynda import (
- LyndaIE,
- LyndaCourseIE
-)
-from .m6 import M6IE
-from .macgamestore import MacGameStoreIE
-from .mailru import MailRuIE
-from .makertv import MakerTVIE
-from .malemotion import MalemotionIE
-from .matchtv import MatchTVIE
-from .mdr import MDRIE
-from .metacafe import MetacafeIE
-from .metacritic import MetacriticIE
-from .mgoon import MgoonIE
-from .minhateca import MinhatecaIE
-from .ministrygrid import MinistryGridIE
-from .miomio import MioMioIE
-from .mit import TechTVMITIE, MITIE, OCWMITIE
-from .mitele import MiTeleIE
-from .mixcloud import MixcloudIE
-from .mlb import MLBIE
-from .mpora import MporaIE
-from .moevideo import MoeVideoIE
-from .mofosex import MofosexIE
-from .mojvideo import MojvideoIE
-from .moniker import MonikerIE
-from .mooshare import MooshareIE
-from .morningstar import MorningstarIE
-from .motherless import MotherlessIE
-from .motorsport import MotorsportIE
-from .movieclips import MovieClipsIE
-from .moviezine import MoviezineIE
-from .mtv import (
- MTVIE,
- MTVServicesEmbeddedIE,
- MTVIggyIE,
- MTVDEIE,
-)
-from .muenchentv import MuenchenTVIE
-from .musicplayon import MusicPlayOnIE
-from .muzu import MuzuTVIE
-from .mwave import MwaveIE
-from .myspace import MySpaceIE, MySpaceAlbumIE
-from .myspass import MySpassIE
-from .myvi import MyviIE
-from .myvideo import MyVideoIE
-from .myvidster import MyVidsterIE
-from .nationalgeographic import NationalGeographicIE
-from .naver import NaverIE
-from .nba import NBAIE
-from .nbc import (
- NBCIE,
- NBCNewsIE,
- NBCSportsIE,
- NBCSportsVPlayerIE,
- MSNBCIE,
-)
-from .ndr import (
- NDRIE,
- NJoyIE,
- NDREmbedBaseIE,
- NDREmbedIE,
- NJoyEmbedIE,
-)
-from .ndtv import NDTVIE
-from .netzkino import NetzkinoIE
-from .nerdcubed import NerdCubedFeedIE
-from .nerdist import NerdistIE
-from .neteasemusic import (
- NetEaseMusicIE,
- NetEaseMusicAlbumIE,
- NetEaseMusicSingerIE,
- NetEaseMusicListIE,
- NetEaseMusicMvIE,
- NetEaseMusicProgramIE,
- NetEaseMusicDjRadioIE,
-)
-from .newgrounds import NewgroundsIE
-from .newstube import NewstubeIE
-from .nextmedia import (
- NextMediaIE,
- NextMediaActionNewsIE,
- AppleDailyIE,
-)
-from .nextmovie import NextMovieIE
-from .nfb import NFBIE
-from .nfl import NFLIE
-from .nhl import (
- NHLIE,
- NHLNewsIE,
- NHLVideocenterIE,
-)
-from .nick import NickIE
-from .niconico import NiconicoIE, NiconicoPlaylistIE
-from .ninegag import NineGagIE
-from .noco import NocoIE
-from .normalboots import NormalbootsIE
-from .nosvideo import NosVideoIE
-from .nova import NovaIE
-from .novamov import (
- NovaMovIE,
- WholeCloudIE,
- NowVideoIE,
- VideoWeedIE,
- CloudTimeIE,
-)
-from .nowness import (
- NownessIE,
- NownessPlaylistIE,
- NownessSeriesIE,
-)
-from .nowtv import (
- NowTVIE,
- NowTVListIE,
-)
-from .noz import NozIE
-from .npo import (
- NPOIE,
- NPOLiveIE,
- NPORadioIE,
- NPORadioFragmentIE,
- SchoolTVIE,
- VPROIE,
- WNLIE
-)
-from .npr import NprIE
-from .nrk import (
- NRKIE,
- NRKPlaylistIE,
- NRKTVIE,
-)
-from .ntvde import NTVDeIE
-from .ntvru import NTVRuIE
-from .nytimes import (
- NYTimesIE,
- NYTimesArticleIE,
-)
-from .nuvid import NuvidIE
-from .odnoklassniki import OdnoklassnikiIE
-from .oktoberfesttv import OktoberfestTVIE
-from .onionstudios import OnionStudiosIE
-from .ooyala import (
- OoyalaIE,
- OoyalaExternalIE,
-)
-from .ora import OraTVIE
-from .orf import (
- ORFTVthekIE,
- ORFOE1IE,
- ORFFM4IE,
- ORFIPTVIE,
-)
-from .pandoratv import PandoraTVIE
-from .parliamentliveuk import ParliamentLiveUKIE
-from .patreon import PatreonIE
-from .pbs import PBSIE
-from .periscope import PeriscopeIE
-from .philharmoniedeparis import PhilharmonieDeParisIE
-from .phoenix import PhoenixIE
-from .photobucket import PhotobucketIE
-from .pinkbike import PinkbikeIE
-from .planetaplay import PlanetaPlayIE
-from .pladform import PladformIE
-from .played import PlayedIE
-from .playfm import PlayFMIE
-from .plays import PlaysTVIE
-from .playtvak import PlaytvakIE
-from .playvid import PlayvidIE
-from .playwire import PlaywireIE
-from .pluralsight import (
- PluralsightIE,
- PluralsightCourseIE,
-)
-from .podomatic import PodomaticIE
-from .porn91 import Porn91IE
-from .pornhd import PornHdIE
-from .pornhub import (
- PornHubIE,
- PornHubPlaylistIE,
- PornHubUserVideosIE,
-)
-from .pornotube import PornotubeIE
-from .pornovoisines import PornoVoisinesIE
-from .pornoxo import PornoXOIE
-from .primesharetv import PrimeShareTVIE
-from .promptfile import PromptFileIE
-from .prosiebensat1 import ProSiebenSat1IE
-from .puls4 import Puls4IE
-from .pyvideo import PyvideoIE
-from .qqmusic import (
- QQMusicIE,
- QQMusicSingerIE,
- QQMusicAlbumIE,
- QQMusicToplistIE,
- QQMusicPlaylistIE,
-)
-from .quickvid import QuickVidIE
-from .r7 import R7IE
-from .radiode import RadioDeIE
-from .radiojavan import RadioJavanIE
-from .radiobremen import RadioBremenIE
-from .radiofrance import RadioFranceIE
-from .rai import (
- RaiTVIE,
- RaiIE,
-)
-from .rbmaradio import RBMARadioIE
-from .rds import RDSIE
-from .redtube import RedTubeIE
-from .regiotv import RegioTVIE
-from .restudy import RestudyIE
-from .reverbnation import ReverbNationIE
-from .revision3 import Revision3IE
-from .ringtv import RingTVIE
-from .ro220 import Ro220IE
-from .rottentomatoes import RottenTomatoesIE
-from .roxwel import RoxwelIE
-from .rtbf import RTBFIE
-from .rte import RteIE, RteRadioIE
-from .rtlnl import RtlNlIE
-from .rtl2 import RTL2IE
-from .rtp import RTPIE
-from .rts import RTSIE
-from .rtve import RTVEALaCartaIE, RTVELiveIE, RTVEInfantilIE
-from .rtvnh import RTVNHIE
-from .ruhd import RUHDIE
-from .ruleporn import RulePornIE
-from .rutube import (
- RutubeIE,
- RutubeChannelIE,
- RutubeEmbedIE,
- RutubeMovieIE,
- RutubePersonIE,
-)
-from .rutv import RUTVIE
-from .ruutu import RuutuIE
-from .sandia import SandiaIE
-from .safari import (
- SafariIE,
- SafariCourseIE,
-)
-from .sapo import SapoIE
-from .savefrom import SaveFromIE
-from .sbs import SBSIE
-from .scivee import SciVeeIE
-from .screencast import ScreencastIE
-from .screencastomatic import ScreencastOMaticIE
-from .screenjunkies import ScreenJunkiesIE
-from .screenwavemedia import ScreenwaveMediaIE, TeamFourIE
-from .senateisvp import SenateISVPIE
-from .servingsys import ServingSysIE
-from .sexu import SexuIE
-from .sexykarma import SexyKarmaIE
-from .shahid import ShahidIE
-from .shared import SharedIE
-from .sharesix import ShareSixIE
-from .sina import SinaIE
-from .skynewsarabia import (
- SkyNewsArabiaIE,
- SkyNewsArabiaArticleIE,
-)
-from .slideshare import SlideshareIE
-from .slutload import SlutloadIE
-from .smotri import (
- SmotriIE,
- SmotriCommunityIE,
- SmotriUserIE,
- SmotriBroadcastIE,
-)
-from .snagfilms import (
- SnagFilmsIE,
- SnagFilmsEmbedIE,
-)
-from .snotr import SnotrIE
-from .sohu import SohuIE
-from .soundcloud import (
- SoundcloudIE,
- SoundcloudSetIE,
- SoundcloudUserIE,
- SoundcloudPlaylistIE,
- SoundcloudSearchIE
-)
-from .soundgasm import (
- SoundgasmIE,
- SoundgasmProfileIE
-)
-from .southpark import (
- SouthParkIE,
- SouthParkDeIE,
- SouthParkDkIE,
- SouthParkEsIE,
- SouthParkNlIE
-)
-from .space import SpaceIE
-from .spankbang import SpankBangIE
-from .spankwire import SpankwireIE
-from .spiegel import SpiegelIE, SpiegelArticleIE
-from .spiegeltv import SpiegeltvIE
-from .spike import SpikeIE
-from .stitcher import StitcherIE
-from .sport5 import Sport5IE
-from .sportbox import (
- SportBoxIE,
- SportBoxEmbedIE,
-)
-from .sportdeutschland import SportDeutschlandIE
-from .srgssr import (
- SRGSSRIE,
- SRGSSRPlayIE,
-)
-from .srmediathek import SRMediathekIE
-from .ssa import SSAIE
-from .stanfordoc import StanfordOpenClassroomIE
-from .steam import SteamIE
-from .streamcloud import StreamcloudIE
-from .streamcz import StreamCZIE
-from .streetvoice import StreetVoiceIE
-from .sunporno import SunPornoIE
-from .svt import (
- SVTIE,
- SVTPlayIE,
-)
-from .swrmediathek import SWRMediathekIE
-from .syfy import SyfyIE
-from .sztvhu import SztvHuIE
-from .tagesschau import TagesschauIE
-from .tapely import TapelyIE
-from .tass import TassIE
-from .teachertube import (
- TeacherTubeIE,
- TeacherTubeUserIE,
-)
-from .teachingchannel import TeachingChannelIE
-from .teamcoco import TeamcocoIE
-from .techtalks import TechTalksIE
-from .ted import TEDIE
-from .tele13 import Tele13IE
-from .telebruxelles import TeleBruxellesIE
-from .telecinco import TelecincoIE
-from .telegraaf import TelegraafIE
-from .telemb import TeleMBIE
-from .teletask import TeleTaskIE
-from .tenplay import TenPlayIE
-from .testurl import TestURLIE
-from .tf1 import TF1IE
-from .theintercept import TheInterceptIE
-from .theonion import TheOnionIE
-from .theplatform import (
- ThePlatformIE,
- ThePlatformFeedIE,
-)
-from .thesixtyone import TheSixtyOneIE
-from .thisamericanlife import ThisAmericanLifeIE
-from .thisav import ThisAVIE
-from .tinypic import TinyPicIE
-from .tlc import TlcDeIE
-from .tmz import (
- TMZIE,
- TMZArticleIE,
-)
-from .tnaflix import (
- TNAFlixIE,
- EMPFlixIE,
- MovieFapIE,
-)
-from .toggle import ToggleIE
-from .thvideo import (
- THVideoIE,
- THVideoPlaylistIE
-)
-from .toutv import TouTvIE
-from .toypics import ToypicsUserIE, ToypicsIE
-from .traileraddict import TrailerAddictIE
-from .trilulilu import TriluliluIE
-from .trollvids import TrollvidsIE
-from .trutube import TruTubeIE
-from .tube8 import Tube8IE
-from .tubitv import TubiTvIE
-from .tudou import (
- TudouIE,
- TudouPlaylistIE,
- TudouAlbumIE,
-)
-from .tumblr import TumblrIE
-from .tunein import (
- TuneInClipIE,
- TuneInStationIE,
- TuneInProgramIE,
- TuneInTopicIE,
- TuneInShortenerIE,
-)
-from .turbo import TurboIE
-from .tutv import TutvIE
-from .tv2 import (
- TV2IE,
- TV2ArticleIE,
-)
-from .tv4 import TV4IE
-from .tvc import (
- TVCIE,
- TVCArticleIE,
-)
-from .tvigle import TvigleIE
-from .tvland import TVLandIE
-from .tvp import TvpIE, TvpSeriesIE
-from .tvplay import TVPlayIE
-from .tweakers import TweakersIE
-from .twentyfourvideo import TwentyFourVideoIE
-from .twentymin import TwentyMinutenIE
-from .twentytwotracks import (
- TwentyTwoTracksIE,
- TwentyTwoTracksGenreIE
-)
-from .twitch import (
- TwitchVideoIE,
- TwitchChapterIE,
- TwitchVodIE,
- TwitchProfileIE,
- TwitchPastBroadcastsIE,
- TwitchBookmarksIE,
- TwitchStreamIE,
-)
-from .twitter import (
- TwitterCardIE,
- TwitterIE,
- TwitterAmplifyIE,
-)
-from .ubu import UbuIE
-from .udemy import (
- UdemyIE,
- UdemyCourseIE
-)
-from .udn import UDNEmbedIE
-from .digiteka import DigitekaIE
-from .unistra import UnistraIE
-from .urort import UrortIE
-from .ustream import UstreamIE, UstreamChannelIE
-from .varzesh3 import Varzesh3IE
-from .vbox7 import Vbox7IE
-from .veehd import VeeHDIE
-from .veoh import VeohIE
-from .vessel import VesselIE
-from .vesti import VestiIE
-from .vevo import VevoIE
-from .vgtv import (
- BTArticleIE,
- BTVestlendingenIE,
- VGTVIE,
-)
-from .vh1 import VH1IE
-from .vice import ViceIE
-from .viddler import ViddlerIE
-from .videodetective import VideoDetectiveIE
-from .videofyme import VideofyMeIE
-from .videomega import VideoMegaIE
-from .videomore import (
- VideomoreIE,
- VideomoreVideoIE,
- VideomoreSeasonIE,
-)
-from .videopremium import VideoPremiumIE
-from .videott import VideoTtIE
-from .vidme import (
- VidmeIE,
- VidmeUserIE,
- VidmeUserLikesIE,
-)
-from .vidzi import VidziIE
-from .vier import VierIE, VierVideosIE
-from .viewster import ViewsterIE
-from .viidea import ViideaIE
-from .vimeo import (
- VimeoIE,
- VimeoAlbumIE,
- VimeoChannelIE,
- VimeoGroupsIE,
- VimeoLikesIE,
- VimeoReviewIE,
- VimeoUserIE,
- VimeoWatchLaterIE,
-)
-from .vimple import VimpleIE
-from .vine import (
- VineIE,
- VineUserIE,
-)
-from .viki import (
- VikiIE,
- VikiChannelIE,
-)
-from .vk import (
- VKIE,
- VKUserVideosIE,
-)
-from .vlive import VLiveIE
-from .vodlocker import VodlockerIE
-from .voicerepublic import VoiceRepublicIE
-from .vporn import VpornIE
-from .vrt import VRTIE
-from .vube import VubeIE
-from .vuclip import VuClipIE
-from .vulture import VultureIE
-from .walla import WallaIE
-from .washingtonpost import WashingtonPostIE
-from .wat import WatIE
-from .wayofthemaster import WayOfTheMasterIE
-from .wdr import (
- WDRIE,
- WDRMobileIE,
- WDRMausIE,
-)
-from .webofstories import (
- WebOfStoriesIE,
- WebOfStoriesPlaylistIE,
-)
-from .weibo import WeiboIE
-from .weiqitv import WeiqiTVIE
-from .wimp import WimpIE
-from .wistia import WistiaIE
-from .worldstarhiphop import WorldStarHipHopIE
-from .wrzuta import WrzutaIE
-from .wsj import WSJIE
-from .xbef import XBefIE
-from .xboxclips import XboxClipsIE
-from .xfileshare import XFileShareIE
-from .xhamster import (
- XHamsterIE,
- XHamsterEmbedIE,
-)
-from .xminus import XMinusIE
-from .xnxx import XNXXIE
-from .xstream import XstreamIE
-from .xtube import XTubeUserIE, XTubeIE
-from .xuite import XuiteIE
-from .xvideos import XVideosIE
-from .xxxymovies import XXXYMoviesIE
-from .yahoo import (
- YahooIE,
- YahooSearchIE,
-)
-from .yam import YamIE
-from .yandexmusic import (
- YandexMusicTrackIE,
- YandexMusicAlbumIE,
- YandexMusicPlaylistIE,
-)
-from .yesjapan import YesJapanIE
-from .yinyuetai import YinYueTaiIE
-from .ynet import YnetIE
-from .youjizz import YouJizzIE
-from .youku import YoukuIE
-from .youporn import YouPornIE
-from .yourupload import YourUploadIE
-from .youtube import (
- YoutubeIE,
- YoutubeChannelIE,
- YoutubeFavouritesIE,
- YoutubeHistoryIE,
- YoutubePlaylistIE,
- YoutubeRecommendedIE,
- YoutubeSearchDateIE,
- YoutubeSearchIE,
- YoutubeSearchURLIE,
- YoutubeShowIE,
- YoutubeSubscriptionsIE,
- YoutubeTruncatedIDIE,
- YoutubeTruncatedURLIE,
- YoutubeUserIE,
- YoutubePlaylistsIE,
- YoutubeWatchLaterIE,
-)
-from .zapiks import ZapiksIE
-from .zdf import ZDFIE, ZDFChannelIE
-from .zingmp3 import (
- ZingMp3SongIE,
- ZingMp3AlbumIE,
-)
-from .zippcast import ZippCastIE
-
-_ALL_CLASSES = [
- klass
- for name, klass in globals().items()
- if name.endswith('IE') and name != 'GenericIE'
-]
-_ALL_CLASSES.append(GenericIE)
+try:
+ from .lazy_extractors import *
+ from .lazy_extractors import _ALL_CLASSES
+ _LAZY_LOADER = True
+except ImportError:
+ _LAZY_LOADER = False
+ from .extractors import *
+
+ _ALL_CLASSES = [
+ klass
+ for name, klass in globals().items()
+ if name.endswith('IE') and name != 'GenericIE'
+ ]
+ _ALL_CLASSES.append(GenericIE)
+
+
+def gen_extractor_classes():
+ """ Return a list of supported extractors.
+ The order does matter; the first extractor matched is the one handling the URL.
+ """
+ return _ALL_CLASSES
def gen_extractors():
""" Return a list of an instance of every supported extractor.
The order does matter; the first extractor matched is the one handling the URL.
"""
- return [klass() for klass in _ALL_CLASSES]
+ return [klass() for klass in gen_extractor_classes()]
def list_extractors(age_limit):
diff --git a/youtube_dl/extractor/abc.py b/youtube_dl/extractor/abc.py
index 6a29e58..b584277 100644
--- a/youtube_dl/extractor/abc.py
+++ b/youtube_dl/extractor/abc.py
@@ -12,7 +12,7 @@ from ..utils import (
class ABCIE(InfoExtractor):
IE_NAME = 'abc.net.au'
- _VALID_URL = r'http://www\.abc\.net\.au/news/(?:[^/]+/){1,2}(?P<id>\d+)'
+ _VALID_URL = r'https?://www\.abc\.net\.au/news/(?:[^/]+/){1,2}(?P<id>\d+)'
_TESTS = [{
'url': 'http://www.abc.net.au/news/2014-11-05/australia-to-staff-ebola-treatment-centre-in-sierra-leone/5868334',
diff --git a/youtube_dl/extractor/abc7news.py b/youtube_dl/extractor/abc7news.py
index 122dc90..c04949c 100644
--- a/youtube_dl/extractor/abc7news.py
+++ b/youtube_dl/extractor/abc7news.py
@@ -44,6 +44,7 @@ class Abc7NewsIE(InfoExtractor):
'contentURL', webpage, 'm3u8 url', fatal=True)
formats = self._extract_m3u8_formats(m3u8, display_id, 'mp4')
+ self._sort_formats(formats)
title = self._og_search_title(webpage).strip()
description = self._og_search_description(webpage).strip()
diff --git a/youtube_dl/extractor/abcnews.py b/youtube_dl/extractor/abcnews.py
new file mode 100644
index 0000000..b61a632
--- /dev/null
+++ b/youtube_dl/extractor/abcnews.py
@@ -0,0 +1,135 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import calendar
+import re
+import time
+
+from .amp import AMPIE
+from .common import InfoExtractor
+from ..compat import compat_urlparse
+
+
+class AbcNewsVideoIE(AMPIE):
+ IE_NAME = 'abcnews:video'
+ _VALID_URL = 'http://abcnews.go.com/[^/]+/video/(?P<display_id>[0-9a-z-]+)-(?P<id>\d+)'
+
+ _TESTS = [{
+ 'url': 'http://abcnews.go.com/ThisWeek/video/week-exclusive-irans-foreign-minister-zarif-20411932',
+ 'info_dict': {
+ 'id': '20411932',
+ 'ext': 'mp4',
+ 'display_id': 'week-exclusive-irans-foreign-minister-zarif',
+ 'title': '\'This Week\' Exclusive: Iran\'s Foreign Minister Zarif',
+ 'description': 'George Stephanopoulos goes one-on-one with Iranian Foreign Minister Dr. Javad Zarif.',
+ 'duration': 180,
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'http://abcnews.go.com/2020/video/2020-husband-stands-teacher-jail-student-affairs-26119478',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ display_id = mobj.group('display_id')
+ video_id = mobj.group('id')
+ info_dict = self._extract_feed_info(
+ 'http://abcnews.go.com/video/itemfeed?id=%s' % video_id)
+ info_dict.update({
+ 'id': video_id,
+ 'display_id': display_id,
+ })
+ return info_dict
+
+
+class AbcNewsIE(InfoExtractor):
+ IE_NAME = 'abcnews'
+ _VALID_URL = 'https?://abcnews\.go\.com/(?:[^/]+/)+(?P<display_id>[0-9a-z-]+)/story\?id=(?P<id>\d+)'
+
+ _TESTS = [{
+ 'url': 'http://abcnews.go.com/Blotter/News/dramatic-video-rare-death-job-america/story?id=10498713#.UIhwosWHLjY',
+ 'info_dict': {
+ 'id': '10498713',
+ 'ext': 'flv',
+ 'display_id': 'dramatic-video-rare-death-job-america',
+ 'title': 'Occupational Hazards',
+ 'description': 'Nightline investigates the dangers that lurk at various jobs.',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'upload_date': '20100428',
+ 'timestamp': 1272412800,
+ },
+ 'add_ie': ['AbcNewsVideo'],
+ }, {
+ 'url': 'http://abcnews.go.com/Entertainment/justin-timberlake-performs-stop-feeling-eurovision-2016/story?id=39125818',
+ 'info_dict': {
+ 'id': '39125818',
+ 'ext': 'mp4',
+ 'display_id': 'justin-timberlake-performs-stop-feeling-eurovision-2016',
+ 'title': 'Justin Timberlake Drops Hints For Secret Single',
+ 'description': 'Lara Spencer reports the buzziest stories of the day in "GMA" Pop News.',
+ 'upload_date': '20160515',
+ 'timestamp': 1463329500,
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ # The embedded YouTube video is blocked due to copyright issues
+ 'playlist_items': '1',
+ },
+ 'add_ie': ['AbcNewsVideo'],
+ }, {
+ 'url': 'http://abcnews.go.com/Technology/exclusive-apple-ceo-tim-cook-iphone-cracking-software/story?id=37173343',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ display_id = mobj.group('display_id')
+ video_id = mobj.group('id')
+
+ webpage = self._download_webpage(url, video_id)
+ video_url = self._search_regex(
+ r'window\.abcnvideo\.url\s*=\s*"([^"]+)"', webpage, 'video URL')
+ full_video_url = compat_urlparse.urljoin(url, video_url)
+
+ youtube_url = self._html_search_regex(
+ r'<iframe[^>]+src="(https://www\.youtube\.com/embed/[^"]+)"',
+ webpage, 'YouTube URL', default=None)
+
+ timestamp = None
+ date_str = self._html_search_regex(
+ r'<span[^>]+class="timestamp">([^<]+)</span>',
+ webpage, 'timestamp', fatal=False)
+ if date_str:
+ tz_offset = 0
+ if date_str.endswith(' ET'): # Eastern Time
+ tz_offset = -5
+ date_str = date_str[:-3]
+ date_formats = ['%b. %d, %Y', '%b %d, %Y, %I:%M %p']
+ for date_format in date_formats:
+ try:
+ timestamp = calendar.timegm(time.strptime(date_str.strip(), date_format))
+ except ValueError:
+ continue
+ if timestamp is not None:
+ timestamp -= tz_offset * 3600
+
+ entry = {
+ '_type': 'url_transparent',
+ 'ie_key': AbcNewsVideoIE.ie_key(),
+ 'url': full_video_url,
+ 'id': video_id,
+ 'display_id': display_id,
+ 'timestamp': timestamp,
+ }
+
+ if youtube_url:
+ entries = [entry, self.url_result(youtube_url, 'Youtube')]
+ return self.playlist_result(entries)
+
+ return entry
diff --git a/youtube_dl/extractor/acast.py b/youtube_dl/extractor/acast.py
index 92eee81..94ce88c 100644
--- a/youtube_dl/extractor/acast.py
+++ b/youtube_dl/extractor/acast.py
@@ -2,10 +2,14 @@
from __future__ import unicode_literals
import re
+import functools
from .common import InfoExtractor
from ..compat import compat_str
-from ..utils import int_or_none
+from ..utils import (
+ int_or_none,
+ OnDemandPagedList,
+)
class ACastIE(InfoExtractor):
@@ -26,13 +30,8 @@ class ACastIE(InfoExtractor):
def _real_extract(self, url):
channel, display_id = re.match(self._VALID_URL, url).groups()
-
- embed_page = self._download_webpage(
- re.sub('(?:www\.)?acast\.com', 'embedcdn.acast.com', url), display_id)
- cast_data = self._parse_json(self._search_regex(
- r'window\[\'acast/queries\'\]\s*=\s*([^;]+);', embed_page, 'acast data'),
- display_id)['GetAcast/%s/%s' % (channel, display_id)]
-
+ cast_data = self._download_json(
+ 'https://embed.acast.com/api/acasts/%s/%s' % (channel, display_id), display_id)
return {
'id': compat_str(cast_data['id']),
'display_id': display_id,
@@ -58,15 +57,26 @@ class ACastChannelIE(InfoExtractor):
'playlist_mincount': 20,
}
_API_BASE_URL = 'https://www.acast.com/api/'
+ _PAGE_SIZE = 10
@classmethod
def suitable(cls, url):
return False if ACastIE.suitable(url) else super(ACastChannelIE, cls).suitable(url)
- def _real_extract(self, url):
- display_id = self._match_id(url)
- channel_data = self._download_json(self._API_BASE_URL + 'channels/%s' % display_id, display_id)
- casts = self._download_json(self._API_BASE_URL + 'channels/%s/acasts' % display_id, display_id)
- entries = [self.url_result('https://www.acast.com/%s/%s' % (display_id, cast['url']), 'ACast') for cast in casts]
+ def _fetch_page(self, channel_slug, page):
+ casts = self._download_json(
+ self._API_BASE_URL + 'channels/%s/acasts?page=%s' % (channel_slug, page),
+ channel_slug, note='Download page %d of channel data' % page)
+ for cast in casts:
+ yield self.url_result(
+ 'https://www.acast.com/%s/%s' % (channel_slug, cast['url']),
+ 'ACast', cast['id'])
- return self.playlist_result(entries, compat_str(channel_data['id']), channel_data['name'], channel_data.get('description'))
+ def _real_extract(self, url):
+ channel_slug = self._match_id(url)
+ channel_data = self._download_json(
+ self._API_BASE_URL + 'channels/%s' % channel_slug, channel_slug)
+ entries = OnDemandPagedList(functools.partial(
+ self._fetch_page, channel_slug), self._PAGE_SIZE)
+ return self.playlist_result(entries, compat_str(
+ channel_data['id']), channel_data['name'], channel_data.get('description'))
diff --git a/youtube_dl/extractor/addanime.py b/youtube_dl/extractor/addanime.py
index e3e6d21..55a9322 100644
--- a/youtube_dl/extractor/addanime.py
+++ b/youtube_dl/extractor/addanime.py
@@ -6,7 +6,7 @@ from .common import InfoExtractor
from ..compat import (
compat_HTTPError,
compat_str,
- compat_urllib_parse,
+ compat_urllib_parse_urlencode,
compat_urllib_parse_urlparse,
)
from ..utils import (
@@ -16,7 +16,7 @@ from ..utils import (
class AddAnimeIE(InfoExtractor):
- _VALID_URL = r'http://(?:\w+\.)?add-anime\.net/(?:watch_video\.php\?(?:.*?)v=|video/)(?P<id>[\w_]+)'
+ _VALID_URL = r'https?://(?:\w+\.)?add-anime\.net/(?:watch_video\.php\?(?:.*?)v=|video/)(?P<id>[\w_]+)'
_TESTS = [{
'url': 'http://www.add-anime.net/watch_video.php?v=24MR3YO5SAS9',
'md5': '72954ea10bc979ab5e2eb288b21425a0',
@@ -60,7 +60,7 @@ class AddAnimeIE(InfoExtractor):
confirm_url = (
parsed_url.scheme + '://' + parsed_url.netloc +
action + '?' +
- compat_urllib_parse.urlencode({
+ compat_urllib_parse_urlencode({
'jschl_vc': vc, 'jschl_answer': compat_str(av_val)}))
self._download_webpage(
confirm_url, video_id,
diff --git a/youtube_dl/extractor/adobetv.py b/youtube_dl/extractor/adobetv.py
index 8753ee2..5ae16fa 100644
--- a/youtube_dl/extractor/adobetv.py
+++ b/youtube_dl/extractor/adobetv.py
@@ -156,7 +156,10 @@ class AdobeTVVideoIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
- video_data = self._download_json(url + '?format=json', video_id)
+ webpage = self._download_webpage(url, video_id)
+
+ video_data = self._parse_json(self._search_regex(
+ r'var\s+bridge\s*=\s*([^;]+);', webpage, 'bridged data'), video_id)
formats = [{
'format_id': '%s-%s' % (determine_ext(source['src']), source.get('height')),
diff --git a/youtube_dl/extractor/aenetworks.py b/youtube_dl/extractor/aenetworks.py
index 6018ae7..1bbfe26 100644
--- a/youtube_dl/extractor/aenetworks.py
+++ b/youtube_dl/extractor/aenetworks.py
@@ -1,13 +1,19 @@
from __future__ import unicode_literals
+import re
+
from .common import InfoExtractor
-from ..utils import smuggle_url
+from ..utils import (
+ smuggle_url,
+ update_url_query,
+ unescapeHTML,
+)
class AENetworksIE(InfoExtractor):
IE_NAME = 'aenetworks'
IE_DESC = 'A+E Networks: A&E, Lifetime, History.com, FYI Network'
- _VALID_URL = r'https?://(?:www\.)?(?:(?:history|aetv|mylifetime)\.com|fyi\.tv)/(?:[^/]+/)+(?P<id>[^/]+?)(?:$|[?#])'
+ _VALID_URL = r'https?://(?:www\.)?(?:(?:history|aetv|mylifetime)\.com|fyi\.tv)/(?P<type>[^/]+)/(?:[^/]+/)+(?P<id>[^/]+?)(?:$|[?#])'
_TESTS = [{
'url': 'http://www.history.com/topics/valentines-day/history-of-valentines-day/videos/bet-you-didnt-know-valentines-day?m=528e394da93ae&s=undefined&f=1&free=false',
@@ -16,6 +22,9 @@ class AENetworksIE(InfoExtractor):
'ext': 'mp4',
'title': "Bet You Didn't Know: Valentine's Day",
'description': 'md5:7b57ea4829b391995b405fa60bd7b5f7',
+ 'timestamp': 1375819729,
+ 'upload_date': '20130806',
+ 'uploader': 'AENE-NEW',
},
'params': {
# m3u8 download
@@ -25,15 +34,15 @@ class AENetworksIE(InfoExtractor):
'expected_warnings': ['JSON-LD'],
}, {
'url': 'http://www.history.com/shows/mountain-men/season-1/episode-1',
+ 'md5': '8ff93eb073449f151d6b90c0ae1ef0c7',
'info_dict': {
'id': 'eg47EERs_JsZ',
'ext': 'mp4',
'title': 'Winter Is Coming',
'description': 'md5:641f424b7a19d8e24f26dea22cf59d74',
- },
- 'params': {
- # m3u8 download
- 'skip_download': True,
+ 'timestamp': 1338306241,
+ 'upload_date': '20120529',
+ 'uploader': 'AENE-NEW',
},
'add_ie': ['ThePlatform'],
}, {
@@ -48,7 +57,7 @@ class AENetworksIE(InfoExtractor):
}]
def _real_extract(self, url):
- video_id = self._match_id(url)
+ page_type, video_id = re.match(self._VALID_URL, url).groups()
webpage = self._download_webpage(url, video_id)
@@ -56,11 +65,23 @@ class AENetworksIE(InfoExtractor):
r'data-href="[^"]*/%s"[^>]+data-release-url="([^"]+)"' % video_id,
r"media_url\s*=\s*'([^']+)'"
]
- video_url = self._search_regex(video_url_re, webpage, 'video url')
+ video_url = unescapeHTML(self._search_regex(video_url_re, webpage, 'video url'))
+ query = {'mbr': 'true'}
+ if page_type == 'shows':
+ query['assetTypes'] = 'medium_video_s3'
+ if 'switch=hds' in video_url:
+ query['switch'] = 'hls'
info = self._search_json_ld(webpage, video_id, fatal=False)
info.update({
'_type': 'url_transparent',
- 'url': smuggle_url(video_url, {'sig': {'key': 'crazyjava', 'secret': 's3cr3t'}}),
+ 'url': smuggle_url(
+ update_url_query(video_url, query),
+ {
+ 'sig': {
+ 'key': 'crazyjava',
+ 'secret': 's3cr3t'},
+ 'force_smil_url': True
+ }),
})
return info
diff --git a/youtube_dl/extractor/afreecatv.py b/youtube_dl/extractor/afreecatv.py
new file mode 100644
index 0000000..518c61f
--- /dev/null
+++ b/youtube_dl/extractor/afreecatv.py
@@ -0,0 +1,133 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+ compat_urllib_parse_urlparse,
+ compat_urlparse,
+)
+from ..utils import (
+ ExtractorError,
+ int_or_none,
+ xpath_element,
+ xpath_text,
+)
+
+
+class AfreecaTVIE(InfoExtractor):
+ IE_DESC = 'afreecatv.com'
+ _VALID_URL = r'''(?x)^
+ https?://(?:(live|afbbs|www)\.)?afreeca(?:tv)?\.com(?::\d+)?
+ (?:
+ /app/(?:index|read_ucc_bbs)\.cgi|
+ /player/[Pp]layer\.(?:swf|html))
+ \?.*?\bnTitleNo=(?P<id>\d+)'''
+ _TESTS = [{
+ 'url': 'http://live.afreecatv.com:8079/app/index.cgi?szType=read_ucc_bbs&szBjId=dailyapril&nStationNo=16711924&nBbsNo=18605867&nTitleNo=36164052&szSkin=',
+ 'md5': 'f72c89fe7ecc14c1b5ce506c4996046e',
+ 'info_dict': {
+ 'id': '36164052',
+ 'ext': 'mp4',
+ 'title': '데일리 에이프릴 요정들의 시상식!',
+ 'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$',
+ 'uploader': 'dailyapril',
+ 'uploader_id': 'dailyapril',
+ 'upload_date': '20160503',
+ }
+ }, {
+ 'url': 'http://afbbs.afreecatv.com:8080/app/read_ucc_bbs.cgi?nStationNo=16711924&nTitleNo=36153164&szBjId=dailyapril&nBbsNo=18605867',
+ 'info_dict': {
+ 'id': '36153164',
+ 'title': "BJ유트루와 함께하는 '팅커벨 메이크업!'",
+ 'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$',
+ 'uploader': 'dailyapril',
+ 'uploader_id': 'dailyapril',
+ },
+ 'playlist_count': 2,
+ 'playlist': [{
+ 'md5': 'd8b7c174568da61d774ef0203159bf97',
+ 'info_dict': {
+ 'id': '36153164_1',
+ 'ext': 'mp4',
+ 'title': "BJ유트루와 함께하는 '팅커벨 메이크업!'",
+ 'upload_date': '20160502',
+ },
+ }, {
+ 'md5': '58f2ce7f6044e34439ab2d50612ab02b',
+ 'info_dict': {
+ 'id': '36153164_2',
+ 'ext': 'mp4',
+ 'title': "BJ유트루와 함께하는 '팅커벨 메이크업!'",
+ 'upload_date': '20160502',
+ },
+ }],
+ }, {
+ 'url': 'http://www.afreecatv.com/player/Player.swf?szType=szBjId=djleegoon&nStationNo=11273158&nBbsNo=13161095&nTitleNo=36327652',
+ 'only_matching': True,
+ }]
+
+ @staticmethod
+ def parse_video_key(key):
+ video_key = {}
+ m = re.match(r'^(?P<upload_date>\d{8})_\w+_(?P<part>\d+)$', key)
+ if m:
+ video_key['upload_date'] = m.group('upload_date')
+ video_key['part'] = m.group('part')
+ return video_key
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ parsed_url = compat_urllib_parse_urlparse(url)
+ info_url = compat_urlparse.urlunparse(parsed_url._replace(
+ netloc='afbbs.afreecatv.com:8080',
+ path='/api/video/get_video_info.php'))
+ video_xml = self._download_xml(info_url, video_id)
+
+ if xpath_element(video_xml, './track/video/file') is None:
+ raise ExtractorError('Specified AfreecaTV video does not exist',
+ expected=True)
+
+ title = xpath_text(video_xml, './track/title', 'title')
+ uploader = xpath_text(video_xml, './track/nickname', 'uploader')
+ uploader_id = xpath_text(video_xml, './track/bj_id', 'uploader id')
+ duration = int_or_none(xpath_text(video_xml, './track/duration',
+ 'duration'))
+ thumbnail = xpath_text(video_xml, './track/titleImage', 'thumbnail')
+
+ entries = []
+ for i, video_file in enumerate(video_xml.findall('./track/video/file')):
+ video_key = self.parse_video_key(video_file.get('key', ''))
+ if not video_key:
+ continue
+ entries.append({
+ 'id': '%s_%s' % (video_id, video_key.get('part', i + 1)),
+ 'title': title,
+ 'upload_date': video_key.get('upload_date'),
+ 'duration': int_or_none(video_file.get('duration')),
+ 'url': video_file.text,
+ })
+
+ info = {
+ 'id': video_id,
+ 'title': title,
+ 'uploader': uploader,
+ 'uploader_id': uploader_id,
+ 'duration': duration,
+ 'thumbnail': thumbnail,
+ }
+
+ if len(entries) > 1:
+ info['_type'] = 'multi_video'
+ info['entries'] = entries
+ elif len(entries) == 1:
+ info['url'] = entries[0]['url']
+ info['upload_date'] = entries[0].get('upload_date')
+ else:
+ raise ExtractorError(
+ 'No files found for the specified AfreecaTV video, either'
+ ' the URL is incorrect or the video has been made private.',
+ expected=True)
+
+ return info
diff --git a/youtube_dl/extractor/aftonbladet.py b/youtube_dl/extractor/aftonbladet.py
index e0518cf..5766b4f 100644
--- a/youtube_dl/extractor/aftonbladet.py
+++ b/youtube_dl/extractor/aftonbladet.py
@@ -6,7 +6,7 @@ from ..utils import int_or_none
class AftonbladetIE(InfoExtractor):
- _VALID_URL = r'http://tv\.aftonbladet\.se/abtv/articles/(?P<id>[0-9]+)'
+ _VALID_URL = r'https?://tv\.aftonbladet\.se/abtv/articles/(?P<id>[0-9]+)'
_TEST = {
'url': 'http://tv.aftonbladet.se/abtv/articles/36015',
'info_dict': {
@@ -24,10 +24,10 @@ class AftonbladetIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
# find internal video meta data
- meta_url = 'http://aftonbladet-play.drlib.aptoma.no/video/%s.json'
+ meta_url = 'http://aftonbladet-play-metadata.cdn.drvideo.aptoma.no/video/%s.json'
player_config = self._parse_json(self._html_search_regex(
r'data-player-config="([^"]+)"', webpage, 'player config'), video_id)
- internal_meta_id = player_config['videoId']
+ internal_meta_id = player_config['aptomaVideoId']
internal_meta_url = meta_url % internal_meta_id
internal_meta_json = self._download_json(
internal_meta_url, video_id, 'Downloading video meta data')
diff --git a/youtube_dl/extractor/aljazeera.py b/youtube_dl/extractor/aljazeera.py
index 5b2c0dc..b081695 100644
--- a/youtube_dl/extractor/aljazeera.py
+++ b/youtube_dl/extractor/aljazeera.py
@@ -4,7 +4,7 @@ from .common import InfoExtractor
class AlJazeeraIE(InfoExtractor):
- _VALID_URL = r'http://www\.aljazeera\.com/programmes/.*?/(?P<id>[^/]+)\.html'
+ _VALID_URL = r'https?://www\.aljazeera\.com/programmes/.*?/(?P<id>[^/]+)\.html'
_TEST = {
'url': 'http://www.aljazeera.com/programmes/the-slum/2014/08/deliverance-201482883754237240.html',
@@ -13,24 +13,18 @@ class AlJazeeraIE(InfoExtractor):
'ext': 'mp4',
'title': 'The Slum - Episode 1: Deliverance',
'description': 'As a birth attendant advocating for family planning, Remy is on the frontline of Tondo\'s battle with overcrowding.',
- 'uploader': 'Al Jazeera English',
+ 'uploader_id': '665003303001',
+ 'timestamp': 1411116829,
+ 'upload_date': '20140919',
},
- 'add_ie': ['BrightcoveLegacy'],
+ 'add_ie': ['BrightcoveNew'],
'skip': 'Not accessible from Travis CI server',
}
+ BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/665003303001/default_default/index.html?videoId=%s'
def _real_extract(self, url):
program_name = self._match_id(url)
webpage = self._download_webpage(url, program_name)
brightcove_id = self._search_regex(
r'RenderPagesVideo\(\'(.+?)\'', webpage, 'brightcove id')
-
- return {
- '_type': 'url',
- 'url': (
- 'brightcove:'
- 'playerKey=AQ~~%2CAAAAmtVJIFk~%2CTVGOQ5ZTwJbeMWnq5d_H4MOM57xfzApc'
- '&%40videoPlayer={0}'.format(brightcove_id)
- ),
- 'ie_key': 'BrightcoveLegacy',
- }
+ return self.url_result(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', brightcove_id)
diff --git a/youtube_dl/extractor/amp.py b/youtube_dl/extractor/amp.py
index 69e6baf..8545681 100644
--- a/youtube_dl/extractor/amp.py
+++ b/youtube_dl/extractor/amp.py
@@ -52,7 +52,7 @@ class AMPIE(InfoExtractor):
for media_data in media_content:
media = media_data['@attributes']
media_type = media['type']
- if media_type == 'video/f4m':
+ if media_type in ('video/f4m', 'application/f4m+xml'):
formats.extend(self._extract_f4m_formats(
media['url'] + '?hdcore=3.4.0&plugin=aasp-3.4.0.132.124',
video_id, f4m_id='hds', fatal=False))
@@ -61,7 +61,7 @@ class AMPIE(InfoExtractor):
media['url'], video_id, 'mp4', m3u8_id='hls', fatal=False))
else:
formats.append({
- 'format_id': media_data['media-category']['@attributes']['label'],
+ 'format_id': media_data.get('media-category', {}).get('@attributes', {}).get('label'),
'url': media['url'],
'tbr': int_or_none(media.get('bitrate')),
'filesize': int_or_none(media.get('fileSize')),
@@ -69,12 +69,14 @@ class AMPIE(InfoExtractor):
self._sort_formats(formats)
+ timestamp = parse_iso8601(item.get('pubDate'), ' ') or parse_iso8601(item.get('dc-date'))
+
return {
'id': video_id,
'title': get_media_node('title'),
'description': get_media_node('description'),
'thumbnails': thumbnails,
- 'timestamp': parse_iso8601(item.get('pubDate'), ' '),
+ 'timestamp': timestamp,
'duration': int_or_none(media_content[0].get('@attributes', {}).get('duration')),
'subtitles': subtitles,
'formats': formats,
diff --git a/youtube_dl/extractor/animeondemand.py b/youtube_dl/extractor/animeondemand.py
index a7d8daf..9b01e38 100644
--- a/youtube_dl/extractor/animeondemand.py
+++ b/youtube_dl/extractor/animeondemand.py
@@ -3,10 +3,13 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..compat import compat_urlparse
+from ..compat import (
+ compat_urlparse,
+ compat_str,
+)
from ..utils import (
determine_ext,
- encode_dict,
+ extract_attributes,
ExtractorError,
sanitized_Request,
urlencode_postdata,
@@ -18,7 +21,7 @@ class AnimeOnDemandIE(InfoExtractor):
_LOGIN_URL = 'https://www.anime-on-demand.de/users/sign_in'
_APPLY_HTML5_URL = 'https://www.anime-on-demand.de/html5apply'
_NETRC_MACHINE = 'animeondemand'
- _TEST = {
+ _TESTS = [{
'url': 'https://www.anime-on-demand.de/anime/161',
'info_dict': {
'id': '161',
@@ -26,7 +29,19 @@ class AnimeOnDemandIE(InfoExtractor):
'description': 'md5:6681ce3c07c7189d255ac6ab23812d31',
},
'playlist_mincount': 4,
- }
+ }, {
+ # Film wording is used instead of Episode
+ 'url': 'https://www.anime-on-demand.de/anime/39',
+ 'only_matching': True,
+ }, {
+ # Episodes without titles
+ 'url': 'https://www.anime-on-demand.de/anime/162',
+ 'only_matching': True,
+ }, {
+ # ger/jap, Dub/OmU, account required
+ 'url': 'https://www.anime-on-demand.de/anime/169',
+ 'only_matching': True,
+ }]
def _login(self):
(username, password) = self._get_login_info()
@@ -36,6 +51,10 @@ class AnimeOnDemandIE(InfoExtractor):
login_page = self._download_webpage(
self._LOGIN_URL, None, 'Downloading login page')
+ if '>Our licensing terms allow the distribution of animes only to German-speaking countries of Europe' in login_page:
+ self.raise_geo_restricted(
+ '%s is only available in German-speaking countries of Europe' % self.IE_NAME)
+
login_form = self._form_hidden_inputs('new_user', login_page)
login_form.update({
@@ -51,7 +70,7 @@ class AnimeOnDemandIE(InfoExtractor):
post_url = compat_urlparse.urljoin(self._LOGIN_URL, post_url)
request = sanitized_Request(
- post_url, urlencode_postdata(encode_dict(login_form)))
+ post_url, urlencode_postdata(login_form))
request.add_header('Referer', self._LOGIN_URL)
response = self._download_webpage(
@@ -91,14 +110,22 @@ class AnimeOnDemandIE(InfoExtractor):
entries = []
- for episode_html in re.findall(r'(?s)<h3[^>]+class="episodebox-title".+?>Episodeninhalt<', webpage):
- m = re.search(
- r'class="episodebox-title"[^>]+title="Episode (?P<number>\d+) - (?P<title>.+?)"', episode_html)
- if not m:
+ for num, episode_html in enumerate(re.findall(
+ r'(?s)<h3[^>]+class="episodebox-title".+?>Episodeninhalt<', webpage), 1):
+ episodebox_title = self._search_regex(
+ (r'class="episodebox-title"[^>]+title=(["\'])(?P<title>.+?)\1',
+ r'class="episodebox-title"[^>]+>(?P<title>.+?)<'),
+ episode_html, 'episodebox title', default=None, group='title')
+ if not episodebox_title:
continue
- episode_number = int(m.group('number'))
- episode_title = m.group('title')
+ episode_number = int(self._search_regex(
+ r'(?:Episode|Film)\s*(\d+)',
+ episodebox_title, 'episode number', default=num))
+ episode_title = self._search_regex(
+ r'(?:Episode|Film)\s*\d+\s*-\s*(.+)',
+ episodebox_title, 'episode title', default=None)
+
video_id = 'episode-%d' % episode_number
common_info = {
@@ -110,33 +137,86 @@ class AnimeOnDemandIE(InfoExtractor):
formats = []
- playlist_url = self._search_regex(
- r'data-playlist=(["\'])(?P<url>.+?)\1',
- episode_html, 'data playlist', default=None, group='url')
- if playlist_url:
- request = sanitized_Request(
- compat_urlparse.urljoin(url, playlist_url),
- headers={
- 'X-Requested-With': 'XMLHttpRequest',
- 'X-CSRF-Token': csrf_token,
- 'Referer': url,
- 'Accept': 'application/json, text/javascript, */*; q=0.01',
- })
-
- playlist = self._download_json(
- request, video_id, 'Downloading playlist JSON', fatal=False)
- if playlist:
- playlist = playlist['playlist'][0]
- title = playlist['title']
+ for input_ in re.findall(
+ r'<input[^>]+class=["\'].*?streamstarter_html5[^>]+>', episode_html):
+ attributes = extract_attributes(input_)
+ playlist_urls = []
+ for playlist_key in ('data-playlist', 'data-otherplaylist'):
+ playlist_url = attributes.get(playlist_key)
+ if isinstance(playlist_url, compat_str) and re.match(
+ r'/?[\da-zA-Z]+', playlist_url):
+ playlist_urls.append(attributes[playlist_key])
+ if not playlist_urls:
+ continue
+
+ lang = attributes.get('data-lang')
+ lang_note = attributes.get('value')
+
+ for playlist_url in playlist_urls:
+ kind = self._search_regex(
+ r'videomaterialurl/\d+/([^/]+)/',
+ playlist_url, 'media kind', default=None)
+ format_id_list = []
+ if lang:
+ format_id_list.append(lang)
+ if kind:
+ format_id_list.append(kind)
+ if not format_id_list:
+ format_id_list.append(compat_str(num))
+ format_id = '-'.join(format_id_list)
+ format_note = ', '.join(filter(None, (kind, lang_note)))
+ request = sanitized_Request(
+ compat_urlparse.urljoin(url, playlist_url),
+ headers={
+ 'X-Requested-With': 'XMLHttpRequest',
+ 'X-CSRF-Token': csrf_token,
+ 'Referer': url,
+ 'Accept': 'application/json, text/javascript, */*; q=0.01',
+ })
+ playlist = self._download_json(
+ request, video_id, 'Downloading %s playlist JSON' % format_id,
+ fatal=False)
+ if not playlist:
+ continue
+ start_video = playlist.get('startvideo', 0)
+ playlist = playlist.get('playlist')
+ if not playlist or not isinstance(playlist, list):
+ continue
+ playlist = playlist[start_video]
+ title = playlist.get('title')
+ if not title:
+ continue
description = playlist.get('description')
for source in playlist.get('sources', []):
file_ = source.get('file')
- if file_ and determine_ext(file_) == 'm3u8':
- formats = self._extract_m3u8_formats(
+ if not file_:
+ continue
+ ext = determine_ext(file_)
+ format_id_list = [lang, kind]
+ if ext == 'm3u8':
+ format_id_list.append('hls')
+ elif source.get('type') == 'video/dash' or ext == 'mpd':
+ format_id_list.append('dash')
+ format_id = '-'.join(filter(None, format_id_list))
+ if ext == 'm3u8':
+ file_formats = self._extract_m3u8_formats(
file_, video_id, 'mp4',
- entry_protocol='m3u8_native', m3u8_id='hls')
+ entry_protocol='m3u8_native', m3u8_id=format_id, fatal=False)
+ elif source.get('type') == 'video/dash' or ext == 'mpd':
+ continue
+ file_formats = self._extract_mpd_formats(
+ file_, video_id, mpd_id=format_id, fatal=False)
+ else:
+ continue
+ for f in file_formats:
+ f.update({
+ 'language': lang,
+ 'format_note': format_note,
+ })
+ formats.extend(file_formats)
if formats:
+ self._sort_formats(formats)
f = common_info.copy()
f.update({
'title': title,
@@ -145,16 +225,18 @@ class AnimeOnDemandIE(InfoExtractor):
})
entries.append(f)
- m = re.search(
- r'data-dialog-header=(["\'])(?P<title>.+?)\1[^>]+href=(["\'])(?P<href>.+?)\3[^>]*>Teaser<',
- episode_html)
- if m:
- f = common_info.copy()
- f.update({
- 'id': '%s-teaser' % f['id'],
- 'title': m.group('title'),
- 'url': compat_urlparse.urljoin(url, m.group('href')),
- })
- entries.append(f)
+ # Extract teaser only when full episode is not available
+ if not formats:
+ m = re.search(
+ r'data-dialog-header=(["\'])(?P<title>.+?)\1[^>]+href=(["\'])(?P<href>.+?)\3[^>]*>Teaser<',
+ episode_html)
+ if m:
+ f = common_info.copy()
+ f.update({
+ 'id': '%s-teaser' % f['id'],
+ 'title': m.group('title'),
+ 'url': compat_urlparse.urljoin(url, m.group('href')),
+ })
+ entries.append(f)
return self.playlist_result(entries, anime_id, anime_title, anime_description)
diff --git a/youtube_dl/extractor/anvato.py b/youtube_dl/extractor/anvato.py
new file mode 100644
index 0000000..cb29cf1
--- /dev/null
+++ b/youtube_dl/extractor/anvato.py
@@ -0,0 +1,224 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import base64
+import hashlib
+import json
+import random
+import time
+
+from .common import InfoExtractor
+from ..aes import aes_encrypt
+from ..compat import compat_str
+from ..utils import (
+ bytes_to_intlist,
+ determine_ext,
+ intlist_to_bytes,
+ int_or_none,
+ strip_jsonp,
+)
+
+
+def md5_text(s):
+ if not isinstance(s, compat_str):
+ s = compat_str(s)
+ return hashlib.md5(s.encode('utf-8')).hexdigest()
+
+
+class AnvatoIE(InfoExtractor):
+ # Copied from anvplayer.min.js
+ _ANVACK_TABLE = {
+ 'nbcu_nbcd_desktop_web_prod_93d8ead38ce2024f8f544b78306fbd15895ae5e6': 'NNemUkySjxLyPTKvZRiGntBIjEyK8uqicjMakIaQ',
+ 'nbcu_nbcd_desktop_web_qa_1a6f01bdd0dc45a439043b694c8a031d': 'eSxJUbA2UUKBTXryyQ2d6NuM8oEqaPySvaPzfKNA',
+ 'nbcu_nbcd_desktop_web_acc_eb2ff240a5d4ae9a63d4c297c32716b6c523a129': '89JR3RtUGbvKuuJIiKOMK0SoarLb5MUx8v89RcbP',
+ 'nbcu_nbcd_watchvod_web_prod_e61107507180976724ec8e8319fe24ba5b4b60e1': 'Uc7dFt7MJ9GsBWB5T7iPvLaMSOt8BBxv4hAXk5vv',
+ 'nbcu_nbcd_watchvod_web_qa_42afedba88a36203db5a4c09a5ba29d045302232': 'T12oDYVFP2IaFvxkmYMy5dKxswpLHtGZa4ZAXEi7',
+ 'nbcu_nbcd_watchvod_web_acc_9193214448e2e636b0ffb78abacfd9c4f937c6ca': 'MmobcxUxMedUpohNWwXaOnMjlbiyTOBLL6d46ZpR',
+ 'nbcu_local_monitor_web_acc_f998ad54eaf26acd8ee033eb36f39a7b791c6335': 'QvfIoPYrwsjUCcASiw3AIkVtQob2LtJHfidp9iWg',
+ 'nbcu_cable_monitor_web_acc_a413759603e8bedfcd3c61b14767796e17834077': 'uwVPJLShvJWSs6sWEIuVem7MTF8A4IknMMzIlFto',
+ 'nbcu_nbcd_mcpstage_web_qa_4c43a8f6e95a88dbb40276c0630ba9f693a63a4e': 'PxVYZVwjhgd5TeoPRxL3whssb5OUPnM3zyAzq8GY',
+ 'nbcu_comcast_comcast_web_prod_074080762ad4ce956b26b43fb22abf153443a8c4': 'afnaRZfDyg1Z3WZHdupKfy6xrbAG2MHqe3VfuSwh',
+ 'nbcu_comcast_comcast_web_qa_706103bb93ead3ef70b1de12a0e95e3c4481ade0': 'DcjsVbX9b3uoPlhdriIiovgFQZVxpISZwz0cx1ZK',
+ 'nbcu_comcast_comcastcable_web_prod_669f04817536743563d7331c9293e59fbdbe3d07': '0RwMN2cWy10qhAhOscq3eK7aEe0wqnKt3vJ0WS4D',
+ 'nbcu_comcast_comcastcable_web_qa_3d9d2d66219094127f0f6b09cc3c7bb076e3e1ca': '2r8G9DEya7PCqBceKZgrn2XkXgASjwLMuaFE1Aad',
+ 'hearst_hearst_demo_web_stage_960726dfef3337059a01a78816e43b29ec04dfc7': 'cuZBPXTR6kSdoTCVXwk5KGA8rk3NrgGn4H6e9Dsp',
+ 'anvato_mcpqa_demo_web_stage_18b55e00db5a13faa8d03ae6e41f6f5bcb15b922': 'IOaaLQ8ymqVyem14QuAvE5SndQynTcH5CrLkU2Ih',
+ 'anvato_nextmedia_demo_web_stage_9787d56a02ff6b9f43e9a2b0920d8ca88beb5818': 'Pqu9zVzI1ApiIzbVA3VkGBEQHvdKSUuKpD6s2uaR',
+ 'anvato_scripps_app_web_prod_0837996dbe373629133857ae9eb72e740424d80a': 'du1ccmn7RxzgizwbWU7hyUaGodNlJn7HtXI0WgXW',
+ 'anvato_scripps_app_web_stage_360797e00fe2826be142155c4618cc52fce6c26c': '2PMrQ0BRoqCWl7nzphj0GouIMEh2mZYivAT0S1Su',
+ 'fs2go_fs2go_go_all_prod_21934911ccfafc03a075894ead2260d11e2ddd24': 'RcuHlKikW2IJw6HvVoEkqq2UsuEJlbEl11pWXs4Q',
+ 'fs2go_fs2go_go_web_prod_ead4b0eec7460c1a07783808db21b49cf1f2f9a7': '4K0HTT2u1zkQA2MaGaZmkLa1BthGSBdr7jllrhk5',
+ 'fs2go_fs2go_go_web_stage_407585454a4400355d4391691c67f361': 'ftnc37VKRJBmHfoGGi3kT05bHyeJzilEzhKJCyl3',
+ 'fs2go_fs2go_go_android_stage_44b714db6f8477f29afcba15a41e1d30': 'CtxpPvVpo6AbZGomYUhkKs7juHZwNml9b9J0J2gI',
+ 'anvato_cbslocal_app_web_prod_547f3e49241ef0e5d30c79b2efbca5d92c698f67': 'Pw0XX5KBDsyRnPS0R2JrSrXftsy8Jnz5pAjaYC8s',
+ 'anvato_cbslocal_app_web_stage_547a5f096594cd3e00620c6f825cad1096d28c80': '37OBUhX2uwNyKhhrNzSSNHSRPZpApC3trdqDBpuz',
+ 'fs2go_att_att_web_prod_1042dddd089a05438b6a08f972941176f699ffd8': 'JLcF20JwYvpv6uAGcLWIaV12jKwaL1R8us4b6Zkg',
+ 'fs2go_att_att_web_stage_807c5001955fc114a3331fe027ddc76e': 'gbu1oO1y0JiOFh4SUipt86P288JHpyjSqolrrT1x',
+ 'fs2go_fs2go_tudor_web_prod_a7dd8e5a7cdc830cae55eae6f3e9fee5ee49eb9b': 'ipcp87VCEZXPPe868j3orLqzc03oTy7DXsGkAXXH',
+ 'anvato_mhz_app_web_prod_b808218b30de7fdf60340cbd9831512bc1bf6d37': 'Stlm5Gs6BEhJLRTZHcNquyzxGqr23EuFmE5DCgjX',
+ 'fs2go_charter_charter_web_stage_c2c6e5a68375a1bf00fff213d3ff8f61a835a54c': 'Lz4hbJp1fwL6jlcz4M2PMzghM4jp4aAmybtT5dPc',
+ 'fs2go_charter_charter_web_prod_ebfe3b10f1af215a7321cd3d629e0b81dfa6fa8c': 'vUJsK345A1bVmyYDRhZX0lqFIgVXuqhmuyp1EtPK',
+ 'anvato_epfox_app_web_prod_b3373168e12f423f41504f207000188daf88251b': 'GDKq1ixvX3MoBNdU5IOYmYa2DTUXYOozPjrCJnW7',
+ 'anvato_epfox_app_web_stage_a3c2ce60f8f83ef374a88b68ee73a950f8ab87ce': '2jz2NH4BsXMaDsoJ5qkHMbcczAfIReo2eFYuVC1C',
+ 'fs2go_verizon_verizon_web_stage_08e6df0354a4803f1b1f2428b5a9a382e8dbcd62': 'rKTVapNaAcmnUbGL4ZcuOoY4SE7VmZSQsblPFr7e',
+ 'fs2go_verizon_verizon_web_prod_f909564cb606eff1f731b5e22e0928676732c445': 'qLSUuHerM3u9eNPzaHyUK52obai5MvE4XDJfqYe1',
+ 'fs2go_foxcom_synd_web_stage_f7b9091f00ea25a4fdaaae77fca5b54cdc7e7043': '96VKF2vLd24fFiDfwPFpzM5llFN4TiIGAlodE0Re',
+ 'fs2go_foxcom_synd_web_prod_0f2cdd64d87e4ab6a1d54aada0ff7a7c8387a064': 'agiPjbXEyEZUkbuhcnmVPhe9NNVbDjCFq2xkcx51',
+ 'anvato_own_app_web_stage_1214ade5d28422c4dae9d03c1243aba0563c4dba': 'mzhamNac3swG4WsJAiUTacnGIODi6SWeVWk5D7ho',
+ 'anvato_own_app_web_prod_944e162ed927ec3e9ed13eb68ed2f1008ee7565e': '9TSxh6G2TXOLBoYm9ro3LdNjjvnXpKb8UR8KoIP9',
+ 'anvato_scripps_app_ftv_prod_a10a10468edd5afb16fb48171c03b956176afad1': 'COJ2i2UIPK7xZqIWswxe7FaVBOVgRkP1F6O6qGoH',
+ 'anvato_scripps_app_ftv_stage_77d3ad2bdb021ec37ca2e35eb09acd396a974c9a': 'Q7nnopNLe2PPfGLOTYBqxSaRpl209IhqaEuDZi1F',
+ 'anvato_univision_app_web_stage_551236ef07a0e17718c3995c35586b5ed8cb5031': 'D92PoLS6UitwxDRA191HUGT9OYcOjV6mPMa5wNyo',
+ 'anvato_univision_app_web_prod_039a5c0a6009e637ae8ac906718a79911e0e65e1': '5mVS5u4SQjtw6NGw2uhMbKEIONIiLqRKck5RwQLR',
+ 'nbcu_cnbc_springfield_ios_prod_670207fae43d6e9a94c351688851a2ce': 'M7fqCCIP9lW53oJbHs19OlJlpDrVyc2OL8gNeuTa',
+ 'nbcu_cnbc_springfieldvod_ios_prod_7a5f04b1ceceb0e9c9e2264a44aa236e08e034c2': 'Yia6QbJahW0S7K1I0drksimhZb4UFq92xLBmmMvk',
+ 'anvato_cox_app_web_prod_ce45cda237969f93e7130f50ee8bb6280c1484ab': 'cc0miZexpFtdoqZGvdhfXsLy7FXjRAOgb9V0f5fZ',
+ 'anvato_cox_app_web_stage_c23dbe016a8e9d8c7101d10172b92434f6088bf9': 'yivU3MYHd2eDZcOfmLbINVtqxyecKTOp8OjOuoGJ',
+ 'anvato_chnzero_app_web_stage_b1164d1352b579e792e542fddf13ee34c0eeb46b': 'A76QkXMmVH8lTCfU15xva1mZnSVcqeY4Xb22Kp7m',
+ 'anvato_chnzero_app_web_prod_253d358928dc08ec161eda2389d53707288a730c': 'OA5QI3ZWZZkdtUEDqh28AH8GedsF6FqzJI32596b',
+ 'anvato_discovery_vodpoc_web_stage_9fa7077b5e8af1f8355f65d4fb8d2e0e9d54e2b7': 'q3oT191tTQ5g3JCP67PkjLASI9s16DuWZ6fYmry3',
+ 'anvato_discovery_vodpoc_web_prod_688614983167a1af6cdf6d76343fda10a65223c1': 'qRvRQCTVHd0VVOHsMvvfidyWmlYVrTbjby7WqIuK',
+ 'nbcu_cnbc_springfieldvod_ftv_stage_826040aad1925a46ac5dfb4b3c5143e648c6a30d': 'JQaSb5a8Tz0PT4ti329DNmzDO30TnngTHmvX8Vua',
+ 'nbcu_cnbc_springfield_ftv_stage_826040aad1925a46ac5dfb4b3c5143e648c6a30d': 'JQaSb5a8Tz0PT4ti329DNmzDO30TnngTHmvX8Vua',
+ 'nbcu_nbcd_capture_web_stage_4dd9d585bfb984ebf856dee35db027b2465cc4ae': '0j1Ov4Vopyi2HpBZJYdL2m8ERJVGYh3nNpzPiO8F',
+ 'nbcu_nbcd_watch3_android_prod_7712ca5fcf1c22f19ec1870a9650f9c37db22dcf': '3LN2UB3rPUAMu7ZriWkHky9vpLMXYha8JbSnxBlx',
+ 'nbcu_nbcd_watchvod3_android_prod_0910a3a4692d57c0b5ff4316075bc5d096be45b9': 'mJagcQ2II30vUOAauOXne7ERwbf5S9nlB3IP17lQ',
+ 'anvato_scripps_app_atv_prod_790deda22e16e71e83df58f880cd389908a45d52': 'CB6trI1mpoDIM5o54DNTsji90NDBQPZ4z4RqBNSH',
+ 'nbcu_nbcd_watchv4_android_prod_ff67cef9cb409158c6f8c3533edddadd0b750507': 'j8CHQCUWjlYERj4NFRmUYOND85QNbHViH09UwuKm',
+ 'nbcu_nbcd_watchvodv4_android_prod_a814d781609989dea6a629d50ae4c7ad8cc8e907': 'rkVnUXxdA9rawVLUlDQtMue9Y4Q7lFEaIotcUhjt',
+ 'rvVKpA50qlOPLFxMjrCGf5pdkdQDm7qn': '1J7ZkY5Qz5lMLi93QOH9IveE7EYB3rLl',
+ 'nbcu_dtv_local_web_prod_b266cf49defe255fd4426a97e27c09e513e9f82f': 'HuLnJDqzLa4saCzYMJ79zDRSQpEduw1TzjMNQu2b',
+ 'nbcu_att_local_web_prod_4cef038b2d969a6b7d700a56a599040b6a619f67': 'Q0Em5VDc2KpydUrVwzWRXAwoNBulWUxCq2faK0AV',
+ 'nbcu_dish_local_web_prod_c56dcaf2da2e9157a4266c82a78195f1dd570f6b': 'bC1LWmRz9ayj2AlzizeJ1HuhTfIaJGsDBnZNgoRg',
+ 'nbcu_verizon_local_web_prod_88bebd2ce006d4ed980de8133496f9a74cb9b3e1': 'wzhDKJZpgvUSS1EQvpCQP8Q59qVzcPixqDGJefSk',
+ 'nbcu_charter_local_web_prod_9ad90f7fc4023643bb718f0fe0fd5beea2382a50': 'PyNbxNhEWLzy1ZvWEQelRuIQY88Eub7xbSVRMdfT',
+ 'nbcu_suddenlink_local_web_prod_20fb711725cac224baa1c1cb0b1c324d25e97178': '0Rph41lPXZbb3fqeXtHjjbxfSrNbtZp1Ygq7Jypa',
+ 'nbcu_wow_local_web_prod_652d9ce4f552d9c2e7b5b1ed37b8cb48155174ad': 'qayIBZ70w1dItm2zS42AptXnxW15mkjRrwnBjMPv',
+ 'nbcu_centurylink_local_web_prod_2034402b029bf3e837ad46814d9e4b1d1345ccd5': 'StePcPMkjsX51PcizLdLRMzxMEl5k2FlsMLUNV4k',
+ 'nbcu_atlanticbrd_local_web_prod_8d5f5ecbf7f7b2f5e6d908dd75d90ae3565f682e': 'NtYLb4TFUS0pRs3XTkyO5sbVGYjVf17bVbjaGscI',
+ 'nbcu_nbcd_watchvod_web_dev_08bc05699be47c4f31d5080263a8cfadc16d0f7c': 'hwxi2dgDoSWgfmVVXOYZm14uuvku4QfopstXckhr',
+ 'anvato_nextmedia_app_web_prod_a4fa8c7204aa65e71044b57aaf63711980cfe5a0': 'tQN1oGPYY1nM85rJYePWGcIb92TG0gSqoVpQTWOw',
+ 'anvato_mcp_lin_web_prod_4c36fbfd4d8d8ecae6488656e21ac6d1ac972749': 'GUXNf5ZDX2jFUpu4WT2Go4DJ5nhUCzpnwDRRUx1K',
+ 'anvato_mcp_univision_web_prod_37fe34850c99a3b5cdb71dab10a417dd5cdecafa': 'bLDYF8JqfG42b7bwKEgQiU9E2LTIAtnKzSgYpFUH',
+ 'anvato_mcp_fs2go_web_prod_c7b90a93e171469cdca00a931211a2f556370d0a': 'icgGoYGipQMMSEvhplZX1pwbN69srwKYWksz3xWK',
+ 'anvato_mcp_sps_web_prod_54bdc90dd6ba21710e9f7074338365bba28da336': 'fA2iQdI7RDpynqzQYIpXALVS83NTPr8LLFK4LFsu',
+ 'anvato_mcp_anv_web_prod_791407490f4c1ef2a4bcb21103e0cb1bcb3352b3': 'rMOUZqe9lwcGq2mNgG3EDusm6lKgsUnczoOX3mbg',
+ 'anvato_mcp_gray_web_prod_4c10f067c393ed8fc453d3930f8ab2b159973900': 'rMOUZqe9lwcGq2mNgG3EDusm6lKgsUnczoOX3mbg',
+ 'anvato_mcp_hearst_web_prod_5356c3de0fc7c90a3727b4863ca7fec3a4524a99': 'P3uXJ0fXXditBPCGkfvlnVScpPEfKmc64Zv7ZgbK',
+ 'anvato_mcp_cbs_web_prod_02f26581ff80e5bda7aad28226a8d369037f2cbe': 'mGPvo5ZA5SgjOFAPEPXv7AnOpFUICX8hvFQVz69n',
+ 'anvato_mcp_telemundo_web_prod_c5278d51ad46fda4b6ca3d0ea44a7846a054f582': 'qyT6PXXLjVNCrHaRVj0ugAhalNRS7Ee9BP7LUokD',
+ 'nbcu_nbcd_watchvodv4_web_stage_4108362fba2d4ede21f262fea3c4162cbafd66c7': 'DhaU5lj0W2gEdcSSsnxURq8t7KIWtJfD966crVDk',
+ 'anvato_scripps_app_ios_prod_409c41960c60b308db43c3cc1da79cab9f1c3d93': 'WPxj5GraLTkYCyj3M7RozLqIycjrXOEcDGFMIJPn',
+ 'EZqvRyKBJLrgpClDPDF8I7Xpdp40Vx73': '4OxGd2dEakylntVKjKF0UK9PDPYB6A9W',
+ 'M2v78QkpleXm9hPp9jUXI63x5vA6BogR': 'ka6K32k7ZALmpINkjJUGUo0OE42Md1BQ',
+ 'nbcu_nbcd_desktop_web_prod_93d8ead38ce2024f8f544b78306fbd15895ae5e6_secure': 'NNemUkySjxLyPTKvZRiGntBIjEyK8uqicjMakIaQ'
+ }
+
+ _AUTH_KEY = b'\x31\xc2\x42\x84\x9e\x73\xa0\xce'
+
+ def __init__(self, *args, **kwargs):
+ super(AnvatoIE, self).__init__(*args, **kwargs)
+ self.__server_time = None
+
+ def _server_time(self, access_key, video_id):
+ if self.__server_time is not None:
+ return self.__server_time
+
+ self.__server_time = int(self._download_json(
+ self._api_prefix(access_key) + 'server_time?anvack=' + access_key, video_id,
+ note='Fetching server time')['server_time'])
+
+ return self.__server_time
+
+ def _api_prefix(self, access_key):
+ return 'https://tkx2-%s.anvato.net/rest/v2/' % ('prod' if 'prod' in access_key else 'stage')
+
+ def _get_video_json(self, access_key, video_id):
+ # See et() in anvplayer.min.js, which is an alias of getVideoJSON()
+ video_data_url = self._api_prefix(access_key) + 'mcp/video/%s?anvack=%s' % (video_id, access_key)
+ server_time = self._server_time(access_key, video_id)
+ input_data = '%d~%s~%s' % (server_time, md5_text(video_data_url), md5_text(server_time))
+
+ auth_secret = intlist_to_bytes(aes_encrypt(
+ bytes_to_intlist(input_data[:64]), bytes_to_intlist(self._AUTH_KEY)))
+
+ video_data_url += '&X-Anvato-Adst-Auth=' + base64.b64encode(auth_secret).decode('ascii')
+ anvrid = md5_text(time.time() * 1000 * random.random())[:30]
+ payload = {
+ 'api': {
+ 'anvrid': anvrid,
+ 'anvstk': md5_text('%s|%s|%d|%s' % (
+ access_key, anvrid, server_time, self._ANVACK_TABLE[access_key])),
+ 'anvts': server_time,
+ },
+ }
+
+ return self._download_json(
+ video_data_url, video_id, transform_source=strip_jsonp,
+ data=json.dumps(payload).encode('utf-8'))
+
+ def _extract_anvato_videos(self, webpage, video_id):
+ anvplayer_data = self._parse_json(self._html_search_regex(
+ r'<script[^>]+data-anvp=\'([^\']+)\'', webpage,
+ 'Anvato player data'), video_id)
+
+ video_id = anvplayer_data['video']
+ access_key = anvplayer_data['accessKey']
+
+ video_data = self._get_video_json(access_key, video_id)
+
+ formats = []
+ for published_url in video_data['published_urls']:
+ video_url = published_url['embed_url']
+ ext = determine_ext(video_url)
+
+ if ext == 'smil':
+ formats.extend(self._extract_smil_formats(video_url, video_id))
+ continue
+
+ tbr = int_or_none(published_url.get('kbps'))
+ a_format = {
+ 'url': video_url,
+ 'format_id': ('-'.join(filter(None, ['http', published_url.get('cdn_name')]))).lower(),
+ 'tbr': tbr if tbr != 0 else None,
+ }
+
+ if ext == 'm3u8':
+ # Not using _extract_m3u8_formats here as individual media
+ # playlists are also included in published_urls.
+ if tbr is None:
+ formats.append(self._m3u8_meta_format(video_url, ext='mp4', m3u8_id='hls'))
+ continue
+ else:
+ a_format.update({
+ 'format_id': '-'.join(filter(None, ['hls', compat_str(tbr)])),
+ 'ext': 'mp4',
+ })
+ elif ext == 'mp3':
+ a_format['vcodec'] = 'none'
+ else:
+ a_format.update({
+ 'width': int_or_none(published_url.get('width')),
+ 'height': int_or_none(published_url.get('height')),
+ })
+ formats.append(a_format)
+
+ self._sort_formats(formats)
+
+ subtitles = {}
+ for caption in video_data.get('captions', []):
+ a_caption = {
+ 'url': caption['url'],
+ 'ext': 'tt' if caption.get('format') == 'SMPTE-TT' else None
+ }
+ subtitles.setdefault(caption['language'], []).append(a_caption)
+
+ return {
+ 'id': video_id,
+ 'formats': formats,
+ 'title': video_data.get('def_title'),
+ 'description': video_data.get('def_description'),
+ 'categories': video_data.get('categories'),
+ 'thumbnail': video_data.get('thumbnail'),
+ 'subtitles': subtitles,
+ }
diff --git a/youtube_dl/extractor/aol.py b/youtube_dl/extractor/aol.py
index b51eafc..42c21bf 100644
--- a/youtube_dl/extractor/aol.py
+++ b/youtube_dl/extractor/aol.py
@@ -1,70 +1,133 @@
+# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ int_or_none,
+)
class AolIE(InfoExtractor):
IE_NAME = 'on.aol.com'
- _VALID_URL = r'''(?x)
- (?:
- aol-video:|
- http://on\.aol\.com/
- (?:
- video/.*-|
- playlist/(?P<playlist_display_id>[^/?#]+?)-(?P<playlist_id>[0-9]+)[?#].*_videoid=
- )
- )
- (?P<id>[0-9]+)
- (?:$|\?)
- '''
+ _VALID_URL = r'(?:aol-video:|https?://on\.aol\.com/(?:[^/]+/)*(?:[^/?#&]+-)?)(?P<id>[^/?#&]+)'
_TESTS = [{
+ # video with 5min ID
'url': 'http://on.aol.com/video/u-s--official-warns-of-largest-ever-irs-phone-scam-518167793?icid=OnHomepageC2Wide_MustSee_Img',
'md5': '18ef68f48740e86ae94b98da815eec42',
'info_dict': {
'id': '518167793',
'ext': 'mp4',
'title': 'U.S. Official Warns Of \'Largest Ever\' IRS Phone Scam',
+ 'description': 'A major phone scam has cost thousands of taxpayers more than $1 million, with less than a month until income tax returns are due to the IRS.',
+ 'timestamp': 1395405060,
+ 'upload_date': '20140321',
+ 'uploader': 'Newsy Studio',
},
- 'add_ie': ['FiveMin'],
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ }
}, {
- 'url': 'http://on.aol.com/playlist/brace-yourself---todays-weirdest-news-152147?icid=OnHomepageC4_Omg_Img#_videoid=518184316',
+ # video with vidible ID
+ 'url': 'http://on.aol.com/video/netflix-is-raising-rates-5707d6b8e4b090497b04f706?context=PC:homepage:PL1944:1460189336183',
'info_dict': {
- 'id': '152147',
- 'title': 'Brace Yourself - Today\'s Weirdest News',
+ 'id': '5707d6b8e4b090497b04f706',
+ 'ext': 'mp4',
+ 'title': 'Netflix is Raising Rates',
+ 'description': 'Netflix is rewarding millions of it’s long-standing members with an increase in cost. Veuer’s Carly Figueroa has more.',
+ 'upload_date': '20160408',
+ 'timestamp': 1460123280,
+ 'uploader': 'Veuer',
},
- 'playlist_mincount': 10,
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ }
+ }, {
+ 'url': 'http://on.aol.com/partners/abc-551438d309eab105804dbfe8/sneak-peek-was-haley-really-framed-570eaebee4b0448640a5c944',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://on.aol.com/shows/park-bench-shw518173474-559a1b9be4b0c3bfad3357a7?context=SH:SHW518173474:PL4327:1460619712763',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://on.aol.com/video/519442220',
+ 'only_matching': True,
+ }, {
+ 'url': 'aol-video:5707d6b8e4b090497b04f706',
+ 'only_matching': True,
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
- playlist_id = mobj.group('playlist_id')
- if not playlist_id or self._downloader.params.get('noplaylist'):
- return self.url_result('5min:%s' % video_id)
+ video_id = self._match_id(url)
- self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
+ response = self._download_json(
+ 'https://feedapi.b2c.on.aol.com/v1.0/app/videos/aolon/%s/details' % video_id,
+ video_id)['response']
+ if response['statusText'] != 'Ok':
+ raise ExtractorError('%s said: %s' % (self.IE_NAME, response['statusText']), expected=True)
- webpage = self._download_webpage(url, playlist_id)
- title = self._html_search_regex(
- r'<h1 class="video-title[^"]*">(.+?)</h1>', webpage, 'title')
- playlist_html = self._search_regex(
- r"(?s)<ul\s+class='video-related[^']*'>(.*?)</ul>", webpage,
- 'playlist HTML')
- entries = [{
- '_type': 'url',
- 'url': 'aol-video:%s' % m.group('id'),
- 'ie_key': 'Aol',
- } for m in re.finditer(
- r"<a\s+href='.*videoid=(?P<id>[0-9]+)'\s+class='video-thumb'>",
- playlist_html)]
+ video_data = response['data']
+ formats = []
+ m3u8_url = video_data.get('videoMasterPlaylist')
+ if m3u8_url:
+ formats.extend(self._extract_m3u8_formats(
+ m3u8_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
+ for rendition in video_data.get('renditions', []):
+ video_url = rendition.get('url')
+ if not video_url:
+ continue
+ ext = rendition.get('format')
+ if ext == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ video_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
+ else:
+ f = {
+ 'url': video_url,
+ 'format_id': rendition.get('quality'),
+ }
+ mobj = re.search(r'(\d+)x(\d+)', video_url)
+ if mobj:
+ f.update({
+ 'width': int(mobj.group(1)),
+ 'height': int(mobj.group(2)),
+ })
+ formats.append(f)
+ self._sort_formats(formats, ('width', 'height', 'tbr', 'format_id'))
return {
- '_type': 'playlist',
- 'id': playlist_id,
- 'display_id': mobj.group('playlist_display_id'),
- 'title': title,
- 'entries': entries,
+ 'id': video_id,
+ 'title': video_data['title'],
+ 'duration': int_or_none(video_data.get('duration')),
+ 'timestamp': int_or_none(video_data.get('publishDate')),
+ 'view_count': int_or_none(video_data.get('views')),
+ 'description': video_data.get('description'),
+ 'uploader': video_data.get('videoOwner'),
+ 'formats': formats,
}
+
+
+class AolFeaturesIE(InfoExtractor):
+ IE_NAME = 'features.aol.com'
+ _VALID_URL = r'https?://features\.aol\.com/video/(?P<id>[^/?#]+)'
+
+ _TESTS = [{
+ 'url': 'http://features.aol.com/video/behind-secret-second-careers-late-night-talk-show-hosts',
+ 'md5': '7db483bb0c09c85e241f84a34238cc75',
+ 'info_dict': {
+ 'id': '519507715',
+ 'ext': 'mp4',
+ 'title': 'What To Watch - February 17, 2016',
+ },
+ 'add_ie': ['FiveMin'],
+ }]
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
+ return self.url_result(self._search_regex(
+ r'<script type="text/javascript" src="(https?://[^/]*?5min\.com/Scripts/PlayerSeed\.js[^"]+)"',
+ webpage, '5min embed url'), 'FiveMin')
diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py
index be40f85..a6801f3 100644
--- a/youtube_dl/extractor/appletrailers.py
+++ b/youtube_dl/extractor/appletrailers.py
@@ -7,6 +7,8 @@ from .common import InfoExtractor
from ..compat import compat_urlparse
from ..utils import (
int_or_none,
+ parse_duration,
+ unified_strdate,
)
@@ -16,7 +18,8 @@ class AppleTrailersIE(InfoExtractor):
_TESTS = [{
'url': 'http://trailers.apple.com/trailers/wb/manofsteel/',
'info_dict': {
- 'id': 'manofsteel',
+ 'id': '5111',
+ 'title': 'Man of Steel',
},
'playlist': [
{
@@ -70,6 +73,15 @@ class AppleTrailersIE(InfoExtractor):
'id': 'blackthorn',
},
'playlist_mincount': 2,
+ 'expected_warnings': ['Unable to download JSON metadata'],
+ }, {
+ # json data only available from http://trailers.apple.com/trailers/feeds/data/15881.json
+ 'url': 'http://trailers.apple.com/trailers/fox/kungfupanda3/',
+ 'info_dict': {
+ 'id': '15881',
+ 'title': 'Kung Fu Panda 3',
+ },
+ 'playlist_mincount': 4,
}, {
'url': 'http://trailers.apple.com/ca/metropole/autrui/',
'only_matching': True,
@@ -85,6 +97,45 @@ class AppleTrailersIE(InfoExtractor):
movie = mobj.group('movie')
uploader_id = mobj.group('company')
+ webpage = self._download_webpage(url, movie)
+ film_id = self._search_regex(r"FilmId\s*=\s*'(\d+)'", webpage, 'film id')
+ film_data = self._download_json(
+ 'http://trailers.apple.com/trailers/feeds/data/%s.json' % film_id,
+ film_id, fatal=False)
+
+ if film_data:
+ entries = []
+ for clip in film_data.get('clips', []):
+ clip_title = clip['title']
+
+ formats = []
+ for version, version_data in clip.get('versions', {}).items():
+ for size, size_data in version_data.get('sizes', {}).items():
+ src = size_data.get('src')
+ if not src:
+ continue
+ formats.append({
+ 'format_id': '%s-%s' % (version, size),
+ 'url': re.sub(r'_(\d+p.mov)', r'_h\1', src),
+ 'width': int_or_none(size_data.get('width')),
+ 'height': int_or_none(size_data.get('height')),
+ 'language': version[:2],
+ })
+ self._sort_formats(formats)
+
+ entries.append({
+ 'id': movie + '-' + re.sub(r'[^a-zA-Z0-9]', '', clip_title).lower(),
+ 'formats': formats,
+ 'title': clip_title,
+ 'thumbnail': clip.get('screen') or clip.get('thumb'),
+ 'duration': parse_duration(clip.get('runtime') or clip.get('faded')),
+ 'upload_date': unified_strdate(clip.get('posted')),
+ 'uploader_id': uploader_id,
+ })
+
+ page_data = film_data.get('page', {})
+ return self.playlist_result(entries, film_id, page_data.get('movie_title'))
+
playlist_url = compat_urlparse.urljoin(url, 'includes/playlists/itunes.inc')
def fix_html(s):
diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py
index 9fb8491..fd45b3e 100644
--- a/youtube_dl/extractor/ard.py
+++ b/youtube_dl/extractor/ard.py
@@ -8,7 +8,6 @@ from .generic import GenericIE
from ..utils import (
determine_ext,
ExtractorError,
- get_element_by_attribute,
qualities,
int_or_none,
parse_duration,
@@ -83,7 +82,7 @@ class ARDMediathekIE(InfoExtractor):
subtitle_url = media_info.get('_subtitleUrl')
if subtitle_url:
subtitles['de'] = [{
- 'ext': 'srt',
+ 'ext': 'ttml',
'url': subtitle_url,
}]
@@ -274,41 +273,3 @@ class ARDIE(InfoExtractor):
'upload_date': upload_date,
'thumbnail': thumbnail,
}
-
-
-class SportschauIE(ARDMediathekIE):
- IE_NAME = 'Sportschau'
- _VALID_URL = r'(?P<baseurl>https?://(?:www\.)?sportschau\.de/(?:[^/]+/)+video(?P<id>[^/#?]+))\.html'
- _TESTS = [{
- 'url': 'http://www.sportschau.de/tourdefrance/videoseppeltkokainhatnichtsmitklassischemdopingzutun100.html',
- 'info_dict': {
- 'id': 'seppeltkokainhatnichtsmitklassischemdopingzutun100',
- 'ext': 'mp4',
- 'title': 'Seppelt: "Kokain hat nichts mit klassischem Doping zu tun"',
- 'thumbnail': 're:^https?://.*\.jpg$',
- 'description': 'Der ARD-Doping Experte Hajo Seppelt gibt seine Einschätzung zum ersten Dopingfall der diesjährigen Tour de France um den Italiener Luca Paolini ab.',
- },
- 'params': {
- # m3u8 download
- 'skip_download': True,
- },
- }]
-
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
- base_url = mobj.group('baseurl')
-
- webpage = self._download_webpage(url, video_id)
- title = get_element_by_attribute('class', 'headline', webpage)
- description = self._html_search_meta('description', webpage, 'description')
-
- info = self._extract_media_info(
- base_url + '-mc_defaultQuality-h.json', webpage, video_id)
-
- info.update({
- 'title': title,
- 'description': description,
- })
-
- return info
diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py
index efde7e2..049f1fa 100644
--- a/youtube_dl/extractor/arte.py
+++ b/youtube_dl/extractor/arte.py
@@ -23,7 +23,7 @@ from ..utils import (
class ArteTvIE(InfoExtractor):
- _VALID_URL = r'http://videos\.arte\.tv/(?P<lang>fr|de|en|es)/.*-(?P<id>.*?)\.html'
+ _VALID_URL = r'https?://videos\.arte\.tv/(?P<lang>fr|de|en|es)/.*-(?P<id>.*?)\.html'
IE_NAME = 'arte.tv'
def _real_extract(self, url):
@@ -61,10 +61,7 @@ class ArteTvIE(InfoExtractor):
}
-class ArteTVPlus7IE(InfoExtractor):
- IE_NAME = 'arte.tv:+7'
- _VALID_URL = r'https?://(?:www\.)?arte\.tv/guide/(?P<lang>fr|de|en|es)/(?:(?:sendungen|emissions|embed)/)?(?P<id>[^/]+)/(?P<name>[^/?#&+])'
-
+class ArteTVBaseIE(InfoExtractor):
@classmethod
def _extract_url_info(cls, url):
mobj = re.match(cls._VALID_URL, url)
@@ -78,58 +75,7 @@ class ArteTVPlus7IE(InfoExtractor):
video_id = mobj.group('id')
return video_id, lang
- def _real_extract(self, url):
- video_id, lang = self._extract_url_info(url)
- webpage = self._download_webpage(url, video_id)
- return self._extract_from_webpage(webpage, video_id, lang)
-
- def _extract_from_webpage(self, webpage, video_id, lang):
- patterns_templates = (r'arte_vp_url=["\'](.*?%s.*?)["\']', r'data-url=["\']([^"]+%s[^"]+)["\']')
- ids = (video_id, '')
- # some pages contain multiple videos (like
- # http://www.arte.tv/guide/de/sendungen/XEN/xenius/?vid=055918-015_PLUS7-D),
- # so we first try to look for json URLs that contain the video id from
- # the 'vid' parameter.
- patterns = [t % re.escape(_id) for _id in ids for t in patterns_templates]
- json_url = self._html_search_regex(
- patterns, webpage, 'json vp url', default=None)
- if not json_url:
- def find_iframe_url(webpage, default=NO_DEFAULT):
- return self._html_search_regex(
- r'<iframe[^>]+src=(["\'])(?P<url>.+\bjson_url=.+?)\1',
- webpage, 'iframe url', group='url', default=default)
-
- iframe_url = find_iframe_url(webpage, None)
- if not iframe_url:
- embed_url = self._html_search_regex(
- r'arte_vp_url_oembed=\'([^\']+?)\'', webpage, 'embed url', default=None)
- if embed_url:
- player = self._download_json(
- embed_url, video_id, 'Downloading player page')
- iframe_url = find_iframe_url(player['html'])
- # en and es URLs produce react-based pages with different layout (e.g.
- # http://www.arte.tv/guide/en/053330-002-A/carnival-italy?zone=world)
- if not iframe_url:
- program = self._search_regex(
- r'program\s*:\s*({.+?["\']embed_html["\'].+?}),?\s*\n',
- webpage, 'program', default=None)
- if program:
- embed_html = self._parse_json(program, video_id)
- if embed_html:
- iframe_url = find_iframe_url(embed_html['embed_html'])
- if iframe_url:
- json_url = compat_parse_qs(
- compat_urllib_parse_urlparse(iframe_url).query)['json_url'][0]
- if json_url:
- return self._extract_from_json_url(json_url, video_id, lang)
- # Differend kind of embed URL (e.g.
- # http://www.arte.tv/magazine/trepalium/fr/episode-0406-replay-trepalium)
- embed_url = self._search_regex(
- r'<iframe[^>]+src=(["\'])(?P<url>.+?)\1',
- webpage, 'embed url', group='url')
- return self.url_result(embed_url)
-
- def _extract_from_json_url(self, json_url, video_id, lang):
+ def _extract_from_json_url(self, json_url, video_id, lang, title=None):
info = self._download_json(json_url, video_id)
player_info = info['videoJsonPlayer']
@@ -137,7 +83,7 @@ class ArteTVPlus7IE(InfoExtractor):
if not upload_date_str:
upload_date_str = (player_info.get('VRA') or player_info.get('VDA') or '').split(' ')[0]
- title = player_info['VTI'].strip()
+ title = (player_info.get('VTI') or title or player_info['VID']).strip()
subtitle = player_info.get('VSU', '').strip()
if subtitle:
title += ' - %s' % subtitle
@@ -158,24 +104,53 @@ class ArteTVPlus7IE(InfoExtractor):
'es': 'E[ESP]',
}
+ langcode = LANGS.get(lang, lang)
+
formats = []
for format_id, format_dict in player_info['VSR'].items():
f = dict(format_dict)
versionCode = f.get('versionCode')
- langcode = LANGS.get(lang, lang)
- lang_rexs = [r'VO?%s-' % re.escape(langcode), r'VO?.-ST%s$' % re.escape(langcode)]
- lang_pref = None
- if versionCode:
- matched_lang_rexs = [r for r in lang_rexs if re.match(r, versionCode)]
- lang_pref = -10 if not matched_lang_rexs else 10 * len(matched_lang_rexs)
- source_pref = 0
- if versionCode is not None:
- # The original version with subtitles has lower relevance
- if re.match(r'VO-ST(F|A|E)', versionCode):
- source_pref -= 10
- # The version with sourds/mal subtitles has also lower relevance
- elif re.match(r'VO?(F|A|E)-STM\1', versionCode):
- source_pref -= 9
+ l = re.escape(langcode)
+
+ # Language preference from most to least priority
+ # Reference: section 5.6.3 of
+ # http://www.arte.tv/sites/en/corporate/files/complete-technical-guidelines-arte-geie-v1-05.pdf
+ PREFERENCES = (
+ # original version in requested language, without subtitles
+ r'VO{0}$'.format(l),
+ # original version in requested language, with partial subtitles in requested language
+ r'VO{0}-ST{0}$'.format(l),
+ # original version in requested language, with subtitles for the deaf and hard-of-hearing in requested language
+ r'VO{0}-STM{0}$'.format(l),
+ # non-original (dubbed) version in requested language, without subtitles
+ r'V{0}$'.format(l),
+ # non-original (dubbed) version in requested language, with subtitles partial subtitles in requested language
+ r'V{0}-ST{0}$'.format(l),
+ # non-original (dubbed) version in requested language, with subtitles for the deaf and hard-of-hearing in requested language
+ r'V{0}-STM{0}$'.format(l),
+ # original version in requested language, with partial subtitles in different language
+ r'VO{0}-ST(?!{0}).+?$'.format(l),
+ # original version in requested language, with subtitles for the deaf and hard-of-hearing in different language
+ r'VO{0}-STM(?!{0}).+?$'.format(l),
+ # original version in different language, with partial subtitles in requested language
+ r'VO(?:(?!{0}).+?)?-ST{0}$'.format(l),
+ # original version in different language, with subtitles for the deaf and hard-of-hearing in requested language
+ r'VO(?:(?!{0}).+?)?-STM{0}$'.format(l),
+ # original version in different language, without subtitles
+ r'VO(?:(?!{0}))?$'.format(l),
+ # original version in different language, with partial subtitles in different language
+ r'VO(?:(?!{0}).+?)?-ST(?!{0}).+?$'.format(l),
+ # original version in different language, with subtitles for the deaf and hard-of-hearing in different language
+ r'VO(?:(?!{0}).+?)?-STM(?!{0}).+?$'.format(l),
+ )
+
+ for pref, p in enumerate(PREFERENCES):
+ if re.match(p, versionCode):
+ lang_pref = len(PREFERENCES) - pref
+ break
+ else:
+ lang_pref = -1
+
format = {
'format_id': format_id,
'preference': -10 if f.get('videoFormat') == 'M3U8' else None,
@@ -185,7 +160,6 @@ class ArteTVPlus7IE(InfoExtractor):
'height': int_or_none(f.get('height')),
'tbr': int_or_none(f.get('bitrate')),
'quality': qfunc(f.get('quality')),
- 'source_preference': source_pref,
}
if f.get('mediaType') == 'rtmp':
@@ -204,28 +178,112 @@ class ArteTVPlus7IE(InfoExtractor):
return info_dict
+class ArteTVPlus7IE(ArteTVBaseIE):
+ IE_NAME = 'arte.tv:+7'
+ _VALID_URL = r'https?://(?:(?:www|sites)\.)?arte\.tv/[^/]+/(?P<lang>fr|de|en|es)/(?:[^/]+/)*(?P<id>[^/?#&]+)'
+
+ _TESTS = [{
+ 'url': 'http://www.arte.tv/guide/de/sendungen/XEN/xenius/?vid=055918-015_PLUS7-D',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://sites.arte.tv/karambolage/de/video/karambolage-22',
+ 'only_matching': True,
+ }]
+
+ @classmethod
+ def suitable(cls, url):
+ return False if ArteTVPlaylistIE.suitable(url) else super(ArteTVPlus7IE, cls).suitable(url)
+
+ def _real_extract(self, url):
+ video_id, lang = self._extract_url_info(url)
+ webpage = self._download_webpage(url, video_id)
+ return self._extract_from_webpage(webpage, video_id, lang)
+
+ def _extract_from_webpage(self, webpage, video_id, lang):
+ patterns_templates = (r'arte_vp_url=["\'](.*?%s.*?)["\']', r'data-url=["\']([^"]+%s[^"]+)["\']')
+ ids = (video_id, '')
+ # some pages contain multiple videos (like
+ # http://www.arte.tv/guide/de/sendungen/XEN/xenius/?vid=055918-015_PLUS7-D),
+ # so we first try to look for json URLs that contain the video id from
+ # the 'vid' parameter.
+ patterns = [t % re.escape(_id) for _id in ids for t in patterns_templates]
+ json_url = self._html_search_regex(
+ patterns, webpage, 'json vp url', default=None)
+ if not json_url:
+ def find_iframe_url(webpage, default=NO_DEFAULT):
+ return self._html_search_regex(
+ r'<iframe[^>]+src=(["\'])(?P<url>.+\bjson_url=.+?)\1',
+ webpage, 'iframe url', group='url', default=default)
+
+ iframe_url = find_iframe_url(webpage, None)
+ if not iframe_url:
+ embed_url = self._html_search_regex(
+ r'arte_vp_url_oembed=\'([^\']+?)\'', webpage, 'embed url', default=None)
+ if embed_url:
+ player = self._download_json(
+ embed_url, video_id, 'Downloading player page')
+ iframe_url = find_iframe_url(player['html'])
+ # en and es URLs produce react-based pages with different layout (e.g.
+ # http://www.arte.tv/guide/en/053330-002-A/carnival-italy?zone=world)
+ if not iframe_url:
+ program = self._search_regex(
+ r'program\s*:\s*({.+?["\']embed_html["\'].+?}),?\s*\n',
+ webpage, 'program', default=None)
+ if program:
+ embed_html = self._parse_json(program, video_id)
+ if embed_html:
+ iframe_url = find_iframe_url(embed_html['embed_html'])
+ if iframe_url:
+ json_url = compat_parse_qs(
+ compat_urllib_parse_urlparse(iframe_url).query)['json_url'][0]
+ if json_url:
+ title = self._search_regex(
+ r'<h3[^>]+title=(["\'])(?P<title>.+?)\1',
+ webpage, 'title', default=None, group='title')
+ return self._extract_from_json_url(json_url, video_id, lang, title=title)
+ # Different kind of embed URL (e.g.
+ # http://www.arte.tv/magazine/trepalium/fr/episode-0406-replay-trepalium)
+ entries = [
+ self.url_result(url)
+ for _, url in re.findall(r'<iframe[^>]+src=(["\'])(?P<url>.+?)\1', webpage)]
+ return self.playlist_result(entries)
+
+
# It also uses the arte_vp_url url from the webpage to extract the information
class ArteTVCreativeIE(ArteTVPlus7IE):
IE_NAME = 'arte.tv:creative'
- _VALID_URL = r'https?://creative\.arte\.tv/(?P<lang>fr|de|en|es)/(?:magazine?/)?(?P<id>[^/?#&]+)'
+ _VALID_URL = r'https?://creative\.arte\.tv/(?P<lang>fr|de|en|es)/(?:[^/]+/)*(?P<id>[^/?#&]+)'
_TESTS = [{
- 'url': 'http://creative.arte.tv/de/magazin/agentur-amateur-corporate-design',
+ 'url': 'http://creative.arte.tv/fr/episode/osmosis-episode-1',
'info_dict': {
- 'id': '72176',
+ 'id': '057405-001-A',
'ext': 'mp4',
- 'title': 'Folge 2 - Corporate Design',
- 'upload_date': '20131004',
+ 'title': 'OSMOSIS - N\'AYEZ PLUS PEUR D\'AIMER (1)',
+ 'upload_date': '20150716',
},
}, {
'url': 'http://creative.arte.tv/fr/Monty-Python-Reunion',
+ 'playlist_count': 11,
+ 'add_ie': ['Youtube'],
+ }, {
+ 'url': 'http://creative.arte.tv/de/episode/agentur-amateur-4-der-erste-kunde',
+ 'only_matching': True,
+ }]
+
+
+class ArteTVInfoIE(ArteTVPlus7IE):
+ IE_NAME = 'arte.tv:info'
+ _VALID_URL = r'https?://info\.arte\.tv/(?P<lang>fr|de|en|es)/(?:[^/]+/)*(?P<id>[^/?#&]+)'
+
+ _TESTS = [{
+ 'url': 'http://info.arte.tv/fr/service-civique-un-cache-misere',
'info_dict': {
- 'id': '160676',
+ 'id': '067528-000-A',
'ext': 'mp4',
- 'title': 'Monty Python live (mostly)',
- 'description': 'Événement ! Quarante-cinq ans après leurs premiers succès, les légendaires Monty Python remontent sur scène.\n',
- 'upload_date': '20140805',
- }
+ 'title': 'Service civique, un cache misère ?',
+ 'upload_date': '20160403',
+ },
}]
@@ -251,6 +309,8 @@ class ArteTVDDCIE(ArteTVPlus7IE):
IE_NAME = 'arte.tv:ddc'
_VALID_URL = r'https?://ddc\.arte\.tv/(?P<lang>emission|folge)/(?P<id>[^/?#&]+)'
+ _TESTS = []
+
def _real_extract(self, url):
video_id, lang = self._extract_url_info(url)
if lang == 'folge':
@@ -269,7 +329,7 @@ class ArteTVConcertIE(ArteTVPlus7IE):
IE_NAME = 'arte.tv:concert'
_VALID_URL = r'https?://concert\.arte\.tv/(?P<lang>fr|de|en|es)/(?P<id>[^/?#&]+)'
- _TEST = {
+ _TESTS = [{
'url': 'http://concert.arte.tv/de/notwist-im-pariser-konzertclub-divan-du-monde',
'md5': '9ea035b7bd69696b67aa2ccaaa218161',
'info_dict': {
@@ -279,24 +339,23 @@ class ArteTVConcertIE(ArteTVPlus7IE):
'upload_date': '20140128',
'description': 'md5:486eb08f991552ade77439fe6d82c305',
},
- }
+ }]
class ArteTVCinemaIE(ArteTVPlus7IE):
IE_NAME = 'arte.tv:cinema'
_VALID_URL = r'https?://cinema\.arte\.tv/(?P<lang>fr|de|en|es)/(?P<id>.+)'
- _TEST = {
- 'url': 'http://cinema.arte.tv/de/node/38291',
- 'md5': '6b275511a5107c60bacbeeda368c3aa1',
+ _TESTS = [{
+ 'url': 'http://cinema.arte.tv/fr/article/les-ailes-du-desir-de-julia-reck',
+ 'md5': 'a5b9dd5575a11d93daf0e3f404f45438',
'info_dict': {
- 'id': '055876-000_PWA12025-D',
+ 'id': '062494-000-A',
'ext': 'mp4',
- 'title': 'Tod auf dem Nil',
- 'upload_date': '20160122',
- 'description': 'md5:7f749bbb77d800ef2be11d54529b96bc',
+ 'title': 'Film lauréat du concours web - "Les ailes du désir" de Julia Reck',
+ 'upload_date': '20150807',
},
- }
+ }]
class ArteTVMagazineIE(ArteTVPlus7IE):
@@ -334,16 +393,48 @@ class ArteTVEmbedIE(ArteTVPlus7IE):
IE_NAME = 'arte.tv:embed'
_VALID_URL = r'''(?x)
http://www\.arte\.tv
- /playerv2/embed\.php\?json_url=
+ /(?:playerv2/embed|arte_vp/index)\.php\?json_url=
(?P<json_url>
http://arte\.tv/papi/tvguide/videos/stream/player/
(?P<lang>[^/]+)/(?P<id>[^/]+)[^&]*
)
'''
+ _TESTS = []
+
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
lang = mobj.group('lang')
json_url = mobj.group('json_url')
return self._extract_from_json_url(json_url, video_id, lang)
+
+
+class ArteTVPlaylistIE(ArteTVBaseIE):
+ IE_NAME = 'arte.tv:playlist'
+ _VALID_URL = r'https?://(?:www\.)?arte\.tv/guide/(?P<lang>fr|de|en|es)/[^#]*#collection/(?P<id>PL-\d+)'
+
+ _TESTS = [{
+ 'url': 'http://www.arte.tv/guide/de/plus7/?country=DE#collection/PL-013263/ARTETV',
+ 'info_dict': {
+ 'id': 'PL-013263',
+ 'title': 'Areva & Uramin',
+ },
+ 'playlist_mincount': 6,
+ }, {
+ 'url': 'http://www.arte.tv/guide/de/playlists?country=DE#collection/PL-013190/ARTETV',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ playlist_id, lang = self._extract_url_info(url)
+ collection = self._download_json(
+ 'https://api.arte.tv/api/player/v1/collectionData/%s/%s?source=videos'
+ % (lang, playlist_id), playlist_id)
+ title = collection.get('title')
+ description = collection.get('shortDescription') or collection.get('teaserText')
+ entries = [
+ self._extract_from_json_url(
+ video['jsonUrl'], video.get('programId') or playlist_id, lang)
+ for video in collection['videos'] if video.get('jsonUrl')]
+ return self.playlist_result(entries, playlist_id, title, description)
diff --git a/youtube_dl/extractor/atresplayer.py b/youtube_dl/extractor/atresplayer.py
index b8f9ae0..d2f3889 100644
--- a/youtube_dl/extractor/atresplayer.py
+++ b/youtube_dl/extractor/atresplayer.py
@@ -6,16 +6,14 @@ import hashlib
import re
from .common import InfoExtractor
-from ..compat import (
- compat_str,
- compat_urllib_parse,
-)
+from ..compat import compat_str
from ..utils import (
- int_or_none,
+ ExtractorError,
float_or_none,
+ int_or_none,
sanitized_Request,
+ urlencode_postdata,
xpath_text,
- ExtractorError,
)
@@ -86,7 +84,7 @@ class AtresPlayerIE(InfoExtractor):
}
request = sanitized_Request(
- self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8'))
+ self._LOGIN_URL, urlencode_postdata(login_form))
request.add_header('Content-Type', 'application/x-www-form-urlencoded')
response = self._download_webpage(
request, None, 'Logging in as %s' % username)
diff --git a/youtube_dl/extractor/audimedia.py b/youtube_dl/extractor/audimedia.py
index 3b2effa..aa69256 100644
--- a/youtube_dl/extractor/audimedia.py
+++ b/youtube_dl/extractor/audimedia.py
@@ -10,9 +10,9 @@ from ..utils import (
class AudiMediaIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?audimedia\.tv/(?:en|de)/vid/(?P<id>[^/?#]+)'
+ _VALID_URL = r'https?://(?:www\.)?audi-mediacenter\.com/(?:en|de)/audimediatv/(?P<id>[^/?#]+)'
_TEST = {
- 'url': 'https://audimedia.tv/en/vid/60-seconds-of-audi-sport-104-2015-wec-bahrain-rookie-test',
+ 'url': 'https://www.audi-mediacenter.com/en/audimediatv/60-seconds-of-audi-sport-104-2015-wec-bahrain-rookie-test-1467',
'md5': '79a8b71c46d49042609795ab59779b66',
'info_dict': {
'id': '1565',
@@ -32,7 +32,10 @@ class AudiMediaIE(InfoExtractor):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
- raw_payload = self._search_regex(r'<script[^>]+class="amtv-embed"[^>]+id="([^"]+)"', webpage, 'raw payload')
+ raw_payload = self._search_regex([
+ r'class="amtv-embed"[^>]+id="([^"]+)"',
+ r'class=\\"amtv-embed\\"[^>]+id=\\"([^"]+)\\"',
+ ], webpage, 'raw payload')
_, stage_mode, video_id, lang = raw_payload.split('-')
# TODO: handle s and e stage_mode (live streams and ended live streams)
@@ -59,13 +62,19 @@ class AudiMediaIE(InfoExtractor):
video_version_url = video_version.get('download_url') or video_version.get('stream_url')
if not video_version_url:
continue
- formats.append({
+ f = {
'url': video_version_url,
'width': int_or_none(video_version.get('width')),
'height': int_or_none(video_version.get('height')),
'abr': int_or_none(video_version.get('audio_bitrate')),
'vbr': int_or_none(video_version.get('video_bitrate')),
- })
+ }
+ bitrate = self._search_regex(r'(\d+)k', video_version_url, 'bitrate', default=None)
+ if bitrate:
+ f.update({
+ 'format_id': 'http-%s' % bitrate,
+ })
+ formats.append(f)
self._sort_formats(formats)
return {
diff --git a/youtube_dl/extractor/audioboom.py b/youtube_dl/extractor/audioboom.py
new file mode 100644
index 0000000..2ec2d70
--- /dev/null
+++ b/youtube_dl/extractor/audioboom.py
@@ -0,0 +1,66 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import float_or_none
+
+
+class AudioBoomIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?audioboom\.com/boos/(?P<id>[0-9]+)'
+ _TEST = {
+ 'url': 'https://audioboom.com/boos/4279833-3-09-2016-czaban-hour-3?t=0',
+ 'md5': '63a8d73a055c6ed0f1e51921a10a5a76',
+ 'info_dict': {
+ 'id': '4279833',
+ 'ext': 'mp3',
+ 'title': '3/09/2016 Czaban Hour 3',
+ 'description': 'Guest: Nate Davis - NFL free agency, Guest: Stan Gans',
+ 'duration': 2245.72,
+ 'uploader': 'Steve Czaban',
+ 'uploader_url': 're:https?://(?:www\.)?audioboom\.com/channel/steveczabanyahoosportsradio',
+ }
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ clip = None
+
+ clip_store = self._parse_json(
+ self._search_regex(
+ r'data-new-clip-store=(["\'])(?P<json>{.*?"clipId"\s*:\s*%s.*?})\1' % video_id,
+ webpage, 'clip store', default='{}', group='json'),
+ video_id, fatal=False)
+ if clip_store:
+ clips = clip_store.get('clips')
+ if clips and isinstance(clips, list) and isinstance(clips[0], dict):
+ clip = clips[0]
+
+ def from_clip(field):
+ if clip:
+ clip.get(field)
+
+ audio_url = from_clip('clipURLPriorToLoading') or self._og_search_property(
+ 'audio', webpage, 'audio url')
+ title = from_clip('title') or self._og_search_title(webpage)
+ description = from_clip('description') or self._og_search_description(webpage)
+
+ duration = float_or_none(from_clip('duration') or self._html_search_meta(
+ 'weibo:audio:duration', webpage))
+
+ uploader = from_clip('author') or self._og_search_property(
+ 'audio:artist', webpage, 'uploader', fatal=False)
+ uploader_url = from_clip('author_url') or self._html_search_meta(
+ 'audioboo:channel', webpage, 'uploader url')
+
+ return {
+ 'id': video_id,
+ 'url': audio_url,
+ 'title': title,
+ 'description': description,
+ 'duration': duration,
+ 'uploader': uploader,
+ 'uploader_url': uploader_url,
+ }
diff --git a/youtube_dl/extractor/audiomack.py b/youtube_dl/extractor/audiomack.py
index 3eed912..f3bd4d4 100644
--- a/youtube_dl/extractor/audiomack.py
+++ b/youtube_dl/extractor/audiomack.py
@@ -6,6 +6,7 @@ import time
from .common import InfoExtractor
from .soundcloud import SoundcloudIE
+from ..compat import compat_str
from ..utils import (
ExtractorError,
url_basename,
@@ -30,14 +31,14 @@ class AudiomackIE(InfoExtractor):
# audiomack wrapper around soundcloud song
{
'add_ie': ['Soundcloud'],
- 'url': 'http://www.audiomack.com/song/xclusiveszone/take-kare',
+ 'url': 'http://www.audiomack.com/song/hip-hop-daily/black-mamba-freestyle',
'info_dict': {
- 'id': '172419696',
+ 'id': '258901379',
'ext': 'mp3',
- 'description': 'md5:1fc3272ed7a635cce5be1568c2822997',
- 'title': 'Young Thug ft Lil Wayne - Take Kare',
- 'uploader': 'Young Thug World',
- 'upload_date': '20141016',
+ 'description': 'mamba day freestyle for the legend Kobe Bryant ',
+ 'title': 'Black Mamba Freestyle [Prod. By Danny Wolf]',
+ 'uploader': 'ILOVEMAKONNEN',
+ 'upload_date': '20160414',
}
},
]
@@ -136,7 +137,7 @@ class AudiomackAlbumIE(InfoExtractor):
result[resultkey] = api_response[apikey]
song_id = url_basename(api_response['url']).rpartition('.')[0]
result['entries'].append({
- 'id': api_response.get('id', song_id),
+ 'id': compat_str(api_response.get('id', song_id)),
'uploader': api_response.get('artist'),
'title': api_response.get('title', song_id),
'url': api_response['url'],
diff --git a/youtube_dl/extractor/azubu.py b/youtube_dl/extractor/azubu.py
index 011edf1..a813eb4 100644
--- a/youtube_dl/extractor/azubu.py
+++ b/youtube_dl/extractor/azubu.py
@@ -46,6 +46,7 @@ class AzubuIE(InfoExtractor):
'uploader_id': 272749,
'view_count': int,
},
+ 'skip': 'Channel offline',
},
]
@@ -56,22 +57,26 @@ class AzubuIE(InfoExtractor):
'http://www.azubu.tv/api/video/%s' % video_id, video_id)['data']
title = data['title'].strip()
- description = data['description']
- thumbnail = data['thumbnail']
- view_count = data['view_count']
- uploader = data['user']['username']
- uploader_id = data['user']['id']
+ description = data.get('description')
+ thumbnail = data.get('thumbnail')
+ view_count = data.get('view_count')
+ user = data.get('user', {})
+ uploader = user.get('username')
+ uploader_id = user.get('id')
stream_params = json.loads(data['stream_params'])
- timestamp = float_or_none(stream_params['creationDate'], 1000)
- duration = float_or_none(stream_params['length'], 1000)
+ timestamp = float_or_none(stream_params.get('creationDate'), 1000)
+ duration = float_or_none(stream_params.get('length'), 1000)
renditions = stream_params.get('renditions') or []
video = stream_params.get('FLVFullLength') or stream_params.get('videoFullLength')
if video:
renditions.append(video)
+ if not renditions and not user.get('channel', {}).get('is_live', True):
+ raise ExtractorError('%s said: channel is offline.' % self.IE_NAME, expected=True)
+
formats = [{
'url': fmt['url'],
'width': fmt['frameWidth'],
@@ -98,7 +103,7 @@ class AzubuIE(InfoExtractor):
class AzubuLiveIE(InfoExtractor):
- _VALID_URL = r'http://www.azubu.tv/(?P<id>[^/]+)$'
+ _VALID_URL = r'https?://www.azubu.tv/(?P<id>[^/]+)$'
_TEST = {
'url': 'http://www.azubu.tv/MarsTVMDLen',
@@ -120,6 +125,7 @@ class AzubuLiveIE(InfoExtractor):
bc_info = self._download_json(req, user)
m3u8_url = next(source['src'] for source in bc_info['sources'] if source['container'] == 'M2TS')
formats = self._extract_m3u8_formats(m3u8_url, user, ext='mp4')
+ self._sort_formats(formats)
return {
'id': info['id'],
diff --git a/youtube_dl/extractor/baidu.py b/youtube_dl/extractor/baidu.py
index 76b21e5..234a661 100644
--- a/youtube_dl/extractor/baidu.py
+++ b/youtube_dl/extractor/baidu.py
@@ -9,7 +9,7 @@ from ..utils import unescapeHTML
class BaiduVideoIE(InfoExtractor):
IE_DESC = '百度视频'
- _VALID_URL = r'http://v\.baidu\.com/(?P<type>[a-z]+)/(?P<id>\d+)\.htm'
+ _VALID_URL = r'https?://v\.baidu\.com/(?P<type>[a-z]+)/(?P<id>\d+)\.htm'
_TESTS = [{
'url': 'http://v.baidu.com/comic/1069.htm?frp=bdbrand&q=%E4%B8%AD%E5%8D%8E%E5%B0%8F%E5%BD%93%E5%AE%B6',
'info_dict': {
diff --git a/youtube_dl/extractor/bambuser.py b/youtube_dl/extractor/bambuser.py
index da986e0..0eb1930 100644
--- a/youtube_dl/extractor/bambuser.py
+++ b/youtube_dl/extractor/bambuser.py
@@ -4,15 +4,13 @@ import re
import itertools
from .common import InfoExtractor
-from ..compat import (
- compat_urllib_parse,
- compat_str,
-)
+from ..compat import compat_str
from ..utils import (
ExtractorError,
- int_or_none,
float_or_none,
+ int_or_none,
sanitized_Request,
+ urlencode_postdata,
)
@@ -58,7 +56,7 @@ class BambuserIE(InfoExtractor):
}
request = sanitized_Request(
- self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8'))
+ self._LOGIN_URL, urlencode_postdata(login_form))
request.add_header('Referer', self._LOGIN_URL)
response = self._download_webpage(
request, None, 'Logging in as %s' % username)
diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py
index c1ef805..991ab06 100644
--- a/youtube_dl/extractor/bandcamp.py
+++ b/youtube_dl/extractor/bandcamp.py
@@ -29,7 +29,7 @@ class BandcampIE(InfoExtractor):
'_skip': 'There is a limit of 200 free downloads / month for the test song'
}, {
'url': 'http://benprunty.bandcamp.com/track/lanius-battle',
- 'md5': '2b68e5851514c20efdff2afc5603b8b4',
+ 'md5': '73d0b3171568232574e45652f8720b5c',
'info_dict': {
'id': '2650410135',
'ext': 'mp3',
@@ -48,6 +48,10 @@ class BandcampIE(InfoExtractor):
if m_trackinfo:
json_code = m_trackinfo.group(1)
data = json.loads(json_code)[0]
+ track_id = compat_str(data['id'])
+
+ if not data.get('file'):
+ raise ExtractorError('Not streamable', video_id=track_id, expected=True)
formats = []
for format_id, format_url in data['file'].items():
@@ -64,7 +68,7 @@ class BandcampIE(InfoExtractor):
self._sort_formats(formats)
return {
- 'id': compat_str(data['id']),
+ 'id': track_id,
'title': data['title'],
'formats': formats,
'duration': float_or_none(data.get('duration')),
diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py
index 9d0dfb9..4b3cd8c 100644
--- a/youtube_dl/extractor/bbc.py
+++ b/youtube_dl/extractor/bbc.py
@@ -10,7 +10,6 @@ from ..utils import (
int_or_none,
parse_duration,
parse_iso8601,
- remove_end,
unescapeHTML,
)
from ..compat import (
@@ -32,7 +31,7 @@ class BBCCoUkIE(InfoExtractor):
music/clips[/#]|
radio/player/
)
- (?P<id>%s)
+ (?P<id>%s)(?!/(?:episodes|broadcasts|clips))
''' % _ID_REGEX
_MEDIASELECTOR_URLS = [
@@ -193,6 +192,7 @@ class BBCCoUkIE(InfoExtractor):
# rtmp download
'skip_download': True,
},
+ 'skip': 'Now it\'s really geo-restricted',
}, {
# compact player (https://github.com/rg3/youtube-dl/issues/8147)
'url': 'http://www.bbc.co.uk/programmes/p028bfkf/player',
@@ -329,6 +329,7 @@ class BBCCoUkIE(InfoExtractor):
'format_id': '%s_%s' % (service, format['format_id']),
'abr': abr,
'acodec': acodec,
+ 'vcodec': 'none',
})
formats.extend(conn_formats)
return formats
@@ -561,7 +562,7 @@ class BBCIE(BBCCoUkIE):
'url': 'http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460',
'info_dict': {
'id': '3662a707-0af9-3149-963f-47bea720b460',
- 'title': 'BBC Blogs - Adam Curtis - BUGGER',
+ 'title': 'BUGGER',
},
'playlist_count': 18,
}, {
@@ -670,10 +671,19 @@ class BBCIE(BBCCoUkIE):
'url': 'http://www.bbc.com/sport/0/football/34475836',
'info_dict': {
'id': '34475836',
- 'title': 'What Liverpool can expect from Klopp',
+ 'title': 'Jurgen Klopp: Furious football from a witty and winning coach',
+ 'description': 'Fast-paced football, wit, wisdom and a ready smile - why Liverpool fans should come to love new boss Jurgen Klopp.',
},
'playlist_count': 3,
}, {
+ # school report article with single video
+ 'url': 'http://www.bbc.co.uk/schoolreport/35744779',
+ 'info_dict': {
+ 'id': '35744779',
+ 'title': 'School which breaks down barriers in Jerusalem',
+ },
+ 'playlist_count': 1,
+ }, {
# single video with playlist URL from weather section
'url': 'http://www.bbc.com/weather/features/33601775',
'only_matching': True,
@@ -681,11 +691,17 @@ class BBCIE(BBCCoUkIE):
# custom redirection to www.bbc.com
'url': 'http://www.bbc.co.uk/news/science-environment-33661876',
'only_matching': True,
+ }, {
+ # single video article embedded with data-media-vpid
+ 'url': 'http://www.bbc.co.uk/sport/rowing/35908187',
+ 'only_matching': True,
}]
@classmethod
def suitable(cls, url):
- return False if BBCCoUkIE.suitable(url) or BBCCoUkArticleIE.suitable(url) else super(BBCIE, cls).suitable(url)
+ EXCLUDE_IE = (BBCCoUkIE, BBCCoUkArticleIE, BBCCoUkIPlayerPlaylistIE, BBCCoUkPlaylistIE)
+ return (False if any(ie.suitable(url) for ie in EXCLUDE_IE)
+ else super(BBCIE, cls).suitable(url))
def _extract_from_media_meta(self, media_meta, video_id):
# Direct links to media in media metadata (e.g.
@@ -735,8 +751,17 @@ class BBCIE(BBCCoUkIE):
json_ld_info = self._search_json_ld(webpage, playlist_id, default=None)
timestamp = json_ld_info.get('timestamp')
+
playlist_title = json_ld_info.get('title')
- playlist_description = json_ld_info.get('description')
+ if not playlist_title:
+ playlist_title = self._og_search_title(
+ webpage, default=None) or self._html_search_regex(
+ r'<title>(.+?)</title>', webpage, 'playlist title', default=None)
+ if playlist_title:
+ playlist_title = re.sub(r'(.+)\s*-\s*BBC.*?$', r'\1', playlist_title).strip()
+
+ playlist_description = json_ld_info.get(
+ 'description') or self._og_search_description(webpage, default=None)
if not timestamp:
timestamp = parse_iso8601(self._search_regex(
@@ -797,13 +822,11 @@ class BBCIE(BBCCoUkIE):
playlist.get('progressiveDownloadUrl'), playlist_id, timestamp))
if entries:
- playlist_title = playlist_title or remove_end(self._og_search_title(webpage), ' - BBC News')
- playlist_description = playlist_description or self._og_search_description(webpage, default=None)
return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
# single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
programme_id = self._search_regex(
- [r'data-video-player-vpid="(%s)"' % self._ID_REGEX,
+ [r'data-(?:video-player|media)-vpid="(%s)"' % self._ID_REGEX,
r'<param[^>]+name="externalIdentifier"[^>]+value="(%s)"' % self._ID_REGEX,
r'videoId\s*:\s*["\'](%s)["\']' % self._ID_REGEX],
webpage, 'vpid', default=None)
@@ -829,10 +852,6 @@ class BBCIE(BBCCoUkIE):
'subtitles': subtitles,
}
- playlist_title = self._html_search_regex(
- r'<title>(.*?)(?:\s*-\s*BBC [^ ]+)?</title>', webpage, 'playlist title')
- playlist_description = self._og_search_description(webpage, default=None)
-
def extract_all(pattern):
return list(filter(None, map(
lambda s: self._parse_json(s, playlist_id, fatal=False),
@@ -932,7 +951,7 @@ class BBCIE(BBCCoUkIE):
class BBCCoUkArticleIE(InfoExtractor):
- _VALID_URL = 'http://www.bbc.co.uk/programmes/articles/(?P<id>[a-zA-Z0-9]+)'
+ _VALID_URL = r'https?://www.bbc.co.uk/programmes/articles/(?P<id>[a-zA-Z0-9]+)'
IE_NAME = 'bbc.co.uk:article'
IE_DESC = 'BBC articles'
@@ -959,3 +978,72 @@ class BBCCoUkArticleIE(InfoExtractor):
r'<div[^>]+typeof="Clip"[^>]+resource="([^"]+)"', webpage)]
return self.playlist_result(entries, playlist_id, title, description)
+
+
+class BBCCoUkPlaylistBaseIE(InfoExtractor):
+ def _real_extract(self, url):
+ playlist_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, playlist_id)
+
+ entries = [
+ self.url_result(self._URL_TEMPLATE % video_id, BBCCoUkIE.ie_key())
+ for video_id in re.findall(
+ self._VIDEO_ID_TEMPLATE % BBCCoUkIE._ID_REGEX, webpage)]
+
+ title, description = self._extract_title_and_description(webpage)
+
+ return self.playlist_result(entries, playlist_id, title, description)
+
+
+class BBCCoUkIPlayerPlaylistIE(BBCCoUkPlaylistBaseIE):
+ IE_NAME = 'bbc.co.uk:iplayer:playlist'
+ _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/iplayer/episodes/(?P<id>%s)' % BBCCoUkIE._ID_REGEX
+ _URL_TEMPLATE = 'http://www.bbc.co.uk/iplayer/episode/%s'
+ _VIDEO_ID_TEMPLATE = r'data-ip-id=["\'](%s)'
+ _TEST = {
+ 'url': 'http://www.bbc.co.uk/iplayer/episodes/b05rcz9v',
+ 'info_dict': {
+ 'id': 'b05rcz9v',
+ 'title': 'The Disappearance',
+ 'description': 'French thriller serial about a missing teenager.',
+ },
+ 'playlist_mincount': 6,
+ }
+
+ def _extract_title_and_description(self, webpage):
+ title = self._search_regex(r'<h1>([^<]+)</h1>', webpage, 'title', fatal=False)
+ description = self._search_regex(
+ r'<p[^>]+class=(["\'])subtitle\1[^>]*>(?P<value>[^<]+)</p>',
+ webpage, 'description', fatal=False, group='value')
+ return title, description
+
+
+class BBCCoUkPlaylistIE(BBCCoUkPlaylistBaseIE):
+ IE_NAME = 'bbc.co.uk:playlist'
+ _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/(?P<id>%s)/(?:episodes|broadcasts|clips)' % BBCCoUkIE._ID_REGEX
+ _URL_TEMPLATE = 'http://www.bbc.co.uk/programmes/%s'
+ _VIDEO_ID_TEMPLATE = r'data-pid=["\'](%s)'
+ _TESTS = [{
+ 'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
+ 'info_dict': {
+ 'id': 'b05rcz9v',
+ 'title': 'The Disappearance - Clips - BBC Four',
+ 'description': 'French thriller serial about a missing teenager.',
+ },
+ 'playlist_mincount': 7,
+ }, {
+ 'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/broadcasts/2016/06',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.bbc.co.uk/programmes/b055jkys/episodes/player',
+ 'only_matching': True,
+ }]
+
+ def _extract_title_and_description(self, webpage):
+ title = self._og_search_title(webpage, fatal=False)
+ description = self._og_search_description(webpage)
+ return title, description
diff --git a/youtube_dl/extractor/beeg.py b/youtube_dl/extractor/beeg.py
index 34c2a75..956c768 100644
--- a/youtube_dl/extractor/beeg.py
+++ b/youtube_dl/extractor/beeg.py
@@ -33,8 +33,33 @@ class BeegIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ cpl_url = self._search_regex(
+ r'<script[^>]+src=(["\'])(?P<url>(?:https?:)?//static\.beeg\.com/cpl/\d+\.js.*?)\1',
+ webpage, 'cpl', default=None, group='url')
+
+ beeg_version, beeg_salt = [None] * 2
+
+ if cpl_url:
+ cpl = self._download_webpage(
+ self._proto_relative_url(cpl_url), video_id,
+ 'Downloading cpl JS', fatal=False)
+ if cpl:
+ beeg_version = self._search_regex(
+ r'beeg_version\s*=\s*(\d+)', cpl,
+ 'beeg version', default=None) or self._search_regex(
+ r'/(\d+)\.js', cpl_url, 'beeg version', default=None)
+ beeg_salt = self._search_regex(
+ r'beeg_salt\s*=\s*(["\'])(?P<beeg_salt>.+?)\1', cpl, 'beeg beeg_salt',
+ default=None, group='beeg_salt')
+
+ beeg_version = beeg_version or '1750'
+ beeg_salt = beeg_salt or 'MIDtGaw96f0N1kMMAM1DE46EC9pmFr'
+
video = self._download_json(
- 'https://api.beeg.com/api/v5/video/%s' % video_id, video_id)
+ 'http://api.beeg.com/api/v6/%s/video/%s' % (beeg_version, video_id),
+ video_id)
def split(o, e):
def cut(s, x):
@@ -50,8 +75,8 @@ class BeegIE(InfoExtractor):
return n
def decrypt_key(key):
- # Reverse engineered from http://static.beeg.com/cpl/1105.js
- a = '5ShMcIQlssOd7zChAIOlmeTZDaUxULbJRnywYaiB'
+ # Reverse engineered from http://static.beeg.com/cpl/1738.js
+ a = beeg_salt
e = compat_urllib_parse_unquote(key)
o = ''.join([
compat_chr(compat_ord(e[n]) - compat_ord(a[n % len(a)]) % 21)
@@ -101,5 +126,5 @@ class BeegIE(InfoExtractor):
'duration': duration,
'tags': tags,
'formats': formats,
- 'age_limit': 18,
+ 'age_limit': self._rta_search(webpage),
}
diff --git a/youtube_dl/extractor/behindkink.py b/youtube_dl/extractor/behindkink.py
index 1bdc258..9bca853 100644
--- a/youtube_dl/extractor/behindkink.py
+++ b/youtube_dl/extractor/behindkink.py
@@ -8,7 +8,7 @@ from ..utils import url_basename
class BehindKinkIE(InfoExtractor):
- _VALID_URL = r'http://(?:www\.)?behindkink\.com/(?P<year>[0-9]{4})/(?P<month>[0-9]{2})/(?P<day>[0-9]{2})/(?P<id>[^/#?_]+)'
+ _VALID_URL = r'https?://(?:www\.)?behindkink\.com/(?P<year>[0-9]{4})/(?P<month>[0-9]{2})/(?P<day>[0-9]{2})/(?P<id>[^/#?_]+)'
_TEST = {
'url': 'http://www.behindkink.com/2014/12/05/what-are-you-passionate-about-marley-blaze/',
'md5': '507b57d8fdcd75a41a9a7bdb7989c762',
diff --git a/youtube_dl/extractor/bet.py b/youtube_dl/extractor/bet.py
index 03dad46..bd3ee2e 100644
--- a/youtube_dl/extractor/bet.py
+++ b/youtube_dl/extractor/bet.py
@@ -1,31 +1,27 @@
from __future__ import unicode_literals
-from .common import InfoExtractor
-from ..compat import compat_urllib_parse_unquote
-from ..utils import (
- xpath_text,
- xpath_with_ns,
- int_or_none,
- parse_iso8601,
-)
+from .mtv import MTVServicesInfoExtractor
+from ..utils import unified_strdate
+from ..compat import compat_urllib_parse_urlencode
-class BetIE(InfoExtractor):
+class BetIE(MTVServicesInfoExtractor):
_VALID_URL = r'https?://(?:www\.)?bet\.com/(?:[^/]+/)+(?P<id>.+?)\.html'
_TESTS = [
{
'url': 'http://www.bet.com/news/politics/2014/12/08/in-bet-exclusive-obama-talks-race-and-racism.html',
'info_dict': {
- 'id': 'news/national/2014/a-conversation-with-president-obama',
+ 'id': '07e96bd3-8850-3051-b856-271b457f0ab8',
'display_id': 'in-bet-exclusive-obama-talks-race-and-racism',
'ext': 'flv',
'title': 'A Conversation With President Obama',
- 'description': 'md5:699d0652a350cf3e491cd15cc745b5da',
+ 'description': 'President Obama urges persistence in confronting racism and bias.',
'duration': 1534,
- 'timestamp': 1418075340,
'upload_date': '20141208',
- 'uploader': 'admin',
'thumbnail': 're:(?i)^https?://.*\.jpg$',
+ 'subtitles': {
+ 'en': 'mincount:2',
+ }
},
'params': {
# rtmp download
@@ -35,16 +31,17 @@ class BetIE(InfoExtractor):
{
'url': 'http://www.bet.com/video/news/national/2014/justice-for-ferguson-a-community-reacts.html',
'info_dict': {
- 'id': 'news/national/2014/justice-for-ferguson-a-community-reacts',
+ 'id': '9f516bf1-7543-39c4-8076-dd441b459ba9',
'display_id': 'justice-for-ferguson-a-community-reacts',
'ext': 'flv',
'title': 'Justice for Ferguson: A Community Reacts',
'description': 'A BET News special.',
'duration': 1696,
- 'timestamp': 1416942360,
'upload_date': '20141125',
- 'uploader': 'admin',
'thumbnail': 're:(?i)^https?://.*\.jpg$',
+ 'subtitles': {
+ 'en': 'mincount:2',
+ }
},
'params': {
# rtmp download
@@ -53,56 +50,32 @@ class BetIE(InfoExtractor):
}
]
- def _real_extract(self, url):
- display_id = self._match_id(url)
- webpage = self._download_webpage(url, display_id)
-
- media_url = compat_urllib_parse_unquote(self._search_regex(
- [r'mediaURL\s*:\s*"([^"]+)"', r"var\s+mrssMediaUrl\s*=\s*'([^']+)'"],
- webpage, 'media URL'))
+ _FEED_URL = "http://feeds.mtvnservices.com/od/feed/bet-mrss-player"
- video_id = self._search_regex(
- r'/video/(.*)/_jcr_content/', media_url, 'video id')
+ def _get_feed_query(self, uri):
+ return compat_urllib_parse_urlencode({
+ 'uuid': uri,
+ })
- mrss = self._download_xml(media_url, display_id)
-
- item = mrss.find('./channel/item')
-
- NS_MAP = {
- 'dc': 'http://purl.org/dc/elements/1.1/',
- 'media': 'http://search.yahoo.com/mrss/',
- 'ka': 'http://kickapps.com/karss',
- }
+ def _extract_mgid(self, webpage):
+ return self._search_regex(r'data-uri="([^"]+)', webpage, 'mgid')
- title = xpath_text(item, './title', 'title')
- description = xpath_text(
- item, './description', 'description', fatal=False)
-
- timestamp = parse_iso8601(xpath_text(
- item, xpath_with_ns('./dc:date', NS_MAP),
- 'upload date', fatal=False))
- uploader = xpath_text(
- item, xpath_with_ns('./dc:creator', NS_MAP),
- 'uploader', fatal=False)
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
- media_content = item.find(
- xpath_with_ns('./media:content', NS_MAP))
- duration = int_or_none(media_content.get('duration'))
- smil_url = media_content.get('url')
+ webpage = self._download_webpage(url, display_id)
+ mgid = self._extract_mgid(webpage)
+ videos_info = self._get_videos_info(mgid)
- thumbnail = media_content.find(
- xpath_with_ns('./media:thumbnail', NS_MAP)).get('url')
+ info_dict = videos_info['entries'][0]
- formats = self._extract_smil_formats(smil_url, display_id)
+ upload_date = unified_strdate(self._html_search_meta('date', webpage))
+ description = self._html_search_meta('description', webpage)
- return {
- 'id': video_id,
+ info_dict.update({
'display_id': display_id,
- 'title': title,
'description': description,
- 'thumbnail': thumbnail,
- 'timestamp': timestamp,
- 'uploader': uploader,
- 'duration': duration,
- 'formats': formats,
- }
+ 'upload_date': upload_date,
+ })
+
+ return info_dict
diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py
index 59beb11..b17047b 100644
--- a/youtube_dl/extractor/bilibili.py
+++ b/youtube_dl/extractor/bilibili.py
@@ -1,34 +1,42 @@
# coding: utf-8
from __future__ import unicode_literals
+import calendar
+import datetime
import re
from .common import InfoExtractor
-from ..compat import compat_str
+from ..compat import (
+ compat_etree_fromstring,
+ compat_str,
+ compat_parse_qs,
+ compat_xml_parse_error,
+)
from ..utils import (
- int_or_none,
- unescapeHTML,
ExtractorError,
+ int_or_none,
+ float_or_none,
xpath_text,
)
class BiliBiliIE(InfoExtractor):
- _VALID_URL = r'http://www\.bilibili\.(?:tv|com)/video/av(?P<id>\d+)(?:/index_(?P<page_num>\d+).html)?'
+ _VALID_URL = r'https?://www\.bilibili\.(?:tv|com)/video/av(?P<id>\d+)'
_TESTS = [{
'url': 'http://www.bilibili.tv/video/av1074402/',
- 'md5': '2c301e4dab317596e837c3e7633e7d86',
+ 'md5': '5f7d29e1a2872f3df0cf76b1f87d3788',
'info_dict': {
'id': '1554319',
'ext': 'flv',
'title': '【金坷垃】金泡沫',
- 'duration': 308313,
+ 'description': 'md5:ce18c2a2d2193f0df2917d270f2e5923',
+ 'duration': 308.067,
+ 'timestamp': 1398012660,
'upload_date': '20140420',
'thumbnail': 're:^https?://.+\.jpg',
- 'description': 'md5:ce18c2a2d2193f0df2917d270f2e5923',
- 'timestamp': 1397983878,
'uploader': '菊子桑',
+ 'uploader_id': '156160',
},
}, {
'url': 'http://www.bilibili.com/video/av1041170/',
@@ -36,75 +44,186 @@ class BiliBiliIE(InfoExtractor):
'id': '1041170',
'title': '【BD1080P】刀语【诸神&异域】',
'description': '这是个神奇的故事~每个人不留弹幕不给走哦~切利哦!~',
- 'uploader': '枫叶逝去',
- 'timestamp': 1396501299,
},
'playlist_count': 9,
+ }, {
+ 'url': 'http://www.bilibili.com/video/av4808130/',
+ 'info_dict': {
+ 'id': '4808130',
+ 'title': '【长篇】哆啦A梦443【钉铛】',
+ 'description': '(2016.05.27)来组合客人的脸吧&amp;amp;寻母六千里锭 抱歉,又轮到周日上班现在才到家 封面www.pixiv.net/member_illust.php?mode=medium&amp;amp;illust_id=56912929',
+ },
+ 'playlist': [{
+ 'md5': '55cdadedf3254caaa0d5d27cf20a8f9c',
+ 'info_dict': {
+ 'id': '4808130_part1',
+ 'ext': 'flv',
+ 'title': '【长篇】哆啦A梦443【钉铛】',
+ 'description': '(2016.05.27)来组合客人的脸吧&amp;amp;寻母六千里锭 抱歉,又轮到周日上班现在才到家 封面www.pixiv.net/member_illust.php?mode=medium&amp;amp;illust_id=56912929',
+ 'timestamp': 1464564180,
+ 'upload_date': '20160529',
+ 'uploader': '喜欢拉面',
+ 'uploader_id': '151066',
+ },
+ }, {
+ 'md5': '926f9f67d0c482091872fbd8eca7ea3d',
+ 'info_dict': {
+ 'id': '4808130_part2',
+ 'ext': 'flv',
+ 'title': '【长篇】哆啦A梦443【钉铛】',
+ 'description': '(2016.05.27)来组合客人的脸吧&amp;amp;寻母六千里锭 抱歉,又轮到周日上班现在才到家 封面www.pixiv.net/member_illust.php?mode=medium&amp;amp;illust_id=56912929',
+ 'timestamp': 1464564180,
+ 'upload_date': '20160529',
+ 'uploader': '喜欢拉面',
+ 'uploader_id': '151066',
+ },
+ }, {
+ 'md5': '4b7b225b968402d7c32348c646f1fd83',
+ 'info_dict': {
+ 'id': '4808130_part3',
+ 'ext': 'flv',
+ 'title': '【长篇】哆啦A梦443【钉铛】',
+ 'description': '(2016.05.27)来组合客人的脸吧&amp;amp;寻母六千里锭 抱歉,又轮到周日上班现在才到家 封面www.pixiv.net/member_illust.php?mode=medium&amp;amp;illust_id=56912929',
+ 'timestamp': 1464564180,
+ 'upload_date': '20160529',
+ 'uploader': '喜欢拉面',
+ 'uploader_id': '151066',
+ },
+ }, {
+ 'md5': '7b795e214166501e9141139eea236e91',
+ 'info_dict': {
+ 'id': '4808130_part4',
+ 'ext': 'flv',
+ 'title': '【长篇】哆啦A梦443【钉铛】',
+ 'description': '(2016.05.27)来组合客人的脸吧&amp;amp;寻母六千里锭 抱歉,又轮到周日上班现在才到家 封面www.pixiv.net/member_illust.php?mode=medium&amp;amp;illust_id=56912929',
+ 'timestamp': 1464564180,
+ 'upload_date': '20160529',
+ 'uploader': '喜欢拉面',
+ 'uploader_id': '151066',
+ },
+ }],
+ }, {
+ # Missing upload time
+ 'url': 'http://www.bilibili.com/video/av1867637/',
+ 'info_dict': {
+ 'id': '2880301',
+ 'ext': 'flv',
+ 'title': '【HDTV】【喜剧】岳父岳母真难当 (2014)【法国票房冠军】',
+ 'description': '一个信奉天主教的法国旧式传统资产阶级家庭中有四个女儿。三个女儿却分别找了阿拉伯、犹太、中国丈夫,老夫老妻唯独期盼剩下未嫁的小女儿能找一个信奉天主教的法国白人,结果没想到小女儿找了一位非裔黑人……【这次应该不会跳帧了】',
+ 'uploader': '黑夜为猫',
+ 'uploader_id': '610729',
+ },
+ 'params': {
+ # Just to test metadata extraction
+ 'skip_download': True,
+ },
+ 'expected_warnings': ['upload time'],
}]
+ # BiliBili blocks keys from time to time. The current key is extracted from
+ # the Android client
+ # TODO: find the sign algorithm used in the flash player
+ _APP_KEY = '86385cdc024c0f6c'
+
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
- page_num = mobj.group('page_num') or '1'
- view_data = self._download_json(
- 'http://api.bilibili.com/view?type=json&appkey=8e9fc618fbd41e28&id=%s&page=%s' % (video_id, page_num),
- video_id)
- if 'error' in view_data:
- raise ExtractorError('%s said: %s' % (self.IE_NAME, view_data['error']), expected=True)
+ webpage = self._download_webpage(url, video_id)
- cid = view_data['cid']
- title = unescapeHTML(view_data['title'])
+ params = compat_parse_qs(self._search_regex(
+ [r'EmbedPlayer\([^)]+,\s*"([^"]+)"\)',
+ r'<iframe[^>]+src="https://secure\.bilibili\.com/secure,([^"]+)"'],
+ webpage, 'player parameters'))
+ cid = params['cid'][0]
- doc = self._download_xml(
- 'http://interface.bilibili.com/v_cdn_play?appkey=8e9fc618fbd41e28&cid=%s' % cid,
- cid,
- 'Downloading page %s/%s' % (page_num, view_data['pages'])
- )
+ info_xml_str = self._download_webpage(
+ 'http://interface.bilibili.com/v_cdn_play',
+ cid, query={'appkey': self._APP_KEY, 'cid': cid},
+ note='Downloading video info page')
+
+ err_msg = None
+ durls = None
+ info_xml = None
+ try:
+ info_xml = compat_etree_fromstring(info_xml_str.encode('utf-8'))
+ except compat_xml_parse_error:
+ info_json = self._parse_json(info_xml_str, video_id, fatal=False)
+ err_msg = (info_json or {}).get('error_text')
+ else:
+ err_msg = xpath_text(info_xml, './message')
- if xpath_text(doc, './result') == 'error':
- raise ExtractorError('%s said: %s' % (self.IE_NAME, xpath_text(doc, './message')), expected=True)
+ if info_xml is not None:
+ durls = info_xml.findall('./durl')
+ if not durls:
+ if err_msg:
+ raise ExtractorError('%s said: %s' % (self.IE_NAME, err_msg), expected=True)
+ else:
+ raise ExtractorError('No videos found!')
entries = []
- for durl in doc.findall('./durl'):
+ for durl in durls:
size = xpath_text(durl, ['./filesize', './size'])
formats = [{
'url': durl.find('./url').text,
'filesize': int_or_none(size),
- 'ext': 'flv',
}]
- backup_urls = durl.find('./backup_url')
- if backup_urls is not None:
- for backup_url in backup_urls.findall('./url'):
- formats.append({'url': backup_url.text})
- formats.reverse()
+ for backup_url in durl.findall('./backup_url/url'):
+ formats.append({
+ 'url': backup_url.text,
+ # backup URLs have lower priorities
+ 'preference': -2 if 'hd.mp4' in backup_url.text else -3,
+ })
+
+ self._sort_formats(formats)
entries.append({
'id': '%s_part%s' % (cid, xpath_text(durl, './order')),
- 'title': title,
'duration': int_or_none(xpath_text(durl, './length'), 1000),
'formats': formats,
})
+ title = self._html_search_regex('<h1[^>]+title="([^"]+)">', webpage, 'title')
+ description = self._html_search_meta('description', webpage)
+ datetime_str = self._html_search_regex(
+ r'<time[^>]+datetime="([^"]+)"', webpage, 'upload time', fatal=False)
+ timestamp = None
+ if datetime_str:
+ timestamp = calendar.timegm(datetime.datetime.strptime(datetime_str, '%Y-%m-%dT%H:%M').timetuple())
+
+ # TODO 'view_count' requires deobfuscating Javascript
info = {
'id': compat_str(cid),
'title': title,
- 'description': view_data.get('description'),
- 'thumbnail': view_data.get('pic'),
- 'uploader': view_data.get('author'),
- 'timestamp': int_or_none(view_data.get('created')),
- 'view_count': int_or_none(view_data.get('play')),
- 'duration': int_or_none(xpath_text(doc, './timelength')),
+ 'description': description,
+ 'timestamp': timestamp,
+ 'thumbnail': self._html_search_meta('thumbnailUrl', webpage),
+ 'duration': float_or_none(xpath_text(info_xml, './timelength'), scale=1000),
}
+ uploader_mobj = re.search(
+ r'<a[^>]+href="https?://space\.bilibili\.com/(?P<id>\d+)"[^>]+title="(?P<name>[^"]+)"',
+ webpage)
+ if uploader_mobj:
+ info.update({
+ 'uploader': uploader_mobj.group('name'),
+ 'uploader_id': uploader_mobj.group('id'),
+ })
+
+ for entry in entries:
+ entry.update(info)
+
if len(entries) == 1:
- entries[0].update(info)
return entries[0]
else:
- info.update({
+ for idx, entry in enumerate(entries):
+ entry['id'] = '%s_part%d' % (video_id, (idx + 1))
+
+ return {
'_type': 'multi_video',
'id': video_id,
+ 'title': title,
+ 'description': description,
'entries': entries,
- })
- return info
+ }
diff --git a/youtube_dl/extractor/biobiochiletv.py b/youtube_dl/extractor/biobiochiletv.py
new file mode 100644
index 0000000..1332281
--- /dev/null
+++ b/youtube_dl/extractor/biobiochiletv.py
@@ -0,0 +1,86 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import remove_end
+
+
+class BioBioChileTVIE(InfoExtractor):
+ _VALID_URL = r'https?://tv\.biobiochile\.cl/notas/(?:[^/]+/)+(?P<id>[^/]+)\.shtml'
+
+ _TESTS = [{
+ 'url': 'http://tv.biobiochile.cl/notas/2015/10/21/sobre-camaras-y-camarillas-parlamentarias.shtml',
+ 'md5': '26f51f03cf580265defefb4518faec09',
+ 'info_dict': {
+ 'id': 'sobre-camaras-y-camarillas-parlamentarias',
+ 'ext': 'mp4',
+ 'title': 'Sobre Cámaras y camarillas parlamentarias',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'uploader': 'Fernando Atria',
+ },
+ }, {
+ # different uploader layout
+ 'url': 'http://tv.biobiochile.cl/notas/2016/03/18/natalia-valdebenito-repasa-a-diputado-hasbun-paso-a-la-categoria-de-hablar-brutalidades.shtml',
+ 'md5': 'edc2e6b58974c46d5b047dea3c539ff3',
+ 'info_dict': {
+ 'id': 'natalia-valdebenito-repasa-a-diputado-hasbun-paso-a-la-categoria-de-hablar-brutalidades',
+ 'ext': 'mp4',
+ 'title': 'Natalia Valdebenito repasa a diputado Hasbún: Pasó a la categoría de hablar brutalidades',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'uploader': 'Piangella Obrador',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'http://tv.biobiochile.cl/notas/2015/10/22/ninos-transexuales-de-quien-es-la-decision.shtml',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://tv.biobiochile.cl/notas/2015/10/21/exclusivo-hector-pinto-formador-de-chupete-revela-version-del-ex-delantero-albo.shtml',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ title = remove_end(self._og_search_title(webpage), ' - BioBioChile TV')
+
+ file_url = self._search_regex(
+ r'loadFWPlayerVideo\([^,]+,\s*(["\'])(?P<url>.+?)\1',
+ webpage, 'file url', group='url')
+
+ base_url = self._search_regex(
+ r'file\s*:\s*(["\'])(?P<url>.+?)\1\s*\+\s*fileURL', webpage,
+ 'base url', default='http://unlimited2-cl.digitalproserver.com/bbtv/',
+ group='url')
+
+ formats = self._extract_m3u8_formats(
+ '%s%s/playlist.m3u8' % (base_url, file_url), video_id, 'mp4',
+ entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)
+ f = {
+ 'url': '%s%s' % (base_url, file_url),
+ 'format_id': 'http',
+ 'protocol': 'http',
+ 'preference': 1,
+ }
+ if formats:
+ f_copy = formats[-1].copy()
+ f_copy.update(f)
+ f = f_copy
+ formats.append(f)
+ self._sort_formats(formats)
+
+ thumbnail = self._og_search_thumbnail(webpage)
+ uploader = self._html_search_regex(
+ r'<a[^>]+href=["\']https?://busca\.biobiochile\.cl/author[^>]+>(.+?)</a>',
+ webpage, 'uploader', fatal=False)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'thumbnail': thumbnail,
+ 'uploader': uploader,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/biqle.py b/youtube_dl/extractor/biqle.py
new file mode 100644
index 0000000..ae4579b
--- /dev/null
+++ b/youtube_dl/extractor/biqle.py
@@ -0,0 +1,39 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class BIQLEIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?biqle\.(?:com|org|ru)/watch/(?P<id>-?\d+_\d+)'
+ _TESTS = [{
+ 'url': 'http://www.biqle.ru/watch/847655_160197695',
+ 'md5': 'ad5f746a874ccded7b8f211aeea96637',
+ 'info_dict': {
+ 'id': '160197695',
+ 'ext': 'mp4',
+ 'title': 'Foo Fighters - The Pretender (Live at Wembley Stadium)',
+ 'uploader': 'Andrey Rogozin',
+ 'upload_date': '20110605',
+ }
+ }, {
+ 'url': 'https://biqle.org/watch/-44781847_168547604',
+ 'md5': '7f24e72af1db0edf7c1aaba513174f97',
+ 'info_dict': {
+ 'id': '168547604',
+ 'ext': 'mp4',
+ 'title': 'Ребенок в шоке от автоматической мойки',
+ 'uploader': 'Dmitry Kotov',
+ }
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+ embed_url = self._proto_relative_url(self._search_regex(
+ r'<iframe.+?src="((?:http:)?//daxab\.com/[^"]+)".*?></iframe>', webpage, 'embed url'))
+
+ return {
+ '_type': 'url_transparent',
+ 'url': embed_url,
+ }
diff --git a/youtube_dl/extractor/bleacherreport.py b/youtube_dl/extractor/bleacherreport.py
index 38bda3a..7a8e1f6 100644
--- a/youtube_dl/extractor/bleacherreport.py
+++ b/youtube_dl/extractor/bleacherreport.py
@@ -28,10 +28,10 @@ class BleacherReportIE(InfoExtractor):
'add_ie': ['Ooyala'],
}, {
'url': 'http://bleacherreport.com/articles/2586817-aussie-golfers-get-fright-of-their-lives-after-being-chased-by-angry-kangaroo',
- 'md5': 'af5f90dc9c7ba1c19d0a3eac806bbf50',
+ 'md5': '6a5cd403418c7b01719248ca97fb0692',
'info_dict': {
'id': '2586817',
- 'ext': 'mp4',
+ 'ext': 'webm',
'title': 'Aussie Golfers Get Fright of Their Lives After Being Chased by Angry Kangaroo',
'timestamp': 1446839961,
'uploader': 'Sean Fay',
@@ -93,10 +93,14 @@ class BleacherReportCMSIE(AMPIE):
'md5': '8c2c12e3af7805152675446c905d159b',
'info_dict': {
'id': '8fd44c2f-3dc5-4821-9118-2c825a98c0e1',
- 'ext': 'flv',
+ 'ext': 'mp4',
'title': 'Cena vs. Rollins Would Expose the Heavyweight Division',
'description': 'md5:984afb4ade2f9c0db35f3267ed88b36e',
},
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
}]
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/bloomberg.py b/youtube_dl/extractor/bloomberg.py
index 13343bc..bd538be 100644
--- a/youtube_dl/extractor/bloomberg.py
+++ b/youtube_dl/extractor/bloomberg.py
@@ -17,6 +17,9 @@ class BloombergIE(InfoExtractor):
'title': 'Shah\'s Presentation on Foreign-Exchange Strategies',
'description': 'md5:a8ba0302912d03d246979735c17d2761',
},
+ 'params': {
+ 'format': 'best[format_id^=hds]',
+ },
}, {
'url': 'http://www.bloomberg.com/news/articles/2015-11-12/five-strange-things-that-have-been-happening-in-financial-markets',
'only_matching': True,
diff --git a/youtube_dl/extractor/bokecc.py b/youtube_dl/extractor/bokecc.py
new file mode 100644
index 0000000..86a7f4d
--- /dev/null
+++ b/youtube_dl/extractor/bokecc.py
@@ -0,0 +1,60 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_parse_qs
+from ..utils import ExtractorError
+
+
+class BokeCCBaseIE(InfoExtractor):
+ def _extract_bokecc_formats(self, webpage, video_id, format_id=None):
+ player_params_str = self._html_search_regex(
+ r'<(?:script|embed)[^>]+src="http://p\.bokecc\.com/player\?([^"]+)',
+ webpage, 'player params')
+
+ player_params = compat_parse_qs(player_params_str)
+
+ info_xml = self._download_xml(
+ 'http://p.bokecc.com/servlet/playinfo?uid=%s&vid=%s&m=1' % (
+ player_params['siteid'][0], player_params['vid'][0]), video_id)
+
+ formats = [{
+ 'format_id': format_id,
+ 'url': quality.find('./copy').attrib['playurl'],
+ 'preference': int(quality.attrib['value']),
+ } for quality in info_xml.findall('./video/quality')]
+
+ self._sort_formats(formats)
+
+ return formats
+
+
+class BokeCCIE(BokeCCBaseIE):
+ _IE_DESC = 'CC视频'
+ _VALID_URL = r'https?://union\.bokecc\.com/playvideo\.bo\?(?P<query>.*)'
+
+ _TESTS = [{
+ 'url': 'http://union.bokecc.com/playvideo.bo?vid=E44D40C15E65EA30&uid=CD0C5D3C8614B28B',
+ 'info_dict': {
+ 'id': 'CD0C5D3C8614B28B_E44D40C15E65EA30',
+ 'ext': 'flv',
+ 'title': 'BokeCC Video',
+ },
+ }]
+
+ def _real_extract(self, url):
+ qs = compat_parse_qs(re.match(self._VALID_URL, url).group('query'))
+ if not qs.get('vid') or not qs.get('uid'):
+ raise ExtractorError('Invalid URL', expected=True)
+
+ video_id = '%s_%s' % (qs['uid'][0], qs['vid'][0])
+
+ webpage = self._download_webpage(url, video_id)
+
+ return {
+ 'id': video_id,
+ 'title': 'BokeCC Video', # no title provided in the webpage
+ 'formats': self._extract_bokecc_formats(webpage, video_id),
+ }
diff --git a/youtube_dl/extractor/bpb.py b/youtube_dl/extractor/bpb.py
index c28e729..6ad45a1 100644
--- a/youtube_dl/extractor/bpb.py
+++ b/youtube_dl/extractor/bpb.py
@@ -12,7 +12,7 @@ from ..utils import (
class BpbIE(InfoExtractor):
IE_DESC = 'Bundeszentrale für politische Bildung'
- _VALID_URL = r'http://www\.bpb\.de/mediathek/(?P<id>[0-9]+)/'
+ _VALID_URL = r'https?://www\.bpb\.de/mediathek/(?P<id>[0-9]+)/'
_TEST = {
'url': 'http://www.bpb.de/mediathek/297/joachim-gauck-zu-1989-und-die-erinnerung-an-die-ddr',
diff --git a/youtube_dl/extractor/br.py b/youtube_dl/extractor/br.py
index 11cf498..ff0aa11 100644
--- a/youtube_dl/extractor/br.py
+++ b/youtube_dl/extractor/br.py
@@ -29,7 +29,8 @@ class BRIE(InfoExtractor):
'duration': 180,
'uploader': 'Reinhard Weber',
'upload_date': '20150422',
- }
+ },
+ 'skip': '404 not found',
},
{
'url': 'http://www.br.de/nachrichten/oberbayern/inhalt/muenchner-polizeipraesident-schreiber-gestorben-100.html',
@@ -40,7 +41,8 @@ class BRIE(InfoExtractor):
'title': 'Manfred Schreiber ist tot',
'description': 'md5:b454d867f2a9fc524ebe88c3f5092d97',
'duration': 26,
- }
+ },
+ 'skip': '404 not found',
},
{
'url': 'https://www.br-klassik.de/audio/peeping-tom-premierenkritik-dance-festival-muenchen-100.html',
@@ -51,7 +53,8 @@ class BRIE(InfoExtractor):
'title': 'Kurzweilig und sehr bewegend',
'description': 'md5:0351996e3283d64adeb38ede91fac54e',
'duration': 296,
- }
+ },
+ 'skip': '404 not found',
},
{
'url': 'http://www.br.de/radio/bayern1/service/team/videos/team-video-erdelt100.html',
diff --git a/youtube_dl/extractor/bravotv.py b/youtube_dl/extractor/bravotv.py
new file mode 100644
index 0000000..541c769
--- /dev/null
+++ b/youtube_dl/extractor/bravotv.py
@@ -0,0 +1,31 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import smuggle_url
+
+
+class BravoTVIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?bravotv\.com/(?:[^/]+/)+videos/(?P<id>[^/?]+)'
+ _TEST = {
+ 'url': 'http://www.bravotv.com/last-chance-kitchen/season-5/videos/lck-ep-12-fishy-finale',
+ 'md5': 'd60cdf68904e854fac669bd26cccf801',
+ 'info_dict': {
+ 'id': 'LitrBdX64qLn',
+ 'ext': 'mp4',
+ 'title': 'Last Chance Kitchen Returns',
+ 'description': 'S13: Last Chance Kitchen Returns for Top Chef Season 13',
+ 'timestamp': 1448926740,
+ 'upload_date': '20151130',
+ 'uploader': 'NBCU-BRAV',
+ }
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+ account_pid = self._search_regex(r'"account_pid"\s*:\s*"([^"]+)"', webpage, 'account pid')
+ release_pid = self._search_regex(r'"release_pid"\s*:\s*"([^"]+)"', webpage, 'release pid')
+ return self.url_result(smuggle_url(
+ 'http://link.theplatform.com/s/%s/%s?mbr=true&switch=progressive' % (account_pid, release_pid),
+ {'force_smil_url': True}), 'ThePlatform', release_pid)
diff --git a/youtube_dl/extractor/breakcom.py b/youtube_dl/extractor/breakcom.py
index aa08051..725859b 100644
--- a/youtube_dl/extractor/breakcom.py
+++ b/youtube_dl/extractor/breakcom.py
@@ -11,7 +11,7 @@ from ..utils import (
class BreakIE(InfoExtractor):
- _VALID_URL = r'http://(?:www\.)?break\.com/video/(?:[^/]+/)*.+-(?P<id>\d+)'
+ _VALID_URL = r'https?://(?:www\.)?break\.com/video/(?:[^/]+/)*.+-(?P<id>\d+)'
_TESTS = [{
'url': 'http://www.break.com/video/when-girls-act-like-guys-2468056',
'info_dict': {
diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py
index c947337..ef560b5 100644
--- a/youtube_dl/extractor/brightcove.py
+++ b/youtube_dl/extractor/brightcove.py
@@ -9,10 +9,10 @@ from ..compat import (
compat_etree_fromstring,
compat_parse_qs,
compat_str,
- compat_urllib_parse,
compat_urllib_parse_urlparse,
compat_urlparse,
compat_xml_parse_error,
+ compat_HTTPError,
)
from ..utils import (
determine_ext,
@@ -23,16 +23,16 @@ from ..utils import (
js_to_json,
int_or_none,
parse_iso8601,
- sanitized_Request,
unescapeHTML,
unsmuggle_url,
+ update_url_query,
)
class BrightcoveLegacyIE(InfoExtractor):
IE_NAME = 'brightcove:legacy'
_VALID_URL = r'(?:https?://.*brightcove\.com/(services|viewer).*?\?|brightcove:)(?P<query>.*)'
- _FEDERATED_URL_TEMPLATE = 'http://c.brightcove.com/services/viewer/htmlFederated?%s'
+ _FEDERATED_URL = 'http://c.brightcove.com/services/viewer/htmlFederated'
_TESTS = [
{
@@ -46,6 +46,9 @@ class BrightcoveLegacyIE(InfoExtractor):
'title': 'Xavier Sala i Martín: “Un banc que no presta és un banc zombi que no serveix per a res”',
'uploader': '8TV',
'description': 'md5:a950cc4285c43e44d763d036710cd9cd',
+ 'timestamp': 1368213670,
+ 'upload_date': '20130510',
+ 'uploader_id': '1589608506001',
}
},
{
@@ -57,6 +60,9 @@ class BrightcoveLegacyIE(InfoExtractor):
'title': 'JVMLS 2012: Arrays 2.0 - Opportunities and Challenges',
'description': 'John Rose speaks at the JVM Language Summit, August 1, 2012.',
'uploader': 'Oracle',
+ 'timestamp': 1344975024,
+ 'upload_date': '20120814',
+ 'uploader_id': '1460825906',
},
},
{
@@ -68,6 +74,9 @@ class BrightcoveLegacyIE(InfoExtractor):
'title': 'This Bracelet Acts as a Personal Thermostat',
'description': 'md5:547b78c64f4112766ccf4e151c20b6a0',
'uploader': 'Mashable',
+ 'timestamp': 1382041798,
+ 'upload_date': '20131017',
+ 'uploader_id': '1130468786001',
},
},
{
@@ -85,14 +94,17 @@ class BrightcoveLegacyIE(InfoExtractor):
{
# test flv videos served by akamaihd.net
# From http://www.redbull.com/en/bike/stories/1331655643987/replay-uci-dh-world-cup-2014-from-fort-william
- 'url': 'http://c.brightcove.com/services/viewer/htmlFederated?%40videoPlayer=ref%3ABC2996102916001&linkBaseURL=http%3A%2F%2Fwww.redbull.com%2Fen%2Fbike%2Fvideos%2F1331655630249%2Freplay-uci-fort-william-2014-dh&playerKey=AQ%7E%7E%2CAAAApYJ7UqE%7E%2Cxqr_zXk0I-zzNndy8NlHogrCb5QdyZRf&playerID=1398061561001#__youtubedl_smuggle=%7B%22Referer%22%3A+%22http%3A%2F%2Fwww.redbull.com%2Fen%2Fbike%2Fstories%2F1331655643987%2Freplay-uci-dh-world-cup-2014-from-fort-william%22%7D',
+ 'url': 'http://c.brightcove.com/services/viewer/htmlFederated?%40videoPlayer=ref%3Aevent-stream-356&linkBaseURL=http%3A%2F%2Fwww.redbull.com%2Fen%2Fbike%2Fvideos%2F1331655630249%2Freplay-uci-fort-william-2014-dh&playerKey=AQ%7E%7E%2CAAAApYJ7UqE%7E%2Cxqr_zXk0I-zzNndy8NlHogrCb5QdyZRf&playerID=1398061561001#__youtubedl_smuggle=%7B%22Referer%22%3A+%22http%3A%2F%2Fwww.redbull.com%2Fen%2Fbike%2Fstories%2F1331655643987%2Freplay-uci-dh-world-cup-2014-from-fort-william%22%7D',
# The md5 checksum changes on each download
'info_dict': {
- 'id': '2996102916001',
+ 'id': '3750436379001',
'ext': 'flv',
'title': 'UCI MTB World Cup 2014: Fort William, UK - Downhill Finals',
- 'uploader': 'Red Bull TV',
+ 'uploader': 'RBTV Old (do not use)',
'description': 'UCI MTB World Cup 2014: Fort William, UK - Downhill Finals',
+ 'timestamp': 1409122195,
+ 'upload_date': '20140827',
+ 'uploader_id': '710858724001',
},
},
{
@@ -106,6 +118,12 @@ class BrightcoveLegacyIE(InfoExtractor):
'playlist_mincount': 7,
},
]
+ FLV_VCODECS = {
+ 1: 'SORENSON',
+ 2: 'ON2',
+ 3: 'H264',
+ 4: 'VP8',
+ }
@classmethod
def _build_brighcove_url(cls, object_str):
@@ -136,13 +154,16 @@ class BrightcoveLegacyIE(InfoExtractor):
else:
flashvars = {}
+ data_url = object_doc.attrib.get('data', '')
+ data_url_params = compat_parse_qs(compat_urllib_parse_urlparse(data_url).query)
+
def find_param(name):
if name in flashvars:
return flashvars[name]
node = find_xpath_attr(object_doc, './param', 'name', name)
if node is not None:
return node.attrib['value']
- return None
+ return data_url_params.get(name)
params = {}
@@ -155,8 +176,8 @@ class BrightcoveLegacyIE(InfoExtractor):
# Not all pages define this value
if playerKey is not None:
params['playerKey'] = playerKey
- # The three fields hold the id of the video
- videoPlayer = find_param('@videoPlayer') or find_param('videoId') or find_param('videoID')
+ # These fields hold the id of the video
+ videoPlayer = find_param('@videoPlayer') or find_param('videoId') or find_param('videoID') or find_param('@videoList')
if videoPlayer is not None:
params['@videoPlayer'] = videoPlayer
linkBase = find_param('linkBaseURL')
@@ -184,8 +205,7 @@ class BrightcoveLegacyIE(InfoExtractor):
@classmethod
def _make_brightcove_url(cls, params):
- data = compat_urllib_parse.urlencode(params)
- return cls._FEDERATED_URL_TEMPLATE % data
+ return update_url_query(cls._FEDERATED_URL, params)
@classmethod
def _extract_brightcove_url(cls, webpage):
@@ -239,7 +259,7 @@ class BrightcoveLegacyIE(InfoExtractor):
# We set the original url as the default 'Referer' header
referer = smuggled_data.get('Referer', url)
return self._get_video_info(
- videoPlayer[0], query_str, query, referer=referer)
+ videoPlayer[0], query, referer=referer)
elif 'playerKey' in query:
player_key = query['playerKey']
return self._get_playlist_info(player_key[0])
@@ -248,15 +268,14 @@ class BrightcoveLegacyIE(InfoExtractor):
'Cannot find playerKey= variable. Did you forget quotes in a shell invocation?',
expected=True)
- def _get_video_info(self, video_id, query_str, query, referer=None):
- request_url = self._FEDERATED_URL_TEMPLATE % query_str
- req = sanitized_Request(request_url)
+ def _get_video_info(self, video_id, query, referer=None):
+ headers = {}
linkBase = query.get('linkBaseURL')
if linkBase is not None:
referer = linkBase[0]
if referer is not None:
- req.add_header('Referer', referer)
- webpage = self._download_webpage(req, video_id)
+ headers['Referer'] = referer
+ webpage = self._download_webpage(self._FEDERATED_URL, video_id, headers=headers, query=query)
error_msg = self._html_search_regex(
r"<h1>We're sorry.</h1>([\s\n]*<p>.*?</p>)+", webpage,
@@ -288,15 +307,20 @@ class BrightcoveLegacyIE(InfoExtractor):
playlist_title=playlist_info['mediaCollectionDTO']['displayName'])
def _extract_video_info(self, video_info):
+ video_id = compat_str(video_info['id'])
+ publisher_id = video_info.get('publisherId')
info = {
- 'id': compat_str(video_info['id']),
+ 'id': video_id,
'title': video_info['displayName'].strip(),
'description': video_info.get('shortDescription'),
'thumbnail': video_info.get('videoStillURL') or video_info.get('thumbnailURL'),
'uploader': video_info.get('publisherName'),
+ 'uploader_id': compat_str(publisher_id) if publisher_id else None,
+ 'duration': float_or_none(video_info.get('length'), 1000),
+ 'timestamp': int_or_none(video_info.get('creationDate'), 1000),
}
- renditions = video_info.get('renditions')
+ renditions = video_info.get('renditions', []) + video_info.get('IOSRenditions', [])
if renditions:
formats = []
for rend in renditions:
@@ -308,7 +332,8 @@ class BrightcoveLegacyIE(InfoExtractor):
url_comp = compat_urllib_parse_urlparse(url)
if url_comp.path.endswith('.m3u8'):
formats.extend(
- self._extract_m3u8_formats(url, info['id'], 'mp4'))
+ self._extract_m3u8_formats(
+ url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
continue
elif 'akamaihd.net' in url_comp.netloc:
# This type of renditions are served through
@@ -317,19 +342,42 @@ class BrightcoveLegacyIE(InfoExtractor):
ext = 'flv'
if ext is None:
ext = determine_ext(url)
- size = rend.get('size')
- formats.append({
+ tbr = int_or_none(rend.get('encodingRate'), 1000)
+ a_format = {
+ 'format_id': 'http%s' % ('-%s' % tbr if tbr else ''),
'url': url,
'ext': ext,
- 'height': rend.get('frameHeight'),
- 'width': rend.get('frameWidth'),
- 'filesize': size if size != 0 else None,
- })
+ 'filesize': int_or_none(rend.get('size')) or None,
+ 'tbr': tbr,
+ }
+ if rend.get('audioOnly'):
+ a_format.update({
+ 'vcodec': 'none',
+ })
+ else:
+ a_format.update({
+ 'height': int_or_none(rend.get('frameHeight')),
+ 'width': int_or_none(rend.get('frameWidth')),
+ 'vcodec': rend.get('videoCodec'),
+ })
+
+ # m3u8 manifests with remote == false are media playlists
+ # Not calling _extract_m3u8_formats here to save network traffic
+ if ext == 'm3u8':
+ a_format.update({
+ 'format_id': 'hls%s' % ('-%s' % tbr if tbr else ''),
+ 'ext': 'mp4',
+ 'protocol': 'm3u8_native',
+ })
+
+ formats.append(a_format)
self._sort_formats(formats)
info['formats'] = formats
elif video_info.get('FLVFullLengthURL') is not None:
info.update({
'url': video_info['FLVFullLengthURL'],
+ 'vcodec': self.FLV_VCODECS.get(video_info.get('FLVFullCodec')),
+ 'filesize': int_or_none(video_info.get('FLVFullSize')),
})
if self._downloader.params.get('include_ads', False):
@@ -349,13 +397,13 @@ class BrightcoveLegacyIE(InfoExtractor):
return ad_info
if 'url' not in info and not info.get('formats'):
- raise ExtractorError('Unable to extract video url for %s' % info['id'])
+ raise ExtractorError('Unable to extract video url for %s' % video_id)
return info
class BrightcoveNewIE(InfoExtractor):
IE_NAME = 'brightcove:new'
- _VALID_URL = r'https?://players\.brightcove\.net/(?P<account_id>\d+)/(?P<player_id>[^/]+)_(?P<embed>[^/]+)/index\.html\?.*videoId=(?P<video_id>(?:ref:)?\d+)'
+ _VALID_URL = r'https?://players\.brightcove\.net/(?P<account_id>\d+)/(?P<player_id>[^/]+)_(?P<embed>[^/]+)/index\.html\?.*videoId=(?P<video_id>\d+|ref:[^&]+)'
_TESTS = [{
'url': 'http://players.brightcove.net/929656772001/e41d32dc-ec74-459e-a845-6c69f7b724ea_default/index.html?videoId=4463358922001',
'md5': 'c8100925723840d4b0d243f7025703be',
@@ -385,12 +433,21 @@ class BrightcoveNewIE(InfoExtractor):
'formats': 'mincount:41',
},
'params': {
+ # m3u8 download
'skip_download': True,
}
}, {
# ref: prefixed video id
'url': 'http://players.brightcove.net/3910869709001/21519b5c-4b3b-4363-accb-bdc8f358f823_default/index.html?videoId=ref:7069442',
'only_matching': True,
+ }, {
+ # non numeric ref: prefixed video id
+ 'url': 'http://players.brightcove.net/710858724001/default_default/index.html?videoId=ref:event-stream-356',
+ 'only_matching': True,
+ }, {
+ # unavailable video without message but with error_code
+ 'url': 'http://players.brightcove.net/1305187701/c832abfb-641b-44eb-9da0-2fe76786505f_default/index.html?videoId=4377407326001',
+ 'only_matching': True,
}]
@staticmethod
@@ -410,8 +467,8 @@ class BrightcoveNewIE(InfoExtractor):
# Look for iframe embeds [1]
for _, url in re.findall(
- r'<iframe[^>]+src=(["\'])((?:https?:)//players\.brightcove\.net/\d+/[^/]+/index\.html.+?)\1', webpage):
- entries.append(url)
+ r'<iframe[^>]+src=(["\'])((?:https?:)?//players\.brightcove\.net/\d+/[^/]+/index\.html.+?)\1', webpage):
+ entries.append(url if url.startswith('http') else 'http:' + url)
# Look for embed_in_page embeds [2]
for video_id, account_id, player_id, embed in re.findall(
@@ -420,11 +477,11 @@ class BrightcoveNewIE(InfoExtractor):
# According to [4] data-video-id may be prefixed with ref:
r'''(?sx)
<video[^>]+
- data-video-id=["\']((?:ref:)?\d+)["\'][^>]*>.*?
+ data-video-id=["\'](\d+|ref:[^"\']+)["\'][^>]*>.*?
</video>.*?
<script[^>]+
src=["\'](?:https?:)?//players\.brightcove\.net/
- (\d+)/([\da-f-]+)_([^/]+)/index\.min\.js
+ (\d+)/([^/]+)_([^/]+)/index(?:\.min)?\.js
''', webpage):
entries.append(
'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s'
@@ -454,24 +511,34 @@ class BrightcoveNewIE(InfoExtractor):
r'policyKey\s*:\s*(["\'])(?P<pk>.+?)\1',
webpage, 'policy key', group='pk')
- req = sanitized_Request(
- 'https://edge.api.brightcove.com/playback/v1/accounts/%s/videos/%s'
- % (account_id, video_id),
- headers={'Accept': 'application/json;pk=%s' % policy_key})
- json_data = self._download_json(req, video_id)
+ api_url = 'https://edge.api.brightcove.com/playback/v1/accounts/%s/videos/%s' % (account_id, video_id)
+ try:
+ json_data = self._download_json(api_url, video_id, headers={
+ 'Accept': 'application/json;pk=%s' % policy_key
+ })
+ except ExtractorError as e:
+ if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
+ json_data = self._parse_json(e.cause.read().decode(), video_id)[0]
+ raise ExtractorError(
+ json_data.get('message') or json_data['error_code'], expected=True)
+ raise
- title = json_data['name']
+ title = json_data['name'].strip()
formats = []
for source in json_data.get('sources', []):
+ container = source.get('container')
source_type = source.get('type')
src = source.get('src')
- if source_type == 'application/x-mpegURL':
+ if source_type == 'application/x-mpegURL' or container == 'M2TS':
if not src:
continue
formats.extend(self._extract_m3u8_formats(
- src, video_id, 'mp4', entry_protocol='m3u8_native',
- m3u8_id='hls', fatal=False))
+ src, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
+ elif source_type == 'application/dash+xml':
+ if not src:
+ continue
+ formats.extend(self._extract_mpd_formats(src, video_id, 'dash', fatal=False))
else:
streaming_src = source.get('streaming_src')
stream_name, app_name = source.get('stream_name'), source.get('app_name')
@@ -479,15 +546,23 @@ class BrightcoveNewIE(InfoExtractor):
continue
tbr = float_or_none(source.get('avg_bitrate'), 1000)
height = int_or_none(source.get('height'))
+ width = int_or_none(source.get('width'))
f = {
'tbr': tbr,
- 'width': int_or_none(source.get('width')),
- 'height': height,
'filesize': int_or_none(source.get('size')),
- 'container': source.get('container'),
- 'vcodec': source.get('codec'),
- 'ext': source.get('container').lower(),
+ 'container': container,
+ 'ext': container.lower(),
}
+ if width == 0 and height == 0:
+ f.update({
+ 'vcodec': 'none',
+ })
+ else:
+ f.update({
+ 'width': width,
+ 'height': height,
+ 'vcodec': source.get('codec'),
+ })
def build_format_id(kind):
format_id = kind
@@ -501,7 +576,7 @@ class BrightcoveNewIE(InfoExtractor):
f.update({
'url': src or streaming_src,
'format_id': build_format_id('http' if src else 'http-streaming'),
- 'preference': 2 if src else 1,
+ 'source_preference': 0 if src else -1,
})
else:
f.update({
@@ -512,20 +587,22 @@ class BrightcoveNewIE(InfoExtractor):
formats.append(f)
self._sort_formats(formats)
- description = json_data.get('description')
- thumbnail = json_data.get('thumbnail')
- timestamp = parse_iso8601(json_data.get('published_at'))
- duration = float_or_none(json_data.get('duration'), 1000)
- tags = json_data.get('tags', [])
+ subtitles = {}
+ for text_track in json_data.get('text_tracks', []):
+ if text_track.get('src'):
+ subtitles.setdefault(text_track.get('srclang'), []).append({
+ 'url': text_track['src'],
+ })
return {
'id': video_id,
'title': title,
- 'description': description,
- 'thumbnail': thumbnail,
- 'duration': duration,
- 'timestamp': timestamp,
+ 'description': json_data.get('description'),
+ 'thumbnail': json_data.get('thumbnail') or json_data.get('poster'),
+ 'duration': float_or_none(json_data.get('duration'), 1000),
+ 'timestamp': parse_iso8601(json_data.get('published_at')),
'uploader_id': account_id,
'formats': formats,
- 'tags': tags,
+ 'subtitles': subtitles,
+ 'tags': json_data.get('tags', []),
}
diff --git a/youtube_dl/extractor/byutv.py b/youtube_dl/extractor/byutv.py
index dda9805..3aec601 100644
--- a/youtube_dl/extractor/byutv.py
+++ b/youtube_dl/extractor/byutv.py
@@ -11,6 +11,7 @@ class BYUtvIE(InfoExtractor):
_VALID_URL = r'^https?://(?:www\.)?byutv.org/watch/[0-9a-f-]+/(?P<video_id>[^/?#]+)'
_TEST = {
'url': 'http://www.byutv.org/watch/6587b9a3-89d2-42a6-a7f7-fd2f81840a7d/studio-c-season-5-episode-5',
+ 'md5': '05850eb8c749e2ee05ad5a1c34668493',
'info_dict': {
'id': 'studio-c-season-5-episode-5',
'ext': 'mp4',
@@ -21,7 +22,8 @@ class BYUtvIE(InfoExtractor):
},
'params': {
'skip_download': True,
- }
+ },
+ 'add_ie': ['Ooyala'],
}
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/c56.py b/youtube_dl/extractor/c56.py
index cb96c38..cac8fdc 100644
--- a/youtube_dl/extractor/c56.py
+++ b/youtube_dl/extractor/c56.py
@@ -4,12 +4,13 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..utils import js_to_json
class C56IE(InfoExtractor):
_VALID_URL = r'https?://(?:(?:www|player)\.)?56\.com/(?:.+?/)?(?:v_|(?:play_album.+-))(?P<textid>.+?)\.(?:html|swf)'
IE_NAME = '56.com'
- _TEST = {
+ _TESTS = [{
'url': 'http://www.56.com/u39/v_OTM0NDA3MTY.html',
'md5': 'e59995ac63d0457783ea05f93f12a866',
'info_dict': {
@@ -18,12 +19,29 @@ class C56IE(InfoExtractor):
'title': '网事知多少 第32期:车怒',
'duration': 283.813,
},
- }
+ }, {
+ 'url': 'http://www.56.com/u47/v_MTM5NjQ5ODc2.html',
+ 'md5': '',
+ 'info_dict': {
+ 'id': '82247482',
+ 'title': '爱的诅咒之杜鹃花开',
+ },
+ 'playlist_count': 7,
+ 'add_ie': ['Sohu'],
+ }]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url, flags=re.VERBOSE)
text_id = mobj.group('textid')
+ webpage = self._download_webpage(url, text_id)
+ sohu_video_info_str = self._search_regex(
+ r'var\s+sohuVideoInfo\s*=\s*({[^}]+});', webpage, 'Sohu video info', default=None)
+ if sohu_video_info_str:
+ sohu_video_info = self._parse_json(
+ sohu_video_info_str, text_id, transform_source=js_to_json)
+ return self.url_result(sohu_video_info['url'], 'Sohu')
+
page = self._download_json(
'http://vxml.56.com/json/%s/' % text_id, text_id, 'Downloading video info')
diff --git a/youtube_dl/extractor/camdemy.py b/youtube_dl/extractor/camdemy.py
index 897f3a1..6ffbeab 100644
--- a/youtube_dl/extractor/camdemy.py
+++ b/youtube_dl/extractor/camdemy.py
@@ -6,7 +6,7 @@ import re
from .common import InfoExtractor
from ..compat import (
- compat_urllib_parse,
+ compat_urllib_parse_urlencode,
compat_urlparse,
)
from ..utils import (
@@ -16,7 +16,7 @@ from ..utils import (
class CamdemyIE(InfoExtractor):
- _VALID_URL = r'http://(?:www\.)?camdemy\.com/media/(?P<id>\d+)'
+ _VALID_URL = r'https?://(?:www\.)?camdemy\.com/media/(?P<id>\d+)'
_TESTS = [{
# single file
'url': 'http://www.camdemy.com/media/5181/',
@@ -104,7 +104,7 @@ class CamdemyIE(InfoExtractor):
class CamdemyFolderIE(InfoExtractor):
- _VALID_URL = r'http://www.camdemy.com/folder/(?P<id>\d+)'
+ _VALID_URL = r'https?://www.camdemy.com/folder/(?P<id>\d+)'
_TESTS = [{
# links with trailing slash
'url': 'http://www.camdemy.com/folder/450',
@@ -139,7 +139,7 @@ class CamdemyFolderIE(InfoExtractor):
parsed_url = list(compat_urlparse.urlparse(url))
query = dict(compat_urlparse.parse_qsl(parsed_url[4]))
query.update({'displayMode': 'list'})
- parsed_url[4] = compat_urllib_parse.urlencode(query)
+ parsed_url[4] = compat_urllib_parse_urlencode(query)
final_url = compat_urlparse.urlunparse(parsed_url)
page = self._download_webpage(final_url, folder_id)
diff --git a/youtube_dl/extractor/camwithher.py b/youtube_dl/extractor/camwithher.py
new file mode 100644
index 0000000..afbc5ea
--- /dev/null
+++ b/youtube_dl/extractor/camwithher.py
@@ -0,0 +1,87 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ int_or_none,
+ parse_duration,
+ unified_strdate,
+)
+
+
+class CamWithHerIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?camwithher\.tv/view_video\.php\?.*\bviewkey=(?P<id>\w+)'
+
+ _TESTS = [{
+ 'url': 'http://camwithher.tv/view_video.php?viewkey=6e9a24e2c0e842e1f177&page=&viewtype=&category=',
+ 'info_dict': {
+ 'id': '5644',
+ 'ext': 'flv',
+ 'title': 'Periscope Tease',
+ 'description': 'In the clouds teasing on periscope to my favorite song',
+ 'duration': 240,
+ 'view_count': int,
+ 'comment_count': int,
+ 'uploader': 'MileenaK',
+ 'upload_date': '20160322',
+ },
+ 'params': {
+ 'skip_download': True,
+ }
+ }, {
+ 'url': 'http://camwithher.tv/view_video.php?viewkey=6dfd8b7c97531a459937',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://camwithher.tv/view_video.php?page=&viewkey=6e9a24e2c0e842e1f177&viewtype=&category=',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://camwithher.tv/view_video.php?viewkey=b6c3b5bea9515d1a1fc4&page=&viewtype=&category=mv',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ flv_id = self._html_search_regex(
+ r'<a[^>]+href=["\']/download/\?v=(\d+)', webpage, 'video id')
+
+ # Video URL construction algorithm is reverse-engineered from cwhplayer.swf
+ rtmp_url = 'rtmp://camwithher.tv/clipshare/%s' % (
+ ('mp4:%s.mp4' % flv_id) if int(flv_id) > 2010 else flv_id)
+
+ title = self._html_search_regex(
+ r'<div[^>]+style="float:left"[^>]*>\s*<h2>(.+?)</h2>', webpage, 'title')
+ description = self._html_search_regex(
+ r'>Description:</span>(.+?)</div>', webpage, 'description', default=None)
+
+ runtime = self._search_regex(
+ r'Runtime\s*:\s*(.+?) \|', webpage, 'duration', default=None)
+ if runtime:
+ runtime = re.sub(r'[\s-]', '', runtime)
+ duration = parse_duration(runtime)
+ view_count = int_or_none(self._search_regex(
+ r'Views\s*:\s*(\d+)', webpage, 'view count', default=None))
+ comment_count = int_or_none(self._search_regex(
+ r'Comments\s*:\s*(\d+)', webpage, 'comment count', default=None))
+
+ uploader = self._search_regex(
+ r'Added by\s*:\s*<a[^>]+>([^<]+)</a>', webpage, 'uploader', default=None)
+ upload_date = unified_strdate(self._search_regex(
+ r'Added on\s*:\s*([\d-]+)', webpage, 'upload date', default=None))
+
+ return {
+ 'id': flv_id,
+ 'url': rtmp_url,
+ 'ext': 'flv',
+ 'no_resume': True,
+ 'title': title,
+ 'description': description,
+ 'duration': duration,
+ 'view_count': view_count,
+ 'comment_count': comment_count,
+ 'uploader': uploader,
+ 'upload_date': upload_date,
+ }
diff --git a/youtube_dl/extractor/canalplus.py b/youtube_dl/extractor/canalplus.py
index 25b2d4e..61463f2 100644
--- a/youtube_dl/extractor/canalplus.py
+++ b/youtube_dl/extractor/canalplus.py
@@ -4,11 +4,11 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..compat import compat_urllib_parse_urlparse
from ..utils import (
ExtractorError,
HEADRequest,
unified_strdate,
- url_basename,
qualities,
int_or_none,
)
@@ -16,24 +16,38 @@ from ..utils import (
class CanalplusIE(InfoExtractor):
IE_DESC = 'canalplus.fr, piwiplus.fr and d8.tv'
- _VALID_URL = r'https?://(?:www\.(?P<site>canalplus\.fr|piwiplus\.fr|d8\.tv|itele\.fr)/.*?/(?P<path>.*)|player\.canalplus\.fr/#/(?P<id>[0-9]+))'
+ _VALID_URL = r'''(?x)
+ https?://
+ (?:
+ (?:
+ (?:(?:www|m)\.)?canalplus\.fr|
+ (?:www\.)?piwiplus\.fr|
+ (?:www\.)?d8\.tv|
+ (?:www\.)?d17\.tv|
+ (?:www\.)?itele\.fr
+ )/(?:(?:[^/]+/)*(?P<display_id>[^/?#&]+))?(?:\?.*\bvid=(?P<vid>\d+))?|
+ player\.canalplus\.fr/#/(?P<id>\d+)
+ )
+
+ '''
_VIDEO_INFO_TEMPLATE = 'http://service.canal-plus.com/video/rest/getVideosLiees/%s/%s?format=json'
_SITE_ID_MAP = {
- 'canalplus.fr': 'cplus',
- 'piwiplus.fr': 'teletoon',
- 'd8.tv': 'd8',
- 'itele.fr': 'itele',
+ 'canalplus': 'cplus',
+ 'piwiplus': 'teletoon',
+ 'd8': 'd8',
+ 'd17': 'd17',
+ 'itele': 'itele',
}
_TESTS = [{
- 'url': 'http://www.canalplus.fr/c-emissions/pid1830-c-zapping.html?vid=1263092',
- 'md5': '12164a6f14ff6df8bd628e8ba9b10b78',
+ 'url': 'http://www.canalplus.fr/c-emissions/pid1830-c-zapping.html?vid=1192814',
+ 'md5': '41f438a4904f7664b91b4ed0dec969dc',
'info_dict': {
- 'id': '1263092',
+ 'id': '1192814',
'ext': 'mp4',
- 'title': 'Le Zapping - 13/05/15',
- 'description': 'md5:09738c0d06be4b5d06a0940edb0da73f',
- 'upload_date': '20150513',
+ 'title': "L'Année du Zapping 2014 - L'Année du Zapping 2014",
+ 'description': "Toute l'année 2014 dans un Zapping exceptionnel !",
+ 'upload_date': '20150105',
},
}, {
'url': 'http://www.piwiplus.fr/videos-piwi/pid1405-le-labyrinthe-boing-super-ranger.html?vid=1108190',
@@ -46,35 +60,45 @@ class CanalplusIE(InfoExtractor):
},
'skip': 'Only works from France',
}, {
- 'url': 'http://www.d8.tv/d8-docs-mags/pid6589-d8-campagne-intime.html',
+ 'url': 'http://www.d8.tv/d8-docs-mags/pid5198-d8-en-quete-d-actualite.html?vid=1390231',
'info_dict': {
- 'id': '966289',
- 'ext': 'flv',
- 'title': 'Campagne intime - Documentaire exceptionnel',
- 'description': 'md5:d2643b799fb190846ae09c61e59a859f',
- 'upload_date': '20131108',
+ 'id': '1390231',
+ 'ext': 'mp4',
+ 'title': "Vacances pas chères : prix discount ou grosses dépenses ? - En quête d'actualité",
+ 'description': 'md5:edb6cf1cb4a1e807b5dd089e1ac8bfc6',
+ 'upload_date': '20160512',
+ },
+ 'params': {
+ 'skip_download': True,
},
- 'skip': 'videos get deleted after a while',
}, {
- 'url': 'http://www.itele.fr/france/video/aubervilliers-un-lycee-en-colere-111559',
- 'md5': '38b8f7934def74f0d6f3ba6c036a5f82',
+ 'url': 'http://www.itele.fr/chroniques/invite-bruce-toussaint/thierry-solere-nicolas-sarkozy-officialisera-sa-candidature-a-la-primaire-quand-il-le-voudra-167224',
'info_dict': {
- 'id': '1213714',
+ 'id': '1398334',
'ext': 'mp4',
- 'title': 'Aubervilliers : un lycée en colère - Le 11/02/2015 à 06h45',
- 'description': 'md5:8216206ec53426ea6321321f3b3c16db',
- 'upload_date': '20150211',
+ 'title': "L'invité de Bruce Toussaint du 07/06/2016 - ",
+ 'description': 'md5:40ac7c9ad0feaeb6f605bad986f61324',
+ 'upload_date': '20160607',
},
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'http://m.canalplus.fr/?vid=1398231',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.d17.tv/emissions/pid8303-lolywood.html?vid=1397061',
+ 'only_matching': True,
}]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- video_id = mobj.groupdict().get('id')
+ video_id = mobj.groupdict().get('id') or mobj.groupdict().get('vid')
- site_id = self._SITE_ID_MAP[mobj.group('site') or 'canal']
+ site_id = self._SITE_ID_MAP[compat_urllib_parse_urlparse(url).netloc.rsplit('.', 2)[-2]]
# Beware, some subclasses do not define an id group
- display_id = url_basename(mobj.group('path'))
+ display_id = mobj.group('display_id') or video_id
if video_id is None:
webpage = self._download_webpage(url, display_id)
diff --git a/youtube_dl/extractor/carambatv.py b/youtube_dl/extractor/carambatv.py
new file mode 100644
index 0000000..5797fb9
--- /dev/null
+++ b/youtube_dl/extractor/carambatv.py
@@ -0,0 +1,88 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+ float_or_none,
+ int_or_none,
+ try_get,
+)
+
+
+class CarambaTVIE(InfoExtractor):
+ _VALID_URL = r'(?:carambatv:|https?://video1\.carambatv\.ru/v/)(?P<id>\d+)'
+ _TESTS = [{
+ 'url': 'http://video1.carambatv.ru/v/191910501',
+ 'md5': '2f4a81b7cfd5ab866ee2d7270cb34a2a',
+ 'info_dict': {
+ 'id': '191910501',
+ 'ext': 'mp4',
+ 'title': '[BadComedian] - Разборка в Маниле (Абсолютный обзор)',
+ 'thumbnail': 're:^https?://.*\.jpg',
+ 'duration': 2678.31,
+ },
+ }, {
+ 'url': 'carambatv:191910501',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ video = self._download_json(
+ 'http://video1.carambatv.ru/v/%s/videoinfo.js' % video_id,
+ video_id)
+
+ title = video['title']
+
+ base_url = video.get('video') or 'http://video1.carambatv.ru/v/%s/' % video_id
+
+ formats = [{
+ 'url': base_url + f['fn'],
+ 'height': int_or_none(f.get('height')),
+ 'format_id': '%sp' % f['height'] if f.get('height') else None,
+ } for f in video['qualities'] if f.get('fn')]
+ self._sort_formats(formats)
+
+ thumbnail = video.get('splash')
+ duration = float_or_none(try_get(
+ video, lambda x: x['annotations'][0]['end_time'], compat_str))
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'thumbnail': thumbnail,
+ 'duration': duration,
+ 'formats': formats,
+ }
+
+
+class CarambaTVPageIE(InfoExtractor):
+ _VALID_URL = r'https?://carambatv\.ru/(?:[^/]+/)+(?P<id>[^/?#&]+)'
+ _TEST = {
+ 'url': 'http://carambatv.ru/movie/bad-comedian/razborka-v-manile/',
+ 'md5': '',
+ 'info_dict': {
+ 'id': '191910501',
+ 'ext': 'mp4',
+ 'title': '[BadComedian] - Разборка в Маниле (Абсолютный обзор)',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'duration': 2678.31,
+ },
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ video_url = self._og_search_property('video:iframe', webpage, default=None)
+
+ if not video_url:
+ video_id = self._search_regex(
+ r'(?:video_id|crmb_vuid)\s*[:=]\s*["\']?(\d+)',
+ webpage, 'video id')
+ video_url = 'carambatv:%s' % video_id
+
+ return self.url_result(video_url, CarambaTVIE.ie_key())
diff --git a/youtube_dl/extractor/cbc.py b/youtube_dl/extractor/cbc.py
index d8aa310..ff663d0 100644
--- a/youtube_dl/extractor/cbc.py
+++ b/youtube_dl/extractor/cbc.py
@@ -4,64 +4,66 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import js_to_json
+from ..utils import (
+ js_to_json,
+ smuggle_url,
+)
class CBCIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?cbc\.ca/(?:[^/]+/)+(?P<id>[^/?#]+)'
+ _VALID_URL = r'https?://(?:www\.)?cbc\.ca/(?!player/)(?:[^/]+/)+(?P<id>[^/?#]+)'
_TESTS = [{
# with mediaId
'url': 'http://www.cbc.ca/22minutes/videos/clips-season-23/don-cherry-play-offs',
+ 'md5': '97e24d09672fc4cf56256d6faa6c25bc',
'info_dict': {
'id': '2682904050',
- 'ext': 'flv',
+ 'ext': 'mp4',
'title': 'Don Cherry – All-Stars',
'description': 'Don Cherry has a bee in his bonnet about AHL player John Scott because that guy’s got heart.',
- 'timestamp': 1454475540,
+ 'timestamp': 1454463000,
'upload_date': '20160203',
- },
- 'params': {
- # rtmp download
- 'skip_download': True,
+ 'uploader': 'CBCC-NEW',
},
}, {
# with clipId
'url': 'http://www.cbc.ca/archives/entry/1978-robin-williams-freestyles-on-90-minutes-live',
+ 'md5': '0274a90b51a9b4971fe005c63f592f12',
'info_dict': {
'id': '2487345465',
- 'ext': 'flv',
+ 'ext': 'mp4',
'title': 'Robin Williams freestyles on 90 Minutes Live',
'description': 'Wacky American comedian Robin Williams shows off his infamous "freestyle" comedic talents while being interviewed on CBC\'s 90 Minutes Live.',
- 'upload_date': '19700101',
- },
- 'params': {
- # rtmp download
- 'skip_download': True,
+ 'upload_date': '19780210',
+ 'uploader': 'CBCC-NEW',
+ 'timestamp': 255977160,
},
}, {
# multiple iframes
'url': 'http://www.cbc.ca/natureofthings/blog/birds-eye-view-from-vancouvers-burrard-street-bridge-how-we-got-the-shot',
'playlist': [{
+ 'md5': '377572d0b49c4ce0c9ad77470e0b96b4',
'info_dict': {
'id': '2680832926',
- 'ext': 'flv',
+ 'ext': 'mp4',
'title': 'An Eagle\'s-Eye View Off Burrard Bridge',
'description': 'Hercules the eagle flies from Vancouver\'s Burrard Bridge down to a nearby park with a mini-camera strapped to his back.',
- 'upload_date': '19700101',
+ 'upload_date': '20160201',
+ 'timestamp': 1454342820,
+ 'uploader': 'CBCC-NEW',
},
}, {
+ 'md5': '415a0e3f586113894174dfb31aa5bb1a',
'info_dict': {
'id': '2658915080',
- 'ext': 'flv',
+ 'ext': 'mp4',
'title': 'Fly like an eagle!',
'description': 'Eagle equipped with a mini camera flies from the world\'s tallest tower',
- 'upload_date': '19700101',
+ 'upload_date': '20150315',
+ 'timestamp': 1426443984,
+ 'uploader': 'CBCC-NEW',
},
}],
- 'params': {
- # rtmp download
- 'skip_download': True,
- },
}]
@classmethod
@@ -90,24 +92,54 @@ class CBCIE(InfoExtractor):
class CBCPlayerIE(InfoExtractor):
_VALID_URL = r'(?:cbcplayer:|https?://(?:www\.)?cbc\.ca/(?:player/play/|i/caffeine/syndicate/\?mediaId=))(?P<id>\d+)'
- _TEST = {
+ _TESTS = [{
'url': 'http://www.cbc.ca/player/play/2683190193',
+ 'md5': '64d25f841ddf4ddb28a235338af32e2c',
'info_dict': {
'id': '2683190193',
- 'ext': 'flv',
+ 'ext': 'mp4',
'title': 'Gerry Runs a Sweat Shop',
'description': 'md5:b457e1c01e8ff408d9d801c1c2cd29b0',
- 'timestamp': 1455067800,
+ 'timestamp': 1455071400,
'upload_date': '20160210',
+ 'uploader': 'CBCC-NEW',
},
- 'params': {
- # rtmp download
- 'skip_download': True,
+ }, {
+ # Redirected from http://www.cbc.ca/player/AudioMobile/All%20in%20a%20Weekend%20Montreal/ID/2657632011/
+ 'url': 'http://www.cbc.ca/player/play/2657631896',
+ 'md5': 'e5e708c34ae6fca156aafe17c43e8b75',
+ 'info_dict': {
+ 'id': '2657631896',
+ 'ext': 'mp3',
+ 'title': 'CBC Montreal is organizing its first ever community hackathon!',
+ 'description': 'The modern technology we tend to depend on so heavily, is never without it\'s share of hiccups and headaches. Next weekend - CBC Montreal will be getting members of the public for its first Hackathon.',
+ 'timestamp': 1425704400,
+ 'upload_date': '20150307',
+ 'uploader': 'CBCC-NEW',
},
- }
+ }, {
+ # available only when we add `formats=MPEG4,FLV,MP3` to theplatform url
+ 'url': 'http://www.cbc.ca/player/play/2164402062',
+ 'md5': '17a61eb813539abea40618d6323a7f82',
+ 'info_dict': {
+ 'id': '2164402062',
+ 'ext': 'flv',
+ 'title': 'Cancer survivor four times over',
+ 'description': 'Tim Mayer has beaten three different forms of cancer four times in five years.',
+ 'timestamp': 1320410746,
+ 'upload_date': '20111104',
+ 'uploader': 'CBCC-NEW',
+ },
+ }]
def _real_extract(self, url):
video_id = self._match_id(url)
- return self.url_result(
- 'http://feed.theplatform.com/f/ExhSPC/vms_5akSXx4Ng_Zn?byGuid=%s' % video_id,
- 'ThePlatformFeed', video_id)
+ return {
+ '_type': 'url_transparent',
+ 'ie_key': 'ThePlatform',
+ 'url': smuggle_url(
+ 'http://link.theplatform.com/s/ExhSPC/media/guid/2655402169/%s?mbr=true&formats=MPEG4,FLV,MP3' % video_id, {
+ 'force_smil_url': True
+ }),
+ 'id': video_id,
+ }
diff --git a/youtube_dl/extractor/cbs.py b/youtube_dl/extractor/cbs.py
index 40d07ab..a23173d 100644
--- a/youtube_dl/extractor/cbs.py
+++ b/youtube_dl/extractor/cbs.py
@@ -1,44 +1,53 @@
from __future__ import unicode_literals
-from .common import InfoExtractor
+from .theplatform import ThePlatformFeedIE
from ..utils import (
- sanitized_Request,
- smuggle_url,
+ int_or_none,
+ find_xpath_attr,
)
-class CBSIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?(?:cbs\.com/shows/[^/]+/(?:video|artist)|colbertlateshow\.com/(?:video|podcasts))/[^/]+/(?P<id>[^/]+)'
+class CBSBaseIE(ThePlatformFeedIE):
+ def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
+ closed_caption_e = find_xpath_attr(smil, self._xpath_ns('.//param', namespace), 'name', 'ClosedCaptionURL')
+ return {
+ 'en': [{
+ 'ext': 'ttml',
+ 'url': closed_caption_e.attrib['value'],
+ }]
+ } if closed_caption_e is not None and closed_caption_e.attrib.get('value') else []
+
+ def _extract_video_info(self, filter_query, video_id):
+ return self._extract_feed_info(
+ 'dJ5BDC', 'VxxJg8Ymh8sE', filter_query, video_id, lambda entry: {
+ 'series': entry.get('cbs$SeriesTitle'),
+ 'season_number': int_or_none(entry.get('cbs$SeasonNumber')),
+ 'episode': entry.get('cbs$EpisodeTitle'),
+ 'episode_number': int_or_none(entry.get('cbs$EpisodeNumber')),
+ }, {
+ 'StreamPack': {
+ 'manifest': 'm3u',
+ }
+ })
+
+
+class CBSIE(CBSBaseIE):
+ _VALID_URL = r'(?:cbs:|https?://(?:www\.)?(?:cbs\.com/shows/[^/]+/video|colbertlateshow\.com/(?:video|podcasts))/)(?P<id>[\w-]+)'
_TESTS = [{
'url': 'http://www.cbs.com/shows/garth-brooks/video/_u7W953k6la293J7EPTd9oHkSPs6Xn6_/connect-chat-feat-garth-brooks/',
'info_dict': {
- 'id': '4JUVEwq3wUT7',
+ 'id': '_u7W953k6la293J7EPTd9oHkSPs6Xn6_',
'display_id': 'connect-chat-feat-garth-brooks',
- 'ext': 'flv',
+ 'ext': 'mp4',
'title': 'Connect Chat feat. Garth Brooks',
'description': 'Connect with country music singer Garth Brooks, as he chats with fans on Wednesday November 27, 2013. Be sure to tune in to Garth Brooks: Live from Las Vegas, Friday November 29, at 9/8c on CBS!',
'duration': 1495,
+ 'timestamp': 1385585425,
+ 'upload_date': '20131127',
+ 'uploader': 'CBSI-NEW',
},
- 'params': {
- # rtmp download
- 'skip_download': True,
- },
- '_skip': 'Blocked outside the US',
- }, {
- 'url': 'http://www.cbs.com/shows/liveonletterman/artist/221752/st-vincent/',
- 'info_dict': {
- 'id': 'WWF_5KqY3PK1',
- 'display_id': 'st-vincent',
- 'ext': 'flv',
- 'title': 'Live on Letterman - St. Vincent',
- 'description': 'Live On Letterman: St. Vincent in concert from New York\'s Ed Sullivan Theater on Tuesday, July 16, 2014.',
- 'duration': 3221,
- },
- 'params': {
- # rtmp download
- 'skip_download': True,
- },
+ 'expected_warnings': ['Failed to download m3u8 information'],
'_skip': 'Blocked outside the US',
}, {
'url': 'http://colbertlateshow.com/video/8GmB0oY0McANFvp2aEffk9jZZZ2YyXxy/the-colbeard/',
@@ -47,22 +56,8 @@ class CBSIE(InfoExtractor):
'url': 'http://www.colbertlateshow.com/podcasts/dYSwjqPs_X1tvbV_P2FcPWRa_qT6akTC/in-the-bad-room-with-stephen/',
'only_matching': True,
}]
+ TP_RELEASE_URL_TEMPLATE = 'http://link.theplatform.com/s/dJ5BDC/%s?mbr=true'
def _real_extract(self, url):
- display_id = self._match_id(url)
- request = sanitized_Request(url)
- # Android UA is served with higher quality (720p) streams (see
- # https://github.com/rg3/youtube-dl/issues/7490)
- request.add_header('User-Agent', 'Mozilla/5.0 (Linux; Android 4.4; Nexus 5)')
- webpage = self._download_webpage(request, display_id)
- real_id = self._search_regex(
- [r"video\.settings\.pid\s*=\s*'([^']+)';", r"cbsplayer\.pid\s*=\s*'([^']+)';"],
- webpage, 'real video ID')
- return {
- '_type': 'url_transparent',
- 'ie_key': 'ThePlatform',
- 'url': smuggle_url(
- 'http://link.theplatform.com/s/dJ5BDC/%s?mbr=true&manifest=m3u' % real_id,
- {'force_smil_url': True}),
- 'display_id': display_id,
- }
+ content_id = self._match_id(url)
+ return self._extract_video_info('byGuid=%s' % content_id, content_id)
diff --git a/youtube_dl/extractor/cnet.py b/youtube_dl/extractor/cbsinteractive.py
index 5c3908f..0011c30 100644
--- a/youtube_dl/extractor/cnet.py
+++ b/youtube_dl/extractor/cbsinteractive.py
@@ -1,12 +1,14 @@
# coding: utf-8
from __future__ import unicode_literals
+import re
+
from .theplatform import ThePlatformIE
from ..utils import int_or_none
-class CNETIE(ThePlatformIE):
- _VALID_URL = r'https?://(?:www\.)?cnet\.com/videos/(?P<id>[^/]+)/'
+class CBSInteractiveIE(ThePlatformIE):
+ _VALID_URL = r'https?://(?:www\.)?(?P<site>cnet|zdnet)\.com/(?:videos|video/share)/(?P<id>[^/?]+)'
_TESTS = [{
'url': 'http://www.cnet.com/videos/hands-on-with-microsofts-windows-8-1-update/',
'info_dict': {
@@ -17,6 +19,8 @@ class CNETIE(ThePlatformIE):
'uploader_id': '6085384d-619e-11e3-b231-14feb5ca9861',
'uploader': 'Sarah Mitroff',
'duration': 70,
+ 'timestamp': 1396479627,
+ 'upload_date': '20140402',
},
}, {
'url': 'http://www.cnet.com/videos/whiny-pothole-tweets-at-local-government-when-hit-by-cars-tomorrow-daily-187/',
@@ -28,15 +32,38 @@ class CNETIE(ThePlatformIE):
'uploader_id': 'b163284d-6b73-44fc-b3e6-3da66c392d40',
'uploader': 'Ashley Esqueda',
'duration': 1482,
+ 'timestamp': 1433289889,
+ 'upload_date': '20150603',
+ },
+ }, {
+ 'url': 'http://www.zdnet.com/video/share/video-keeping-android-smartphones-and-tablets-secure/',
+ 'info_dict': {
+ 'id': 'bc1af9f0-a2b5-4e54-880d-0d95525781c0',
+ 'ext': 'mp4',
+ 'title': 'Video: Keeping Android smartphones and tablets secure',
+ 'description': 'Here\'s the best way to keep Android devices secure, and what you do when they\'ve come to the end of their lives.',
+ 'uploader_id': 'f2d97ea2-8175-11e2-9d12-0018fe8a00b0',
+ 'uploader': 'Adrian Kingsley-Hughes',
+ 'timestamp': 1448961720,
+ 'upload_date': '20151201',
},
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ }
}]
+ TP_RELEASE_URL_TEMPLATE = 'http://link.theplatform.com/s/kYEXFC/%s?mbr=true'
+ MPX_ACCOUNTS = {
+ 'cnet': 2288573011,
+ 'zdnet': 2387448114,
+ }
def _real_extract(self, url):
- display_id = self._match_id(url)
+ site, display_id = re.match(self._VALID_URL, url).groups()
webpage = self._download_webpage(url, display_id)
data_json = self._html_search_regex(
- r"data-cnet-video(?:-uvp)?-options='([^']+)'",
+ r"data-(?:cnet|zdnet)-video(?:-uvp)?-options='([^']+)'",
webpage, 'data json')
data = self._parse_json(data_json, display_id)
vdata = data.get('video') or data['videos'][0]
@@ -51,18 +78,15 @@ class CNETIE(ThePlatformIE):
uploader = None
uploader_id = None
- mpx_account = data['config']['uvpConfig']['default']['mpx_account']
-
- metadata = self.get_metadata('%s/%s' % (mpx_account, list(vdata['files'].values())[0]), video_id)
- description = vdata.get('description') or metadata.get('description')
- duration = int_or_none(vdata.get('duration')) or metadata.get('duration')
-
- formats = []
- subtitles = {}
+ media_guid_path = 'media/guid/%d/%s' % (self.MPX_ACCOUNTS[site], vdata['mpxRefId'])
+ formats, subtitles = [], {}
+ if site == 'cnet':
+ formats, subtitles = self._extract_theplatform_smil(
+ self.TP_RELEASE_URL_TEMPLATE % media_guid_path, video_id)
for (fkey, vid) in vdata['files'].items():
if fkey == 'hls_phone' and 'hls_tablet' in vdata['files']:
continue
- release_url = 'http://link.theplatform.com/s/%s/%s?format=SMIL&mbr=true' % (mpx_account, vid)
+ release_url = self.TP_RELEASE_URL_TEMPLATE % vid
if fkey == 'hds':
release_url += '&manifest=f4m'
tp_formats, tp_subtitles = self._extract_theplatform_smil(release_url, video_id, 'Downloading %s SMIL data' % fkey)
@@ -70,15 +94,15 @@ class CNETIE(ThePlatformIE):
subtitles = self._merge_subtitles(subtitles, tp_subtitles)
self._sort_formats(formats)
- return {
+ info = self.get_metadata('kYEXFC/%s' % media_guid_path, video_id)
+ info.update({
'id': video_id,
'display_id': display_id,
'title': title,
- 'description': description,
- 'thumbnail': metadata.get('thumbnail'),
- 'duration': duration,
+ 'duration': int_or_none(vdata.get('duration')),
'uploader': uploader,
'uploader_id': uploader_id,
'subtitles': subtitles,
'formats': formats,
- }
+ })
+ return info
diff --git a/youtube_dl/extractor/cbslocal.py b/youtube_dl/extractor/cbslocal.py
new file mode 100644
index 0000000..74adb38
--- /dev/null
+++ b/youtube_dl/extractor/cbslocal.py
@@ -0,0 +1,84 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import calendar
+import datetime
+
+from .anvato import AnvatoIE
+from .sendtonews import SendtoNewsIE
+from ..compat import compat_urlparse
+
+
+class CBSLocalIE(AnvatoIE):
+ _VALID_URL = r'https?://[a-z]+\.cbslocal\.com/\d+/\d+/\d+/(?P<id>[0-9a-z-]+)'
+
+ _TESTS = [{
+ # Anvato backend
+ 'url': 'http://losangeles.cbslocal.com/2016/05/16/safety-advocates-say-fatal-car-seat-failures-are-public-health-crisis',
+ 'md5': 'f0ee3081e3843f575fccef901199b212',
+ 'info_dict': {
+ 'id': '3401037',
+ 'ext': 'mp4',
+ 'title': 'Safety Advocates Say Fatal Car Seat Failures Are \'Public Health Crisis\'',
+ 'description': 'Collapsing seats have been the focus of scrutiny for decades, though experts say remarkably little has been done to address the issue. Randy Paige reports.',
+ 'thumbnail': 're:^https?://.*',
+ 'timestamp': 1463440500,
+ 'upload_date': '20160516',
+ 'subtitles': {
+ 'en': 'mincount:5',
+ },
+ 'categories': [
+ 'Stations\\Spoken Word\\KCBSTV',
+ 'Syndication\\MSN',
+ 'Syndication\\NDN',
+ 'Syndication\\AOL',
+ 'Syndication\\Yahoo',
+ 'Syndication\\Tribune',
+ 'Syndication\\Curb.tv',
+ 'Content\\News'
+ ],
+ },
+ }, {
+ # SendtoNews embed
+ 'url': 'http://cleveland.cbslocal.com/2016/05/16/indians-score-season-high-15-runs-in-blowout-win-over-reds-rapid-reaction/',
+ 'info_dict': {
+ 'id': 'GxfCe0Zo7D-175909-5588',
+ 'ext': 'mp4',
+ 'title': 'Recap: CLE 15, CIN 6',
+ 'description': '5/16/16: Indians\' bats explode for 15 runs in a win',
+ 'upload_date': '20160516',
+ 'timestamp': 1463433840,
+ 'duration': 49,
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }]
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
+
+ sendtonews_url = SendtoNewsIE._extract_url(webpage)
+ if sendtonews_url:
+ info_dict = {
+ '_type': 'url_transparent',
+ 'url': compat_urlparse.urljoin(url, sendtonews_url),
+ }
+ else:
+ info_dict = self._extract_anvato_videos(webpage, display_id)
+
+ time_str = self._html_search_regex(
+ r'class="entry-date">([^<]+)<', webpage, 'released date', fatal=False)
+ timestamp = None
+ if time_str:
+ timestamp = calendar.timegm(datetime.datetime.strptime(
+ time_str, '%b %d, %Y %I:%M %p').timetuple())
+
+ info_dict.update({
+ 'display_id': display_id,
+ 'timestamp': timestamp,
+ })
+
+ return info_dict
diff --git a/youtube_dl/extractor/cbsnews.py b/youtube_dl/extractor/cbsnews.py
index 7319ee1..387537e 100644
--- a/youtube_dl/extractor/cbsnews.py
+++ b/youtube_dl/extractor/cbsnews.py
@@ -2,16 +2,15 @@
from __future__ import unicode_literals
from .common import InfoExtractor
-from .theplatform import ThePlatformIE
+from .cbs import CBSBaseIE
from ..utils import (
parse_duration,
- find_xpath_attr,
)
-class CBSNewsIE(ThePlatformIE):
+class CBSNewsIE(CBSBaseIE):
IE_DESC = 'CBS News'
- _VALID_URL = r'http://(?:www\.)?cbsnews\.com/(?:news|videos)/(?P<id>[\da-z_-]+)'
+ _VALID_URL = r'https?://(?:www\.)?cbsnews\.com/(?:news|videos)/(?P<id>[\da-z_-]+)'
_TESTS = [
{
@@ -31,9 +30,12 @@ class CBSNewsIE(ThePlatformIE):
{
'url': 'http://www.cbsnews.com/videos/fort-hood-shooting-army-downplays-mental-illness-as-cause-of-attack/',
'info_dict': {
- 'id': 'fort-hood-shooting-army-downplays-mental-illness-as-cause-of-attack',
+ 'id': 'SNJBOYzXiWBOvaLsdzwH8fmtP1SCd91Y',
'ext': 'mp4',
'title': 'Fort Hood shooting: Army downplays mental illness as cause of attack',
+ 'description': 'md5:4a6983e480542d8b333a947bfc64ddc7',
+ 'upload_date': '19700101',
+ 'uploader': 'CBSI-NEW',
'thumbnail': 're:^https?://.*\.jpg$',
'duration': 205,
'subtitles': {
@@ -49,15 +51,6 @@ class CBSNewsIE(ThePlatformIE):
},
]
- def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
- closed_caption_e = find_xpath_attr(smil, self._xpath_ns('.//param', namespace), 'name', 'ClosedCaptionURL')
- return {
- 'en': [{
- 'ext': 'ttml',
- 'url': closed_caption_e.attrib['value'],
- }]
- } if closed_caption_e is not None and closed_caption_e.attrib.get('value') else []
-
def _real_extract(self, url):
video_id = self._match_id(url)
@@ -68,35 +61,13 @@ class CBSNewsIE(ThePlatformIE):
webpage, 'video JSON info'), video_id)
item = video_info['item'] if 'item' in video_info else video_info
- title = item.get('articleTitle') or item.get('hed')
- duration = item.get('duration')
- thumbnail = item.get('mediaImage') or item.get('thumbnail')
-
- subtitles = {}
- formats = []
- for format_id in ['RtmpMobileLow', 'RtmpMobileHigh', 'Hls', 'RtmpDesktop']:
- pid = item.get('media' + format_id)
- if not pid:
- continue
- release_url = 'http://link.theplatform.com/s/dJ5BDC/%s?format=SMIL&mbr=true' % pid
- tp_formats, tp_subtitles = self._extract_theplatform_smil(release_url, video_id, 'Downloading %s SMIL data' % pid)
- formats.extend(tp_formats)
- subtitles = self._merge_subtitles(subtitles, tp_subtitles)
- self._sort_formats(formats)
-
- return {
- 'id': video_id,
- 'title': title,
- 'thumbnail': thumbnail,
- 'duration': duration,
- 'formats': formats,
- 'subtitles': subtitles,
- }
+ guid = item['mpxRefId']
+ return self._extract_video_info('byGuid=%s' % guid, guid)
class CBSNewsLiveVideoIE(InfoExtractor):
IE_DESC = 'CBS News Live Videos'
- _VALID_URL = r'http://(?:www\.)?cbsnews\.com/live/video/(?P<id>[\da-z_-]+)'
+ _VALID_URL = r'https?://(?:www\.)?cbsnews\.com/live/video/(?P<id>[\da-z_-]+)'
_TEST = {
'url': 'http://www.cbsnews.com/live/video/clinton-sanders-prepare-to-face-off-in-nh/',
@@ -122,6 +93,7 @@ class CBSNewsLiveVideoIE(InfoExtractor):
for entry in f4m_formats:
# URLs without the extra param induce an 404 error
entry.update({'extra_param_to_segment_url': hdcore_sign})
+ self._sort_formats(f4m_formats)
return {
'id': video_id,
diff --git a/youtube_dl/extractor/cbssports.py b/youtube_dl/extractor/cbssports.py
index ae47e74..78ca44b 100644
--- a/youtube_dl/extractor/cbssports.py
+++ b/youtube_dl/extractor/cbssports.py
@@ -1,30 +1,28 @@
from __future__ import unicode_literals
-import re
+from .cbs import CBSBaseIE
-from .common import InfoExtractor
+class CBSSportsIE(CBSBaseIE):
+ _VALID_URL = r'https?://www\.cbssports\.com/video/player/[^/]+/(?P<id>\d+)'
-class CBSSportsIE(InfoExtractor):
- _VALID_URL = r'http://www\.cbssports\.com/video/player/(?P<section>[^/]+)/(?P<id>[^/]+)'
-
- _TEST = {
- 'url': 'http://www.cbssports.com/video/player/tennis/318462531970/0/us-open-flashbacks-1990s',
+ _TESTS = [{
+ 'url': 'http://www.cbssports.com/video/player/videos/708337219968/0/ben-simmons-the-next-lebron?-not-so-fast',
'info_dict': {
- 'id': '_d5_GbO8p1sT',
- 'ext': 'flv',
- 'title': 'US Open flashbacks: 1990s',
- 'description': 'Bill Macatee relives the best moments in US Open history from the 1990s.',
+ 'id': '708337219968',
+ 'ext': 'mp4',
+ 'title': 'Ben Simmons the next LeBron? Not so fast',
+ 'description': 'md5:854294f627921baba1f4b9a990d87197',
+ 'timestamp': 1466293740,
+ 'upload_date': '20160618',
+ 'uploader': 'CBSI-NEW',
},
- }
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ }
+ }]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- section = mobj.group('section')
- video_id = mobj.group('id')
- all_videos = self._download_json(
- 'http://www.cbssports.com/data/video/player/getVideos/%s?as=json' % section,
- video_id)
- # The json file contains the info of all the videos in the section
- video_info = next(v for v in all_videos if v['pcid'] == video_id)
- return self.url_result('theplatform:%s' % video_info['pid'], 'ThePlatform')
+ video_id = self._match_id(url)
+ return self._extract_video_info('byId=%s' % video_id, video_id)
diff --git a/youtube_dl/extractor/ccc.py b/youtube_dl/extractor/ccc.py
index dda2c09..8f7f09e 100644
--- a/youtube_dl/extractor/ccc.py
+++ b/youtube_dl/extractor/ccc.py
@@ -1,13 +1,9 @@
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
from ..utils import (
int_or_none,
- parse_duration,
- qualities,
- unified_strdate,
+ parse_iso8601,
)
@@ -19,14 +15,14 @@ class CCCIE(InfoExtractor):
'url': 'https://media.ccc.de/v/30C3_-_5443_-_en_-_saal_g_-_201312281830_-_introduction_to_processor_design_-_byterazor#video',
'md5': '3a1eda8f3a29515d27f5adb967d7e740',
'info_dict': {
- 'id': '30C3_-_5443_-_en_-_saal_g_-_201312281830_-_introduction_to_processor_design_-_byterazor',
+ 'id': '1839',
'ext': 'mp4',
'title': 'Introduction to Processor Design',
- 'description': 'md5:80be298773966f66d56cb11260b879af',
+ 'description': 'md5:df55f6d073d4ceae55aae6f2fd98a0ac',
'thumbnail': 're:^https?://.*\.jpg$',
- 'view_count': int,
'upload_date': '20131228',
- 'duration': 3660,
+ 'timestamp': 1388188800,
+ 'duration': 3710,
}
}, {
'url': 'https://media.ccc.de/v/32c3-7368-shopshifting#download',
@@ -34,79 +30,48 @@ class CCCIE(InfoExtractor):
}]
def _real_extract(self, url):
- video_id = self._match_id(url)
- webpage = self._download_webpage(url, video_id)
-
- if self._downloader.params.get('prefer_free_formats'):
- preference = qualities(['mp3', 'opus', 'mp4-lq', 'webm-lq', 'h264-sd', 'mp4-sd', 'webm-sd', 'mp4', 'webm', 'mp4-hd', 'h264-hd', 'webm-hd'])
- else:
- preference = qualities(['opus', 'mp3', 'webm-lq', 'mp4-lq', 'webm-sd', 'h264-sd', 'mp4-sd', 'webm', 'mp4', 'webm-hd', 'mp4-hd', 'h264-hd'])
-
- title = self._html_search_regex(
- r'(?s)<h1>(.*?)</h1>', webpage, 'title')
- description = self._html_search_regex(
- r'(?s)<h3>About</h3>(.+?)<h3>',
- webpage, 'description', fatal=False)
- upload_date = unified_strdate(self._html_search_regex(
- r"(?s)<span[^>]+class='[^']*fa-calendar-o'[^>]*>(.+?)</span>",
- webpage, 'upload date', fatal=False))
- view_count = int_or_none(self._html_search_regex(
- r"(?s)<span class='[^']*fa-eye'></span>(.*?)</li>",
- webpage, 'view count', fatal=False))
- duration = parse_duration(self._html_search_regex(
- r'(?s)<span[^>]+class=(["\']).*?fa-clock-o.*?\1[^>]*></span>(?P<duration>.+?)</li',
- webpage, 'duration', fatal=False, group='duration'))
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
+ event_id = self._search_regex("data-id='(\d+)'", webpage, 'event id')
+ event_data = self._download_json('https://media.ccc.de/public/events/%s' % event_id, event_id)
- matches = re.finditer(r'''(?xs)
- <(?:span|div)\s+class='label\s+filetype'>(?P<format>[^<]*)</(?:span|div)>\s*
- <(?:span|div)\s+class='label\s+filetype'>(?P<lang>[^<]*)</(?:span|div)>\s*
- <a\s+download\s+href='(?P<http_url>[^']+)'>\s*
- (?:
- .*?
- <a\s+(?:download\s+)?href='(?P<torrent_url>[^']+\.torrent)'
- )?''', webpage)
formats = []
- for m in matches:
- format = m.group('format')
- format_id = self._search_regex(
- r'.*/([a-z0-9_-]+)/[^/]*$',
- m.group('http_url'), 'format id', default=None)
- if format_id:
- format_id = m.group('lang') + '-' + format_id
- vcodec = 'h264' if 'h264' in format_id else (
- 'none' if format_id in ('mp3', 'opus') else None
+ for recording in event_data.get('recordings', []):
+ recording_url = recording.get('recording_url')
+ if not recording_url:
+ continue
+ language = recording.get('language')
+ folder = recording.get('folder')
+ format_id = None
+ if language:
+ format_id = language
+ if folder:
+ if language:
+ format_id += '-' + folder
+ else:
+ format_id = folder
+ vcodec = 'h264' if 'h264' in folder else (
+ 'none' if folder in ('mp3', 'opus') else None
)
formats.append({
'format_id': format_id,
- 'format': format,
- 'language': m.group('lang'),
- 'url': m.group('http_url'),
+ 'url': recording_url,
+ 'width': int_or_none(recording.get('width')),
+ 'height': int_or_none(recording.get('height')),
+ 'filesize': int_or_none(recording.get('size'), invscale=1024 * 1024),
+ 'language': language,
'vcodec': vcodec,
- 'preference': preference(format_id),
})
-
- if m.group('torrent_url'):
- formats.append({
- 'format_id': 'torrent-%s' % (format if format_id is None else format_id),
- 'format': '%s (torrent)' % format,
- 'proto': 'torrent',
- 'format_note': '(unsupported; will just download the .torrent file)',
- 'vcodec': vcodec,
- 'preference': -100 + preference(format_id),
- 'url': m.group('torrent_url'),
- })
self._sort_formats(formats)
- thumbnail = self._html_search_regex(
- r"<video.*?poster='([^']+)'", webpage, 'thumbnail', fatal=False)
-
return {
- 'id': video_id,
- 'title': title,
- 'description': description,
- 'thumbnail': thumbnail,
- 'view_count': view_count,
- 'upload_date': upload_date,
- 'duration': duration,
+ 'id': event_id,
+ 'display_id': display_id,
+ 'title': event_data['title'],
+ 'description': event_data.get('description'),
+ 'thumbnail': event_data.get('thumb_url'),
+ 'timestamp': parse_iso8601(event_data.get('date')),
+ 'duration': int_or_none(event_data.get('length')),
+ 'tags': event_data.get('tags'),
'formats': formats,
}
diff --git a/youtube_dl/extractor/cda.py b/youtube_dl/extractor/cda.py
new file mode 100755
index 0000000..8af3187
--- /dev/null
+++ b/youtube_dl/extractor/cda.py
@@ -0,0 +1,98 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ decode_packed_codes,
+ ExtractorError,
+ parse_duration
+)
+
+
+class CDAIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:(?:www\.)?cda\.pl/video|ebd\.cda\.pl/[0-9]+x[0-9]+)/(?P<id>[0-9a-z]+)'
+ _TESTS = [{
+ 'url': 'http://www.cda.pl/video/5749950c',
+ 'md5': '6f844bf51b15f31fae165365707ae970',
+ 'info_dict': {
+ 'id': '5749950c',
+ 'ext': 'mp4',
+ 'height': 720,
+ 'title': 'Oto dlaczego przed zakrętem należy zwolnić.',
+ 'duration': 39
+ }
+ }, {
+ 'url': 'http://www.cda.pl/video/57413289',
+ 'md5': 'a88828770a8310fc00be6c95faf7f4d5',
+ 'info_dict': {
+ 'id': '57413289',
+ 'ext': 'mp4',
+ 'title': 'Lądowanie na lotnisku na Maderze',
+ 'duration': 137
+ }
+ }, {
+ 'url': 'http://ebd.cda.pl/0x0/5749950c',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage('http://ebd.cda.pl/0x0/' + video_id, video_id)
+
+ if 'Ten film jest dostępny dla użytkowników premium' in webpage:
+ raise ExtractorError('This video is only available for premium users.', expected=True)
+
+ title = self._html_search_regex(r'<title>(.+?)</title>', webpage, 'title')
+
+ formats = []
+
+ info_dict = {
+ 'id': video_id,
+ 'title': title,
+ 'formats': formats,
+ 'duration': None,
+ }
+
+ def extract_format(page, version):
+ unpacked = decode_packed_codes(page)
+ format_url = self._search_regex(
+ r"(?:file|url)\s*:\s*(\\?[\"'])(?P<url>http.+?)\1", unpacked,
+ '%s url' % version, fatal=False, group='url')
+ if not format_url:
+ return
+ f = {
+ 'url': format_url,
+ }
+ m = re.search(
+ r'<a[^>]+data-quality="(?P<format_id>[^"]+)"[^>]+href="[^"]+"[^>]+class="[^"]*quality-btn-active[^"]*">(?P<height>[0-9]+)p',
+ page)
+ if m:
+ f.update({
+ 'format_id': m.group('format_id'),
+ 'height': int(m.group('height')),
+ })
+ info_dict['formats'].append(f)
+ if not info_dict['duration']:
+ info_dict['duration'] = parse_duration(self._search_regex(
+ r"duration\s*:\s*(\\?[\"'])(?P<duration>.+?)\1",
+ unpacked, 'duration', fatal=False, group='duration'))
+
+ extract_format(webpage, 'default')
+
+ for href, resolution in re.findall(
+ r'<a[^>]+data-quality="[^"]+"[^>]+href="([^"]+)"[^>]+class="quality-btn"[^>]*>([0-9]+p)',
+ webpage):
+ webpage = self._download_webpage(
+ href, video_id, 'Downloading %s version information' % resolution, fatal=False)
+ if not webpage:
+ # Manually report warning because empty page is returned when
+ # invalid version is requested.
+ self.report_warning('Unable to download %s version information' % resolution)
+ continue
+ extract_format(webpage, resolution)
+
+ self._sort_formats(formats)
+
+ return info_dict
diff --git a/youtube_dl/extractor/ceskatelevize.py b/youtube_dl/extractor/ceskatelevize.py
index b27b4e6..5a58d17 100644
--- a/youtube_dl/extractor/ceskatelevize.py
+++ b/youtube_dl/extractor/ceskatelevize.py
@@ -5,7 +5,6 @@ import re
from .common import InfoExtractor
from ..compat import (
- compat_urllib_parse,
compat_urllib_parse_unquote,
compat_urllib_parse_urlparse,
)
@@ -13,6 +12,7 @@ from ..utils import (
ExtractorError,
float_or_none,
sanitized_Request,
+ urlencode_postdata,
)
@@ -33,20 +33,34 @@ class CeskaTelevizeIE(InfoExtractor):
'skip_download': True,
},
}, {
- 'url': 'http://www.ceskatelevize.cz/ivysilani/10532695142-prvni-republika/bonus/14716-zpevacka-z-duparny-bobina',
+ 'url': 'http://www.ceskatelevize.cz/ivysilani/10441294653-hyde-park-civilizace/215411058090502/bonus/20641-bonus-01-en',
'info_dict': {
- 'id': '61924494876844374',
+ 'id': '61924494877028507',
'ext': 'mp4',
- 'title': 'První republika: Zpěvačka z Dupárny Bobina',
- 'description': 'Sága mapující atmosféru první republiky od r. 1918 do r. 1945.',
+ 'title': 'Hyde Park Civilizace: Bonus 01 - En',
+ 'description': 'English Subtittles',
'thumbnail': 're:^https?://.*\.jpg',
- 'duration': 88.4,
+ 'duration': 81.3,
},
'params': {
# m3u8 download
'skip_download': True,
},
}, {
+ # live stream
+ 'url': 'http://www.ceskatelevize.cz/ivysilani/zive/ct4/',
+ 'info_dict': {
+ 'id': 402,
+ 'ext': 'mp4',
+ 'title': 're:^ČT Sport \d{4}-\d{2}-\d{2} \d{2}:\d{2}$',
+ 'is_live': True,
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ 'skip': 'Georestricted to Czech Republic',
+ }, {
# video with 18+ caution trailer
'url': 'http://www.ceskatelevize.cz/porady/10520528904-queer/215562210900007-bogotart/',
'info_dict': {
@@ -102,7 +116,7 @@ class CeskaTelevizeIE(InfoExtractor):
req = sanitized_Request(
'http://www.ceskatelevize.cz/ivysilani/ajax/get-client-playlist',
- data=compat_urllib_parse.urlencode(data))
+ data=urlencode_postdata(data))
req.add_header('Content-type', 'application/x-www-form-urlencoded')
req.add_header('x-addr', '127.0.0.1')
@@ -118,18 +132,21 @@ class CeskaTelevizeIE(InfoExtractor):
req = sanitized_Request(compat_urllib_parse_unquote(playlist_url))
req.add_header('Referer', url)
- playlist_title = self._og_search_title(webpage)
- playlist_description = self._og_search_description(webpage)
+ playlist_title = self._og_search_title(webpage, default=None)
+ playlist_description = self._og_search_description(webpage, default=None)
playlist = self._download_json(req, playlist_id)['playlist']
playlist_len = len(playlist)
entries = []
for item in playlist:
+ is_live = item.get('type') == 'LIVE'
formats = []
for format_id, stream_url in item['streamUrls'].items():
formats.extend(self._extract_m3u8_formats(
- stream_url, playlist_id, 'mp4', entry_protocol='m3u8_native'))
+ stream_url, playlist_id, 'mp4',
+ entry_protocol='m3u8' if is_live else 'm3u8_native',
+ fatal=False))
self._sort_formats(formats)
item_id = item.get('id') or item['assetId']
@@ -144,14 +161,22 @@ class CeskaTelevizeIE(InfoExtractor):
if subs:
subtitles = self.extract_subtitles(episode_id, subs)
+ if playlist_len == 1:
+ final_title = playlist_title or title
+ if is_live:
+ final_title = self._live_title(final_title)
+ else:
+ final_title = '%s (%s)' % (playlist_title, title)
+
entries.append({
'id': item_id,
- 'title': playlist_title if playlist_len == 1 else '%s (%s)' % (playlist_title, title),
+ 'title': final_title,
'description': playlist_description if playlist_len == 1 else None,
'thumbnail': thumbnail,
'duration': duration,
'formats': formats,
'subtitles': subtitles,
+ 'is_live': is_live,
})
return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
diff --git a/youtube_dl/extractor/channel9.py b/youtube_dl/extractor/channel9.py
index c74553d..34d4e61 100644
--- a/youtube_dl/extractor/channel9.py
+++ b/youtube_dl/extractor/channel9.py
@@ -20,54 +20,64 @@ class Channel9IE(InfoExtractor):
'''
IE_DESC = 'Channel 9'
IE_NAME = 'channel9'
- _VALID_URL = r'https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+)/?'
-
- _TESTS = [
- {
- 'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
- 'md5': 'bbd75296ba47916b754e73c3a4bbdf10',
- 'info_dict': {
- 'id': 'Events/TechEd/Australia/2013/KOS002',
- 'ext': 'mp4',
- 'title': 'Developer Kick-Off Session: Stuff We Love',
- 'description': 'md5:c08d72240b7c87fcecafe2692f80e35f',
- 'duration': 4576,
- 'thumbnail': 're:http://.*\.jpg',
- 'session_code': 'KOS002',
- 'session_day': 'Day 1',
- 'session_room': 'Arena 1A',
- 'session_speakers': ['Ed Blankenship', 'Andrew Coates', 'Brady Gaster', 'Patrick Klug', 'Mads Kristensen'],
- },
+ _VALID_URL = r'https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+?)(?P<rss>/RSS)?/?(?:[?#&]|$)'
+
+ _TESTS = [{
+ 'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
+ 'md5': 'bbd75296ba47916b754e73c3a4bbdf10',
+ 'info_dict': {
+ 'id': 'Events/TechEd/Australia/2013/KOS002',
+ 'ext': 'mp4',
+ 'title': 'Developer Kick-Off Session: Stuff We Love',
+ 'description': 'md5:c08d72240b7c87fcecafe2692f80e35f',
+ 'duration': 4576,
+ 'thumbnail': 're:http://.*\.jpg',
+ 'session_code': 'KOS002',
+ 'session_day': 'Day 1',
+ 'session_room': 'Arena 1A',
+ 'session_speakers': ['Ed Blankenship', 'Andrew Coates', 'Brady Gaster', 'Patrick Klug',
+ 'Mads Kristensen'],
},
- {
- 'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
- 'md5': 'b43ee4529d111bc37ba7ee4f34813e68',
- 'info_dict': {
- 'id': 'posts/Self-service-BI-with-Power-BI-nuclear-testing',
- 'ext': 'mp4',
- 'title': 'Self-service BI with Power BI - nuclear testing',
- 'description': 'md5:d1e6ecaafa7fb52a2cacdf9599829f5b',
- 'duration': 1540,
- 'thumbnail': 're:http://.*\.jpg',
- 'authors': ['Mike Wilmot'],
- },
+ }, {
+ 'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
+ 'md5': 'b43ee4529d111bc37ba7ee4f34813e68',
+ 'info_dict': {
+ 'id': 'posts/Self-service-BI-with-Power-BI-nuclear-testing',
+ 'ext': 'mp4',
+ 'title': 'Self-service BI with Power BI - nuclear testing',
+ 'description': 'md5:d1e6ecaafa7fb52a2cacdf9599829f5b',
+ 'duration': 1540,
+ 'thumbnail': 're:http://.*\.jpg',
+ 'authors': ['Mike Wilmot'],
},
- {
- # low quality mp4 is best
- 'url': 'https://channel9.msdn.com/Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library',
- 'info_dict': {
- 'id': 'Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library',
- 'ext': 'mp4',
- 'title': 'Ranges for the Standard Library',
- 'description': 'md5:2e6b4917677af3728c5f6d63784c4c5d',
- 'duration': 5646,
- 'thumbnail': 're:http://.*\.jpg',
- },
- 'params': {
- 'skip_download': True,
- },
- }
- ]
+ }, {
+ # low quality mp4 is best
+ 'url': 'https://channel9.msdn.com/Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library',
+ 'info_dict': {
+ 'id': 'Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library',
+ 'ext': 'mp4',
+ 'title': 'Ranges for the Standard Library',
+ 'description': 'md5:2e6b4917677af3728c5f6d63784c4c5d',
+ 'duration': 5646,
+ 'thumbnail': 're:http://.*\.jpg',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'https://channel9.msdn.com/Niners/Splendid22/Queue/76acff796e8f411184b008028e0d492b/RSS',
+ 'info_dict': {
+ 'id': 'Niners/Splendid22/Queue/76acff796e8f411184b008028e0d492b',
+ 'title': 'Channel 9',
+ },
+ 'playlist_count': 2,
+ }, {
+ 'url': 'https://channel9.msdn.com/Events/DEVintersection/DEVintersection-2016/RSS',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://channel9.msdn.com/Events/Speakers/scott-hanselman/RSS?UrlSafeName=scott-hanselman',
+ 'only_matching': True,
+ }]
_RSS_URL = 'http://channel9.msdn.com/%s/RSS'
@@ -254,22 +264,30 @@ class Channel9IE(InfoExtractor):
return self.playlist_result(contents)
- def _extract_list(self, content_path):
- rss = self._download_xml(self._RSS_URL % content_path, content_path, 'Downloading RSS')
+ def _extract_list(self, video_id, rss_url=None):
+ if not rss_url:
+ rss_url = self._RSS_URL % video_id
+ rss = self._download_xml(rss_url, video_id, 'Downloading RSS')
entries = [self.url_result(session_url.text, 'Channel9')
for session_url in rss.findall('./channel/item/link')]
title_text = rss.find('./channel/title').text
- return self.playlist_result(entries, content_path, title_text)
+ return self.playlist_result(entries, video_id, title_text)
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
content_path = mobj.group('contentpath')
+ rss = mobj.group('rss')
+
+ if rss:
+ return self._extract_list(content_path, url)
- webpage = self._download_webpage(url, content_path, 'Downloading web page')
+ webpage = self._download_webpage(
+ url, content_path, 'Downloading web page')
- page_type_m = re.search(r'<meta name="WT.entryid" content="(?P<pagetype>[^:]+)[^"]+"/>', webpage)
- if page_type_m is not None:
- page_type = page_type_m.group('pagetype')
+ page_type = self._search_regex(
+ r'<meta[^>]+name=(["\'])WT\.entryid\1[^>]+content=(["\'])(?P<pagetype>[^:]+).+?\2',
+ webpage, 'page type', default=None, group='pagetype')
+ if page_type:
if page_type == 'Entry': # Any 'item'-like page, may contain downloadable content
return self._extract_entry_item(webpage, content_path)
elif page_type == 'Session': # Event session page, may contain downloadable content
@@ -278,6 +296,5 @@ class Channel9IE(InfoExtractor):
return self._extract_list(content_path)
else:
raise ExtractorError('Unexpected WT.entryid %s' % page_type, expected=True)
-
else: # Assuming list
return self._extract_list(content_path)
diff --git a/youtube_dl/extractor/chaturbate.py b/youtube_dl/extractor/chaturbate.py
index 242fba3..b223454 100644
--- a/youtube_dl/extractor/chaturbate.py
+++ b/youtube_dl/extractor/chaturbate.py
@@ -48,6 +48,7 @@ class ChaturbateIE(InfoExtractor):
raise ExtractorError('Unable to find stream URL')
formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4')
+ self._sort_formats(formats)
return {
'id': video_id,
diff --git a/youtube_dl/extractor/cinemassacre.py b/youtube_dl/extractor/cinemassacre.py
deleted file mode 100644
index 6d9cd8a..0000000
--- a/youtube_dl/extractor/cinemassacre.py
+++ /dev/null
@@ -1,107 +0,0 @@
-# encoding: utf-8
-from __future__ import unicode_literals
-
-import re
-
-from .common import InfoExtractor
-from ..utils import ExtractorError
-from .screenwavemedia import ScreenwaveMediaIE
-
-
-class CinemassacreIE(InfoExtractor):
- _VALID_URL = 'https?://(?:www\.)?cinemassacre\.com/(?P<date_y>[0-9]{4})/(?P<date_m>[0-9]{2})/(?P<date_d>[0-9]{2})/(?P<display_id>[^?#/]+)'
- _TESTS = [
- {
- 'url': 'http://cinemassacre.com/2012/11/10/avgn-the-movie-trailer/',
- 'md5': 'fde81fbafaee331785f58cd6c0d46190',
- 'info_dict': {
- 'id': 'Cinemassacre-19911',
- 'ext': 'mp4',
- 'upload_date': '20121110',
- 'title': '“Angry Video Game Nerd: The Movie” – Trailer',
- 'description': 'md5:fb87405fcb42a331742a0dce2708560b',
- },
- },
- {
- 'url': 'http://cinemassacre.com/2013/10/02/the-mummys-hand-1940',
- 'md5': 'd72f10cd39eac4215048f62ab477a511',
- 'info_dict': {
- 'id': 'Cinemassacre-521be8ef82b16',
- 'ext': 'mp4',
- 'upload_date': '20131002',
- 'title': 'The Mummy’s Hand (1940)',
- },
- },
- {
- # Youtube embedded video
- 'url': 'http://cinemassacre.com/2006/12/07/chronologically-confused-about-bad-movie-and-video-game-sequel-titles/',
- 'md5': 'df4cf8a1dcedaec79a73d96d83b99023',
- 'info_dict': {
- 'id': 'OEVzPCY2T-g',
- 'ext': 'mp4',
- 'title': 'AVGN: Chronologically Confused about Bad Movie and Video Game Sequel Titles',
- 'upload_date': '20061207',
- 'uploader': 'Cinemassacre',
- 'uploader_id': 'JamesNintendoNerd',
- 'description': 'md5:784734696c2b8b7f4b8625cc799e07f6',
- }
- },
- {
- # Youtube embedded video
- 'url': 'http://cinemassacre.com/2006/09/01/mckids/',
- 'md5': '6eb30961fa795fedc750eac4881ad2e1',
- 'info_dict': {
- 'id': 'FnxsNhuikpo',
- 'ext': 'mp4',
- 'upload_date': '20060901',
- 'uploader': 'Cinemassacre Extras',
- 'description': 'md5:de9b751efa9e45fbaafd9c8a1123ed53',
- 'uploader_id': 'Cinemassacre',
- 'title': 'AVGN: McKids',
- }
- },
- {
- 'url': 'http://cinemassacre.com/2015/05/25/mario-kart-64-nintendo-64-james-mike-mondays/',
- 'md5': '1376908e49572389e7b06251a53cdd08',
- 'info_dict': {
- 'id': 'Cinemassacre-555779690c440',
- 'ext': 'mp4',
- 'description': 'Let’s Play Mario Kart 64 !! Mario Kart 64 is a classic go-kart racing game released for the Nintendo 64 (N64). Today James & Mike do 4 player Battle Mode with Kyle and Bootsy!',
- 'title': 'Mario Kart 64 (Nintendo 64) James & Mike Mondays',
- 'upload_date': '20150525',
- }
- }
- ]
-
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- display_id = mobj.group('display_id')
- video_date = mobj.group('date_y') + mobj.group('date_m') + mobj.group('date_d')
-
- webpage = self._download_webpage(url, display_id)
-
- playerdata_url = self._search_regex(
- [
- ScreenwaveMediaIE.EMBED_PATTERN,
- r'<iframe[^>]+src="(?P<url>(?:https?:)?//(?:[^.]+\.)?youtube\.com/.+?)"',
- ],
- webpage, 'player data URL', default=None, group='url')
- if not playerdata_url:
- raise ExtractorError('Unable to find player data')
-
- video_title = self._html_search_regex(
- r'<title>(?P<title>.+?)\|', webpage, 'title')
- video_description = self._html_search_regex(
- r'<div class="entry-content">(?P<description>.+?)</div>',
- webpage, 'description', flags=re.DOTALL, fatal=False)
- video_thumbnail = self._og_search_thumbnail(webpage)
-
- return {
- '_type': 'url_transparent',
- 'display_id': display_id,
- 'title': video_title,
- 'description': video_description,
- 'upload_date': video_date,
- 'thumbnail': video_thumbnail,
- 'url': playerdata_url,
- }
diff --git a/youtube_dl/extractor/cliphunter.py b/youtube_dl/extractor/cliphunter.py
index 2996b6b..19f8b39 100644
--- a/youtube_dl/extractor/cliphunter.py
+++ b/youtube_dl/extractor/cliphunter.py
@@ -19,7 +19,7 @@ def _decode(s):
class CliphunterIE(InfoExtractor):
IE_NAME = 'cliphunter'
- _VALID_URL = r'''(?x)http://(?:www\.)?cliphunter\.com/w/
+ _VALID_URL = r'''(?x)https?://(?:www\.)?cliphunter\.com/w/
(?P<id>[0-9]+)/
(?P<seo>.+?)(?:$|[#\?])
'''
diff --git a/youtube_dl/extractor/cliprs.py b/youtube_dl/extractor/cliprs.py
new file mode 100644
index 0000000..4f9320e
--- /dev/null
+++ b/youtube_dl/extractor/cliprs.py
@@ -0,0 +1,90 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ float_or_none,
+ int_or_none,
+ parse_iso8601,
+)
+
+
+class ClipRsIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?clip\.rs/(?P<id>[^/]+)/\d+'
+ _TEST = {
+ 'url': 'http://www.clip.rs/premijera-frajle-predstavljaju-novi-spot-za-pesmu-moli-me-moli/3732',
+ 'md5': 'c412d57815ba07b56f9edc7b5d6a14e5',
+ 'info_dict': {
+ 'id': '1488842.1399140381',
+ 'ext': 'mp4',
+ 'title': 'PREMIJERA Frajle predstavljaju novi spot za pesmu Moli me, moli',
+ 'description': 'md5:56ce2c3b4ab31c5a2e0b17cb9a453026',
+ 'duration': 229,
+ 'timestamp': 1459850243,
+ 'upload_date': '20160405',
+ }
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ video_id = self._search_regex(
+ r'id=(["\'])mvp:(?P<id>.+?)\1', webpage, 'mvp id', group='id')
+
+ response = self._download_json(
+ 'http://qi.ckm.onetapi.pl/', video_id,
+ query={
+ 'body[id]': video_id,
+ 'body[jsonrpc]': '2.0',
+ 'body[method]': 'get_asset_detail',
+ 'body[params][ID_Publikacji]': video_id,
+ 'body[params][Service]': 'www.onet.pl',
+ 'content-type': 'application/jsonp',
+ 'x-onet-app': 'player.front.onetapi.pl',
+ })
+
+ error = response.get('error')
+ if error:
+ raise ExtractorError(
+ '%s said: %s' % (self.IE_NAME, error['message']), expected=True)
+
+ video = response['result'].get('0')
+
+ formats = []
+ for _, formats_dict in video['formats'].items():
+ if not isinstance(formats_dict, dict):
+ continue
+ for format_id, format_list in formats_dict.items():
+ if not isinstance(format_list, list):
+ continue
+ for f in format_list:
+ if not f.get('url'):
+ continue
+ formats.append({
+ 'url': f['url'],
+ 'format_id': format_id,
+ 'height': int_or_none(f.get('vertical_resolution')),
+ 'width': int_or_none(f.get('horizontal_resolution')),
+ 'abr': float_or_none(f.get('audio_bitrate')),
+ 'vbr': float_or_none(f.get('video_bitrate')),
+ })
+ self._sort_formats(formats)
+
+ meta = video.get('meta', {})
+
+ title = self._og_search_title(webpage, default=None) or meta['title']
+ description = self._og_search_description(webpage, default=None) or meta.get('description')
+ duration = meta.get('length') or meta.get('lenght')
+ timestamp = parse_iso8601(meta.get('addDate'), ' ')
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'duration': duration,
+ 'timestamp': timestamp,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/clipsyndicate.py b/youtube_dl/extractor/clipsyndicate.py
index 8306d6f..0b6ad89 100644
--- a/youtube_dl/extractor/clipsyndicate.py
+++ b/youtube_dl/extractor/clipsyndicate.py
@@ -8,7 +8,7 @@ from ..utils import (
class ClipsyndicateIE(InfoExtractor):
- _VALID_URL = r'http://(?:chic|www)\.clipsyndicate\.com/video/play(list/\d+)?/(?P<id>\d+)'
+ _VALID_URL = r'https?://(?:chic|www)\.clipsyndicate\.com/video/play(list/\d+)?/(?P<id>\d+)'
_TESTS = [{
'url': 'http://www.clipsyndicate.com/video/play/4629301/brick_briscoe',
diff --git a/youtube_dl/extractor/closertotruth.py b/youtube_dl/extractor/closertotruth.py
new file mode 100644
index 0000000..26243d5
--- /dev/null
+++ b/youtube_dl/extractor/closertotruth.py
@@ -0,0 +1,92 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class CloserToTruthIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?closertotruth\.com/(?:[^/]+/)*(?P<id>[^/?#&]+)'
+ _TESTS = [{
+ 'url': 'http://closertotruth.com/series/solutions-the-mind-body-problem#video-3688',
+ 'info_dict': {
+ 'id': '0_zof1ktre',
+ 'display_id': 'solutions-the-mind-body-problem',
+ 'ext': 'mov',
+ 'title': 'Solutions to the Mind-Body Problem?',
+ 'upload_date': '20140221',
+ 'timestamp': 1392956007,
+ 'uploader_id': 'CTTXML'
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'http://closertotruth.com/episodes/how-do-brains-work',
+ 'info_dict': {
+ 'id': '0_iuxai6g6',
+ 'display_id': 'how-do-brains-work',
+ 'ext': 'mov',
+ 'title': 'How do Brains Work?',
+ 'upload_date': '20140221',
+ 'timestamp': 1392956024,
+ 'uploader_id': 'CTTXML'
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'http://closertotruth.com/interviews/1725',
+ 'info_dict': {
+ 'id': '1725',
+ 'title': 'AyaFr-002',
+ },
+ 'playlist_mincount': 2,
+ }]
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, display_id)
+
+ partner_id = self._search_regex(
+ r'<script[^>]+src=["\'].*?\b(?:partner_id|p)/(\d+)',
+ webpage, 'kaltura partner_id')
+
+ title = self._search_regex(
+ r'<title>(.+?)\s*\|\s*.+?</title>', webpage, 'video title')
+
+ select = self._search_regex(
+ r'(?s)<select[^>]+id="select-version"[^>]*>(.+?)</select>',
+ webpage, 'select version', default=None)
+ if select:
+ entry_ids = set()
+ entries = []
+ for mobj in re.finditer(
+ r'<option[^>]+value=(["\'])(?P<id>[0-9a-z_]+)(?:#.+?)?\1[^>]*>(?P<title>[^<]+)',
+ webpage):
+ entry_id = mobj.group('id')
+ if entry_id in entry_ids:
+ continue
+ entry_ids.add(entry_id)
+ entries.append({
+ '_type': 'url_transparent',
+ 'url': 'kaltura:%s:%s' % (partner_id, entry_id),
+ 'ie_key': 'Kaltura',
+ 'title': mobj.group('title'),
+ })
+ if entries:
+ return self.playlist_result(entries, display_id, title)
+
+ entry_id = self._search_regex(
+ r'<a[^>]+id=(["\'])embed-kaltura\1[^>]+data-kaltura=(["\'])(?P<id>[0-9a-z_]+)\2',
+ webpage, 'kaltura entry_id', group='id')
+
+ return {
+ '_type': 'url_transparent',
+ 'display_id': display_id,
+ 'url': 'kaltura:%s:%s' % (partner_id, entry_id),
+ 'ie_key': 'Kaltura',
+ 'title': title
+ }
diff --git a/youtube_dl/extractor/cloudy.py b/youtube_dl/extractor/cloudy.py
index 0fa720e..9a28ef3 100644
--- a/youtube_dl/extractor/cloudy.py
+++ b/youtube_dl/extractor/cloudy.py
@@ -6,7 +6,7 @@ import re
from .common import InfoExtractor
from ..compat import (
compat_parse_qs,
- compat_urllib_parse,
+ compat_urllib_parse_urlencode,
compat_HTTPError,
)
from ..utils import (
@@ -19,7 +19,7 @@ from ..utils import (
class CloudyIE(InfoExtractor):
_IE_DESC = 'cloudy.ec and videoraj.ch'
_VALID_URL = r'''(?x)
- https?://(?:www\.)?(?P<host>cloudy\.ec|videoraj\.ch)/
+ https?://(?:www\.)?(?P<host>cloudy\.ec|videoraj\.(?:ch|to))/
(?:v/|embed\.php\?id=)
(?P<id>[A-Za-z0-9]+)
'''
@@ -37,7 +37,7 @@ class CloudyIE(InfoExtractor):
}
},
{
- 'url': 'http://www.videoraj.ch/v/47f399fd8bb60',
+ 'url': 'http://www.videoraj.to/v/47f399fd8bb60',
'md5': '7d0f8799d91efd4eda26587421c3c3b0',
'info_dict': {
'id': '47f399fd8bb60',
@@ -64,7 +64,7 @@ class CloudyIE(InfoExtractor):
'errorUrl': error_url,
})
- data_url = self._API_URL % (video_host, compat_urllib_parse.urlencode(form))
+ data_url = self._API_URL % (video_host, compat_urllib_parse_urlencode(form))
player_data = self._download_webpage(
data_url, video_id, 'Downloading player data')
data = compat_parse_qs(player_data)
diff --git a/youtube_dl/extractor/clubic.py b/youtube_dl/extractor/clubic.py
index 1dfa7c1..2fba935 100644
--- a/youtube_dl/extractor/clubic.py
+++ b/youtube_dl/extractor/clubic.py
@@ -12,7 +12,7 @@ from ..utils import (
class ClubicIE(InfoExtractor):
- _VALID_URL = r'http://(?:www\.)?clubic\.com/video/(?:[^/]+/)*video.*-(?P<id>[0-9]+)\.html'
+ _VALID_URL = r'https?://(?:www\.)?clubic\.com/video/(?:[^/]+/)*video.*-(?P<id>[0-9]+)\.html'
_TESTS = [{
'url': 'http://www.clubic.com/video/clubic-week/video-clubic-week-2-0-le-fbi-se-lance-dans-la-photo-d-identite-448474.html',
diff --git a/youtube_dl/extractor/cnbc.py b/youtube_dl/extractor/cnbc.py
new file mode 100644
index 0000000..d354d9f
--- /dev/null
+++ b/youtube_dl/extractor/cnbc.py
@@ -0,0 +1,36 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import smuggle_url
+
+
+class CNBCIE(InfoExtractor):
+ _VALID_URL = r'https?://video\.cnbc\.com/gallery/\?video=(?P<id>[0-9]+)'
+ _TEST = {
+ 'url': 'http://video.cnbc.com/gallery/?video=3000503714',
+ 'info_dict': {
+ 'id': '3000503714',
+ 'ext': 'mp4',
+ 'title': 'Fighting zombies is big business',
+ 'description': 'md5:0c100d8e1a7947bd2feec9a5550e519e',
+ 'timestamp': 1459332000,
+ 'upload_date': '20160330',
+ 'uploader': 'NBCU-CNBC',
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ return {
+ '_type': 'url_transparent',
+ 'ie_key': 'ThePlatform',
+ 'url': smuggle_url(
+ 'http://link.theplatform.com/s/gZWlPC/media/guid/2408950221/%s?mbr=true&manifest=m3u' % video_id,
+ {'force_smil_url': True}),
+ 'id': video_id,
+ }
diff --git a/youtube_dl/extractor/collegehumor.py b/youtube_dl/extractor/collegehumor.py
deleted file mode 100644
index 002b240..0000000
--- a/youtube_dl/extractor/collegehumor.py
+++ /dev/null
@@ -1,101 +0,0 @@
-from __future__ import unicode_literals
-
-import json
-import re
-
-from .common import InfoExtractor
-from ..utils import int_or_none
-
-
-class CollegeHumorIE(InfoExtractor):
- _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/(video|embed|e)/(?P<videoid>[0-9]+)/?(?P<shorttitle>.*)$'
-
- _TESTS = [
- {
- 'url': 'http://www.collegehumor.com/video/6902724/comic-con-cosplay-catastrophe',
- 'md5': 'dcc0f5c1c8be98dc33889a191f4c26bd',
- 'info_dict': {
- 'id': '6902724',
- 'ext': 'mp4',
- 'title': 'Comic-Con Cosplay Catastrophe',
- 'description': "Fans get creative this year at San Diego. Too creative. And yes, that's really Joss Whedon.",
- 'age_limit': 13,
- 'duration': 187,
- },
- }, {
- 'url': 'http://www.collegehumor.com/video/3505939/font-conference',
- 'md5': '72fa701d8ef38664a4dbb9e2ab721816',
- 'info_dict': {
- 'id': '3505939',
- 'ext': 'mp4',
- 'title': 'Font Conference',
- 'description': "This video wasn't long enough, so we made it double-spaced.",
- 'age_limit': 10,
- 'duration': 179,
- },
- }, {
- # embedded youtube video
- 'url': 'http://www.collegehumor.com/embed/6950306',
- 'info_dict': {
- 'id': 'Z-bao9fg6Yc',
- 'ext': 'mp4',
- 'title': 'Young Americans Think President John F. Kennedy Died THIS MORNING IN A CAR ACCIDENT!!!',
- 'uploader': 'Mark Dice',
- 'uploader_id': 'MarkDice',
- 'description': 'md5:62c3dab9351fac7bb44b53b69511d87f',
- 'upload_date': '20140127',
- },
- 'params': {
- 'skip_download': True,
- },
- 'add_ie': ['Youtube'],
- },
- ]
-
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('videoid')
-
- jsonUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id + '.json'
- data = json.loads(self._download_webpage(
- jsonUrl, video_id, 'Downloading info JSON'))
- vdata = data['video']
- if vdata.get('youtubeId') is not None:
- return {
- '_type': 'url',
- 'url': vdata['youtubeId'],
- 'ie_key': 'Youtube',
- }
-
- AGE_LIMITS = {'nc17': 18, 'r': 18, 'pg13': 13, 'pg': 10, 'g': 0}
- rating = vdata.get('rating')
- if rating:
- age_limit = AGE_LIMITS.get(rating.lower())
- else:
- age_limit = None # None = No idea
-
- PREFS = {'high_quality': 2, 'low_quality': 0}
- formats = []
- for format_key in ('mp4', 'webm'):
- for qname, qurl in vdata.get(format_key, {}).items():
- formats.append({
- 'format_id': format_key + '_' + qname,
- 'url': qurl,
- 'format': format_key,
- 'preference': PREFS.get(qname),
- })
- self._sort_formats(formats)
-
- duration = int_or_none(vdata.get('duration'), 1000)
- like_count = int_or_none(vdata.get('likes'))
-
- return {
- 'id': video_id,
- 'title': vdata['title'],
- 'description': vdata.get('description'),
- 'thumbnail': vdata.get('thumbnail'),
- 'formats': formats,
- 'age_limit': age_limit,
- 'duration': duration,
- 'like_count': like_count,
- }
diff --git a/youtube_dl/extractor/comcarcoff.py b/youtube_dl/extractor/comcarcoff.py
index 7dff684..747c245 100644
--- a/youtube_dl/extractor/comcarcoff.py
+++ b/youtube_dl/extractor/comcarcoff.py
@@ -11,7 +11,7 @@ from ..utils import (
class ComCarCoffIE(InfoExtractor):
- _VALID_URL = r'http://(?:www\.)?comediansincarsgettingcoffee\.com/(?P<id>[a-z0-9\-]*)'
+ _VALID_URL = r'https?://(?:www\.)?comediansincarsgettingcoffee\.com/(?P<id>[a-z0-9\-]*)'
_TESTS = [{
'url': 'http://comediansincarsgettingcoffee.com/miranda-sings-happy-thanksgiving-miranda/',
'info_dict': {
@@ -41,7 +41,13 @@ class ComCarCoffIE(InfoExtractor):
display_id = full_data['activeVideo']['video']
video_data = full_data.get('videos', {}).get(display_id) or full_data['singleshots'][display_id]
+
video_id = compat_str(video_data['mediaId'])
+ title = video_data['title']
+ formats = self._extract_m3u8_formats(
+ video_data['mediaUrl'], video_id, 'mp4')
+ self._sort_formats(formats)
+
thumbnails = [{
'url': video_data['images']['thumb'],
}, {
@@ -54,15 +60,14 @@ class ComCarCoffIE(InfoExtractor):
video_data.get('duration'))
return {
- '_type': 'url_transparent',
- 'url': 'crackle:%s' % video_id,
'id': video_id,
'display_id': display_id,
- 'title': video_data['title'],
+ 'title': title,
'description': video_data.get('description'),
'timestamp': timestamp,
'duration': duration,
'thumbnails': thumbnails,
+ 'formats': formats,
'season_number': int_or_none(video_data.get('season')),
'episode_number': int_or_none(video_data.get('episode')),
'webpage_url': 'http://comediansincarsgettingcoffee.com/%s' % (video_data.get('urlSlug', video_data.get('slug'))),
diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py
index 5b1b996..2b6aaa3 100644
--- a/youtube_dl/extractor/comedycentral.py
+++ b/youtube_dl/extractor/comedycentral.py
@@ -5,7 +5,7 @@ import re
from .mtv import MTVServicesInfoExtractor
from ..compat import (
compat_str,
- compat_urllib_parse,
+ compat_urllib_parse_urlencode,
)
from ..utils import (
ExtractorError,
@@ -44,10 +44,10 @@ class ComedyCentralShowsIE(MTVServicesInfoExtractor):
# or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
_VALID_URL = r'''(?x)^(:(?P<shortname>tds|thedailyshow)
|https?://(:www\.)?
- (?P<showname>thedailyshow|thecolbertreport)\.(?:cc\.)?com/
+ (?P<showname>thedailyshow|thecolbertreport|tosh)\.(?:cc\.)?com/
((?:full-)?episodes/(?:[0-9a-z]{6}/)?(?P<episode>.*)|
(?P<clip>
- (?:(?:guests/[^/]+|videos|video-playlists|special-editions|news-team/[^/]+)/[^/]+/(?P<videotitle>[^/?#]+))
+ (?:(?:guests/[^/]+|videos|video-(?:clips|playlists)|special-editions|news-team/[^/]+)/[^/]+/(?P<videotitle>[^/?#]+))
|(the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
|(watch/(?P<date>[^/]*)/(?P<tdstitle>.*))
)|
@@ -129,6 +129,9 @@ class ComedyCentralShowsIE(MTVServicesInfoExtractor):
}, {
'url': 'http://thedailyshow.cc.com/news-team/michael-che/7wnfel/we-need-to-talk-about-israel',
'only_matching': True,
+ }, {
+ 'url': 'http://tosh.cc.com/video-clips/68g93d/twitter-users-share-summer-plans',
+ 'only_matching': True,
}]
_available_formats = ['3500', '2200', '1700', '1200', '750', '400']
@@ -201,7 +204,7 @@ class ComedyCentralShowsIE(MTVServicesInfoExtractor):
# Correct cc.com in uri
uri = re.sub(r'(episode:[^.]+)(\.cc)?\.com', r'\1.com', uri)
- index_url = 'http://%s.cc.com/feeds/mrss?%s' % (show_name, compat_urllib_parse.urlencode({'uri': uri}))
+ index_url = 'http://%s.cc.com/feeds/mrss?%s' % (show_name, compat_urllib_parse_urlencode({'uri': uri}))
idoc = self._download_xml(
index_url, epTitle,
'Downloading show index', 'Unable to download episode index')
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index 14f5756..5a2603b 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -15,14 +15,17 @@ import math
from ..compat import (
compat_cookiejar,
compat_cookies,
+ compat_etree_fromstring,
compat_getpass,
compat_http_client,
+ compat_os_name,
+ compat_str,
compat_urllib_error,
- compat_urllib_parse,
+ compat_urllib_parse_urlencode,
+ compat_urllib_request,
compat_urlparse,
- compat_str,
- compat_etree_fromstring,
)
+from ..downloader.f4m import remove_encrypted_media
from ..utils import (
NO_DEFAULT,
age_restricted,
@@ -42,11 +45,15 @@ from ..utils import (
unescapeHTML,
unified_strdate,
url_basename,
+ xpath_element,
xpath_text,
xpath_with_ns,
determine_protocol,
parse_duration,
mimetype2ext,
+ update_Request,
+ update_url_query,
+ parse_m3u8_attributes,
)
@@ -104,7 +111,7 @@ class InfoExtractor(object):
* protocol The protocol that will be used for the actual
download, lower-case.
"http", "https", "rtsp", "rtmp", "rtmpe",
- "m3u8", or "m3u8_native".
+ "m3u8", "m3u8_native" or "http_dash_segments".
* preference Order number of this format. If this field is
present and not None, the formats get sorted
by this field, regardless of all other values.
@@ -157,12 +164,14 @@ class InfoExtractor(object):
thumbnail: Full URL to a video thumbnail image.
description: Full video description.
uploader: Full name of the video uploader.
- creator: The main artist who created the video.
+ license: License name the video is licensed under.
+ creator: The creator of the video.
release_date: The date (YYYYMMDD) when the video was released.
timestamp: UNIX timestamp of the moment the video became available.
upload_date: Video upload date (YYYYMMDD).
If not explicitly set, calculated from timestamp.
uploader_id: Nickname or id of the video uploader.
+ uploader_url: Full URL to a personal webpage of the video uploader.
location: Physical location where the video was filmed.
subtitles: The available subtitles as a dictionary in the format
{language: subformats}. "subformats" is a list sorted from
@@ -225,6 +234,24 @@ class InfoExtractor(object):
episode_number: Number of the video episode within a season, as an integer.
episode_id: Id of the video episode, as a unicode string.
+ The following fields should only be used when the media is a track or a part of
+ a music album:
+
+ track: Title of the track.
+ track_number: Number of the track within an album or a disc, as an integer.
+ track_id: Id of the track (useful in case of custom indexing, e.g. 6.iii),
+ as a unicode string.
+ artist: Artist(s) of the track.
+ genre: Genre(s) of the track.
+ album: Title of the album the track belongs to.
+ album_type: Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
+ album_artist: List of all artists appeared on the album (e.g.
+ "Ash Borer / Fell Voices" or "Various Artists", useful for splits
+ and compilations).
+ disc_number: Number of the disc or other physical medium the track belongs to,
+ as an integer.
+ release_year: Year (YYYY) when the album was released.
+
Unless mentioned otherwise, the fields should be Unicode strings.
Unless mentioned otherwise, None is equivalent to absence of information.
@@ -342,7 +369,7 @@ class InfoExtractor(object):
def IE_NAME(self):
return compat_str(type(self).__name__[:-2])
- def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
+ def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
""" Returns the response handle """
if note is None:
self.report_download_webpage(video_id)
@@ -351,6 +378,14 @@ class InfoExtractor(object):
self.to_screen('%s' % (note,))
else:
self.to_screen('%s: %s' % (video_id, note))
+ if isinstance(url_or_request, compat_urllib_request.Request):
+ url_or_request = update_Request(
+ url_or_request, data=data, headers=headers, query=query)
+ else:
+ if query:
+ url_or_request = update_url_query(url_or_request, query)
+ if data is not None or headers:
+ url_or_request = sanitized_Request(url_or_request, data, headers)
try:
return self._downloader.urlopen(url_or_request)
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
@@ -366,13 +401,13 @@ class InfoExtractor(object):
self._downloader.report_warning(errmsg)
return False
- def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None):
+ def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}):
""" Returns a tuple (page content as string, URL handle) """
# Strip hashes from the URL (#1038)
if isinstance(url_or_request, (compat_str, str)):
url_or_request = url_or_request.partition('#')[0]
- urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
+ urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query)
if urlh is False:
assert not fatal
return False
@@ -425,7 +460,7 @@ class InfoExtractor(object):
self.to_screen('Saving request to ' + filename)
# Working around MAX_PATH limitation on Windows (see
# http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
- if os.name == 'nt':
+ if compat_os_name == 'nt':
absfilepath = os.path.abspath(filename)
if len(absfilepath) > 259:
filename = '\\\\?\\' + absfilepath
@@ -459,13 +494,13 @@ class InfoExtractor(object):
return content
- def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None):
+ def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers={}, query={}):
""" Returns the data of the page as a string """
success = False
try_count = 0
while success is False:
try:
- res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding)
+ res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data, headers=headers, query=query)
success = True
except compat_http_client.IncompleteRead as e:
try_count += 1
@@ -480,10 +515,10 @@ class InfoExtractor(object):
def _download_xml(self, url_or_request, video_id,
note='Downloading XML', errnote='Unable to download XML',
- transform_source=None, fatal=True, encoding=None):
+ transform_source=None, fatal=True, encoding=None, data=None, headers={}, query={}):
"""Return the xml as an xml.etree.ElementTree.Element"""
xml_string = self._download_webpage(
- url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding)
+ url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding, data=data, headers=headers, query=query)
if xml_string is False:
return xml_string
if transform_source:
@@ -494,10 +529,10 @@ class InfoExtractor(object):
note='Downloading JSON metadata',
errnote='Unable to download JSON metadata',
transform_source=None,
- fatal=True, encoding=None):
+ fatal=True, encoding=None, data=None, headers={}, query={}):
json_string = self._download_webpage(
url_or_request, video_id, note, errnote, fatal=fatal,
- encoding=encoding)
+ encoding=encoding, data=data, headers=headers, query=query)
if (not fatal) and json_string is False:
return None
return self._parse_json(
@@ -594,7 +629,7 @@ class InfoExtractor(object):
if mobj:
break
- if not self._downloader.params.get('no_color') and os.name != 'nt' and sys.stderr.isatty():
+ if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
_name = '\033[0;34m%s\033[0m' % name
else:
_name = name
@@ -809,7 +844,7 @@ class InfoExtractor(object):
for input in re.findall(r'(?i)<input([^>]+)>', html):
if not re.search(r'type=(["\'])(?:hidden|submit)\1', input):
continue
- name = re.search(r'name=(["\'])(?P<value>.+?)\1', input)
+ name = re.search(r'(?:name|id)=(["\'])(?P<value>.+?)\1', input)
if not name:
continue
value = re.search(r'value=(["\'])(?P<value>.*?)\1', input)
@@ -852,6 +887,7 @@ class InfoExtractor(object):
proto_preference = 0 if determine_protocol(f) in ['http', 'https'] else -0.1
if f.get('vcodec') == 'none': # audio only
+ preference -= 50
if self._downloader.params.get('prefer_free_formats'):
ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
else:
@@ -862,6 +898,8 @@ class InfoExtractor(object):
except ValueError:
audio_ext_preference = -1
else:
+ if f.get('acodec') == 'none': # video only
+ preference -= 40
if self._downloader.params.get('prefer_free_formats'):
ORDER = ['flv', 'mp4', 'webm']
else:
@@ -951,7 +989,7 @@ class InfoExtractor(object):
def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
transform_source=lambda s: fix_xml_ampersands(s).strip(),
- fatal=True):
+ fatal=True, m3u8_id=None):
manifest = self._download_xml(
manifest_url, video_id, 'Downloading f4m manifest',
'Unable to download f4m manifest',
@@ -963,20 +1001,56 @@ class InfoExtractor(object):
if manifest is False:
return []
+ return self._parse_f4m_formats(
+ manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
+ transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
+
+ def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
+ transform_source=lambda s: fix_xml_ampersands(s).strip(),
+ fatal=True, m3u8_id=None):
+ # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
+ akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
+ if akamai_pv is not None and ';' in akamai_pv.text:
+ playerVerificationChallenge = akamai_pv.text.split(';')[0]
+ if playerVerificationChallenge.strip() != '':
+ return []
+
formats = []
manifest_version = '1.0'
media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
if not media_nodes:
manifest_version = '2.0'
media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
+ # Remove unsupported DRM protected media from final formats
+ # rendition (see https://github.com/rg3/youtube-dl/issues/8573).
+ media_nodes = remove_encrypted_media(media_nodes)
+ if not media_nodes:
+ return formats
base_url = xpath_text(
manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
'base URL', default=None)
if base_url:
base_url = base_url.strip()
+
+ bootstrap_info = xpath_element(
+ manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
+ 'bootstrap info', default=None)
+
for i, media_el in enumerate(media_nodes):
- if manifest_version == '2.0':
- media_url = media_el.attrib.get('href') or media_el.attrib.get('url')
+ tbr = int_or_none(media_el.attrib.get('bitrate'))
+ width = int_or_none(media_el.attrib.get('width'))
+ height = int_or_none(media_el.attrib.get('height'))
+ format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
+ # If <bootstrapInfo> is present, the specified f4m is a
+ # stream-level manifest, and only set-level manifests may refer to
+ # external resources. See section 11.4 and section 4 of F4M spec
+ if bootstrap_info is None:
+ media_url = None
+ # @href is introduced in 2.0, see section 11.6 of F4M spec
+ if manifest_version == '2.0':
+ media_url = media_el.attrib.get('href')
+ if media_url is None:
+ media_url = media_el.attrib.get('url')
if not media_url:
continue
manifest_url = (
@@ -986,30 +1060,43 @@ class InfoExtractor(object):
# since bitrates in parent manifest (this one) and media_url manifest
# may differ leading to inability to resolve the format by requested
# bitrate in f4m downloader
- if determine_ext(manifest_url) == 'f4m':
- formats.extend(self._extract_f4m_formats(
- manifest_url, video_id, preference, f4m_id, fatal=fatal))
+ ext = determine_ext(manifest_url)
+ if ext == 'f4m':
+ f4m_formats = self._extract_f4m_formats(
+ manifest_url, video_id, preference=preference, f4m_id=f4m_id,
+ transform_source=transform_source, fatal=fatal)
+ # Sometimes stream-level manifest contains single media entry that
+ # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
+ # At the same time parent's media entry in set-level manifest may
+ # contain it. We will copy it from parent in such cases.
+ if len(f4m_formats) == 1:
+ f = f4m_formats[0]
+ f.update({
+ 'tbr': f.get('tbr') or tbr,
+ 'width': f.get('width') or width,
+ 'height': f.get('height') or height,
+ 'format_id': f.get('format_id') if not tbr else format_id,
+ })
+ formats.extend(f4m_formats)
+ continue
+ elif ext == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ manifest_url, video_id, 'mp4', preference=preference,
+ m3u8_id=m3u8_id, fatal=fatal))
continue
- tbr = int_or_none(media_el.attrib.get('bitrate'))
formats.append({
- 'format_id': '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])),
+ 'format_id': format_id,
'url': manifest_url,
- 'ext': 'flv',
+ 'ext': 'flv' if bootstrap_info is not None else None,
'tbr': tbr,
- 'width': int_or_none(media_el.attrib.get('width')),
- 'height': int_or_none(media_el.attrib.get('height')),
+ 'width': width,
+ 'height': height,
'preference': preference,
})
- self._sort_formats(formats)
-
return formats
- def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
- entry_protocol='m3u8', preference=None,
- m3u8_id=None, note=None, errnote=None,
- fatal=True):
-
- formats = [{
+ def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None):
+ return {
'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
'url': m3u8_url,
'ext': ext,
@@ -1017,7 +1104,14 @@ class InfoExtractor(object):
'preference': preference - 1 if preference else -1,
'resolution': 'multiple',
'format_note': 'Quality selection URL',
- }]
+ }
+
+ def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
+ entry_protocol='m3u8', preference=None,
+ m3u8_id=None, note=None, errnote=None,
+ fatal=True, live=False):
+
+ formats = [self._m3u8_meta_format(m3u8_url, ext, preference, m3u8_id)]
format_url = lambda u: (
u
@@ -1033,11 +1127,21 @@ class InfoExtractor(object):
return []
m3u8_doc, urlh = res
m3u8_url = urlh.geturl()
- # A Media Playlist Tag MUST NOT appear in a Master Playlist
- # https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3
- # The EXT-X-TARGETDURATION tag is REQUIRED for every M3U8 Media Playlists
- # https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1
- if '#EXT-X-TARGETDURATION' in m3u8_doc:
+
+ # We should try extracting formats only from master playlists [1], i.e.
+ # playlists that describe available qualities. On the other hand media
+ # playlists [2] should be returned as is since they contain just the media
+ # without qualities renditions.
+ # Fortunately, master playlist can be easily distinguished from media
+ # playlist based on particular tags availability. As of [1, 2] master
+ # playlist tags MUST NOT appear in a media playist and vice versa.
+ # As of [3] #EXT-X-TARGETDURATION tag is REQUIRED for every media playlist
+ # and MUST NOT appear in master playlist thus we can clearly detect media
+ # playlist with this criterion.
+ # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.4
+ # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3
+ # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1
+ if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is
return [{
'url': m3u8_url,
'format_id': m3u8_id,
@@ -1047,23 +1151,11 @@ class InfoExtractor(object):
}]
last_info = None
last_media = None
- kv_rex = re.compile(
- r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
for line in m3u8_doc.splitlines():
if line.startswith('#EXT-X-STREAM-INF:'):
- last_info = {}
- for m in kv_rex.finditer(line):
- v = m.group('val')
- if v.startswith('"'):
- v = v[1:-1]
- last_info[m.group('key')] = v
+ last_info = parse_m3u8_attributes(line)
elif line.startswith('#EXT-X-MEDIA:'):
- last_media = {}
- for m in kv_rex.finditer(line):
- v = m.group('val')
- if v.startswith('"'):
- v = v[1:-1]
- last_media[m.group('key')] = v
+ last_media = parse_m3u8_attributes(line)
elif line.startswith('#') or not line.strip():
continue
else:
@@ -1074,8 +1166,15 @@ class InfoExtractor(object):
format_id = []
if m3u8_id:
format_id.append(m3u8_id)
- last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') != 'SUBTITLES' else None
- format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats)))
+ last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') not in ('SUBTITLES', 'CLOSED-CAPTIONS') else None
+ # Despite specification does not mention NAME attribute for
+ # EXT-X-STREAM-INF it still sometimes may be present
+ stream_name = last_info.get('NAME') or last_media_name
+ # Bandwidth of live streams may differ over time thus making
+ # format_id unpredictable. So it's better to keep provided
+ # format_id intact.
+ if not live:
+ format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
f = {
'format_id': '-'.join(format_id),
'url': format_url(line.strip()),
@@ -1084,25 +1183,34 @@ class InfoExtractor(object):
'protocol': entry_protocol,
'preference': preference,
}
- codecs = last_info.get('CODECS')
- if codecs:
- # TODO: looks like video codec is not always necessarily goes first
- va_codecs = codecs.split(',')
- if va_codecs[0]:
- f['vcodec'] = va_codecs[0]
- if len(va_codecs) > 1 and va_codecs[1]:
- f['acodec'] = va_codecs[1]
resolution = last_info.get('RESOLUTION')
if resolution:
width_str, height_str = resolution.split('x')
f['width'] = int(width_str)
f['height'] = int(height_str)
+ codecs = last_info.get('CODECS')
+ if codecs:
+ vcodec, acodec = [None] * 2
+ va_codecs = codecs.split(',')
+ if len(va_codecs) == 1:
+ # Audio only entries usually come with single codec and
+ # no resolution. For more robustness we also check it to
+ # be mp4 audio.
+ if not resolution and va_codecs[0].startswith('mp4a'):
+ vcodec, acodec = 'none', va_codecs[0]
+ else:
+ vcodec = va_codecs[0]
+ else:
+ vcodec, acodec = va_codecs[:2]
+ f.update({
+ 'acodec': acodec,
+ 'vcodec': vcodec,
+ })
if last_media is not None:
f['m3u8_media'] = last_media
last_media = None
formats.append(f)
last_info = {}
- self._sort_formats(formats)
return formats
@staticmethod
@@ -1117,8 +1225,8 @@ class InfoExtractor(object):
out.append('{%s}%s' % (namespace, c))
return '/'.join(out)
- def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None):
- smil = self._download_smil(smil_url, video_id, fatal=fatal)
+ def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
+ smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
if smil is False:
assert not fatal
@@ -1135,10 +1243,10 @@ class InfoExtractor(object):
return {}
return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
- def _download_smil(self, smil_url, video_id, fatal=True):
+ def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
return self._download_xml(
smil_url, video_id, 'Downloading SMIL file',
- 'Unable to download SMIL file', fatal=fatal)
+ 'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
namespace = self._parse_smil_namespace(smil)
@@ -1198,21 +1306,21 @@ class InfoExtractor(object):
m3u8_count = 0
srcs = []
- videos = smil.findall(self._xpath_ns('.//video', namespace))
- for video in videos:
- src = video.get('src')
+ media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
+ for medium in media:
+ src = medium.get('src')
if not src or src in srcs:
continue
srcs.append(src)
- bitrate = float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
- filesize = int_or_none(video.get('size') or video.get('fileSize'))
- width = int_or_none(video.get('width'))
- height = int_or_none(video.get('height'))
- proto = video.get('proto')
- ext = video.get('ext')
+ bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
+ filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
+ width = int_or_none(medium.get('width'))
+ height = int_or_none(medium.get('height'))
+ proto = medium.get('proto')
+ ext = medium.get('ext')
src_ext = determine_ext(src)
- streamer = video.get('streamer') or base
+ streamer = medium.get('streamer') or base
if proto == 'rtmp' or streamer.startswith('rtmp'):
rtmp_count += 1
@@ -1259,7 +1367,7 @@ class InfoExtractor(object):
'plugin': 'flowplayer-3.2.0.1',
}
f4m_url += '&' if '?' in f4m_url else '?'
- f4m_url += compat_urllib_parse.urlencode(f4m_params)
+ f4m_url += compat_urllib_parse_urlencode(f4m_params)
formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
continue
@@ -1276,8 +1384,6 @@ class InfoExtractor(object):
})
continue
- self._sort_formats(formats)
-
return formats
def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
@@ -1288,7 +1394,7 @@ class InfoExtractor(object):
if not src or src in urls:
continue
urls.append(src)
- ext = textstream.get('ext') or determine_ext(src) or mimetype2ext(textstream.get('type'))
+ ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
subtitles.setdefault(lang, []).append({
'url': src,
@@ -1424,8 +1530,9 @@ class InfoExtractor(object):
continue
representation_attrib = adaptation_set.attrib.copy()
representation_attrib.update(representation.attrib)
- mime_type = representation_attrib.get('mimeType')
- content_type = mime_type.split('/')[0] if mime_type else representation_attrib.get('contentType')
+ # According to page 41 of ISO/IEC 29001-1:2014, @mimeType is mandatory
+ mime_type = representation_attrib['mimeType']
+ content_type = mime_type.split('/')[0]
if content_type == 'text':
# TODO implement WebVTT downloading
pass
@@ -1448,6 +1555,7 @@ class InfoExtractor(object):
f = {
'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
'url': base_url,
+ 'ext': mimetype2ext(mime_type),
'width': int_or_none(representation_attrib.get('width')),
'height': int_or_none(representation_attrib.get('height')),
'tbr': int_or_none(representation_attrib.get('bandwidth'), 1000),
@@ -1466,9 +1574,16 @@ class InfoExtractor(object):
representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
media_template = representation_ms_info['media_template']
media_template = media_template.replace('$RepresentationID$', representation_id)
- media_template = re.sub(r'\$(Number|Bandwidth)(?:%(0\d+)d)?\$', r'%(\1)\2d', media_template)
+ media_template = re.sub(r'\$(Number|Bandwidth)\$', r'%(\1)d', media_template)
+ media_template = re.sub(r'\$(Number|Bandwidth)%([^$]+)\$', r'%(\1)\2', media_template)
media_template.replace('$$', '$')
- representation_ms_info['segment_urls'] = [media_template % {'Number': segment_number, 'Bandwidth': representation_attrib.get('bandwidth')} for segment_number in range(representation_ms_info['start_number'], representation_ms_info['total_number'] + representation_ms_info['start_number'])]
+ representation_ms_info['segment_urls'] = [
+ media_template % {
+ 'Number': segment_number,
+ 'Bandwidth': representation_attrib.get('bandwidth')}
+ for segment_number in range(
+ representation_ms_info['start_number'],
+ representation_ms_info['total_number'] + representation_ms_info['start_number'])]
if 'segment_urls' in representation_ms_info:
f.update({
'segment_urls': representation_ms_info['segment_urls'],
@@ -1493,7 +1608,6 @@ class InfoExtractor(object):
existing_format.update(f)
else:
self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
- self._sort_formats(formats)
return formats
def _live_title(self, name):
@@ -1600,6 +1714,15 @@ class InfoExtractor(object):
def _get_automatic_captions(self, *args, **kwargs):
raise NotImplementedError('This method must be implemented by subclasses')
+ def mark_watched(self, *args, **kwargs):
+ if (self._downloader.params.get('mark_watched', False) and
+ (self._get_login_info()[0] is not None or
+ self._downloader.params.get('cookiefile') is not None)):
+ self._mark_watched(*args, **kwargs)
+
+ def _mark_watched(self, *args, **kwargs):
+ raise NotImplementedError('This method must be implemented by subclasses')
+
class SearchInfoExtractor(InfoExtractor):
"""
diff --git a/youtube_dl/extractor/commonprotocols.py b/youtube_dl/extractor/commonprotocols.py
new file mode 100644
index 0000000..5d130a1
--- /dev/null
+++ b/youtube_dl/extractor/commonprotocols.py
@@ -0,0 +1,36 @@
+from __future__ import unicode_literals
+
+import os
+
+from .common import InfoExtractor
+from ..compat import (
+ compat_urllib_parse_unquote,
+ compat_urlparse,
+)
+from ..utils import url_basename
+
+
+class RtmpIE(InfoExtractor):
+ IE_DESC = False # Do not list
+ _VALID_URL = r'(?i)rtmp[est]?://.+'
+
+ _TESTS = [{
+ 'url': 'rtmp://cp44293.edgefcs.net/ondemand?auth=daEcTdydfdqcsb8cZcDbAaCbhamacbbawaS-bw7dBb-bWG-GqpGFqCpNCnGoyL&aifp=v001&slist=public/unsecure/audio/2c97899446428e4301471a8cb72b4b97--audio--pmg-20110908-0900a_flv_aac_med_int.mp4',
+ 'only_matching': True,
+ }, {
+ 'url': 'rtmp://edge.live.hitbox.tv/live/dimak',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
+ title = compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'formats': [{
+ 'url': url,
+ 'ext': 'flv',
+ 'format_id': compat_urlparse.urlparse(url).scheme,
+ }],
+ }
diff --git a/youtube_dl/extractor/condenast.py b/youtube_dl/extractor/condenast.py
index 6f92ae2..e8f2b5a 100644
--- a/youtube_dl/extractor/condenast.py
+++ b/youtube_dl/extractor/condenast.py
@@ -5,7 +5,7 @@ import re
from .common import InfoExtractor
from ..compat import (
- compat_urllib_parse,
+ compat_urllib_parse_urlencode,
compat_urllib_parse_urlparse,
compat_urlparse,
)
@@ -45,7 +45,7 @@ class CondeNastIE(InfoExtractor):
'wmagazine': 'W Magazine',
}
- _VALID_URL = r'http://(?:video|www|player)\.(?P<site>%s)\.com/(?P<type>watch|series|video|embed(?:js)?)/(?P<id>[^/?#]+)' % '|'.join(_SITES.keys())
+ _VALID_URL = r'https?://(?:video|www|player)\.(?P<site>%s)\.com/(?P<type>watch|series|video|embed(?:js)?)/(?P<id>[^/?#]+)' % '|'.join(_SITES.keys())
IE_DESC = 'Condé Nast media group: %s' % ', '.join(sorted(_SITES.values()))
EMBED_URL = r'(?:https?:)?//player\.(?P<site>%s)\.com/(?P<type>embed(?:js)?)/.+?' % '|'.join(_SITES.keys())
@@ -97,7 +97,7 @@ class CondeNastIE(InfoExtractor):
video_id = self._search_regex(r'videoId: [\'"](.+?)[\'"]', params, 'video id')
player_id = self._search_regex(r'playerId: [\'"](.+?)[\'"]', params, 'player id')
target = self._search_regex(r'target: [\'"](.+?)[\'"]', params, 'target')
- data = compat_urllib_parse.urlencode({'videoId': video_id,
+ data = compat_urllib_parse_urlencode({'videoId': video_id,
'playerId': player_id,
'target': target,
})
diff --git a/youtube_dl/extractor/coub.py b/youtube_dl/extractor/coub.py
new file mode 100644
index 0000000..a901b8d
--- /dev/null
+++ b/youtube_dl/extractor/coub.py
@@ -0,0 +1,143 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ float_or_none,
+ int_or_none,
+ parse_iso8601,
+ qualities,
+)
+
+
+class CoubIE(InfoExtractor):
+ _VALID_URL = r'(?:coub:|https?://(?:coub\.com/(?:view|embed|coubs)/|c-cdn\.coub\.com/fb-player\.swf\?.*\bcoub(?:ID|id)=))(?P<id>[\da-z]+)'
+
+ _TESTS = [{
+ 'url': 'http://coub.com/view/5u5n1',
+ 'info_dict': {
+ 'id': '5u5n1',
+ 'ext': 'mp4',
+ 'title': 'The Matrix Moonwalk',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'duration': 4.6,
+ 'timestamp': 1428527772,
+ 'upload_date': '20150408',
+ 'uploader': 'Артём Лоскутников',
+ 'uploader_id': 'artyom.loskutnikov',
+ 'view_count': int,
+ 'like_count': int,
+ 'repost_count': int,
+ 'comment_count': int,
+ 'age_limit': 0,
+ },
+ }, {
+ 'url': 'http://c-cdn.coub.com/fb-player.swf?bot_type=vk&coubID=7w5a4',
+ 'only_matching': True,
+ }, {
+ 'url': 'coub:5u5n1',
+ 'only_matching': True,
+ }, {
+ # longer video id
+ 'url': 'http://coub.com/view/237d5l5h',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ coub = self._download_json(
+ 'http://coub.com/api/v2/coubs/%s.json' % video_id, video_id)
+
+ if coub.get('error'):
+ raise ExtractorError(
+ '%s said: %s' % (self.IE_NAME, coub['error']), expected=True)
+
+ title = coub['title']
+
+ file_versions = coub['file_versions']
+
+ QUALITIES = ('low', 'med', 'high')
+
+ MOBILE = 'mobile'
+ IPHONE = 'iphone'
+ HTML5 = 'html5'
+
+ SOURCE_PREFERENCE = (MOBILE, IPHONE, HTML5)
+
+ quality_key = qualities(QUALITIES)
+ preference_key = qualities(SOURCE_PREFERENCE)
+
+ formats = []
+
+ for kind, items in file_versions.get(HTML5, {}).items():
+ if kind not in ('video', 'audio'):
+ continue
+ if not isinstance(items, dict):
+ continue
+ for quality, item in items.items():
+ if not isinstance(item, dict):
+ continue
+ item_url = item.get('url')
+ if not item_url:
+ continue
+ formats.append({
+ 'url': item_url,
+ 'format_id': '%s-%s-%s' % (HTML5, kind, quality),
+ 'filesize': int_or_none(item.get('size')),
+ 'vcodec': 'none' if kind == 'audio' else None,
+ 'quality': quality_key(quality),
+ 'preference': preference_key(HTML5),
+ })
+
+ iphone_url = file_versions.get(IPHONE, {}).get('url')
+ if iphone_url:
+ formats.append({
+ 'url': iphone_url,
+ 'format_id': IPHONE,
+ 'preference': preference_key(IPHONE),
+ })
+
+ mobile_url = file_versions.get(MOBILE, {}).get('audio_url')
+ if mobile_url:
+ formats.append({
+ 'url': mobile_url,
+ 'format_id': '%s-audio' % MOBILE,
+ 'preference': preference_key(MOBILE),
+ })
+
+ self._sort_formats(formats)
+
+ thumbnail = coub.get('picture')
+ duration = float_or_none(coub.get('duration'))
+ timestamp = parse_iso8601(coub.get('published_at') or coub.get('created_at'))
+ uploader = coub.get('channel', {}).get('title')
+ uploader_id = coub.get('channel', {}).get('permalink')
+
+ view_count = int_or_none(coub.get('views_count') or coub.get('views_increase_count'))
+ like_count = int_or_none(coub.get('likes_count'))
+ repost_count = int_or_none(coub.get('recoubs_count'))
+ comment_count = int_or_none(coub.get('comments_count'))
+
+ age_restricted = coub.get('age_restricted', coub.get('age_restricted_by_admin'))
+ if age_restricted is not None:
+ age_limit = 18 if age_restricted is True else 0
+ else:
+ age_limit = None
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'thumbnail': thumbnail,
+ 'duration': duration,
+ 'timestamp': timestamp,
+ 'uploader': uploader,
+ 'uploader_id': uploader_id,
+ 'view_count': view_count,
+ 'like_count': like_count,
+ 'repost_count': repost_count,
+ 'comment_count': comment_count,
+ 'age_limit': age_limit,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py
index c7032ff..90a6430 100644
--- a/youtube_dl/extractor/crunchyroll.py
+++ b/youtube_dl/extractor/crunchyroll.py
@@ -11,8 +11,7 @@ from math import pow, sqrt, floor
from .common import InfoExtractor
from ..compat import (
compat_etree_fromstring,
- compat_urllib_parse,
- compat_urllib_parse_unquote,
+ compat_urllib_parse_urlencode,
compat_urllib_request,
compat_urlparse,
)
@@ -27,6 +26,7 @@ from ..utils import (
unified_strdate,
urlencode_postdata,
xpath_text,
+ extract_attributes,
)
from ..aes import (
aes_cbc_decrypt,
@@ -54,7 +54,7 @@ class CrunchyrollBaseIE(InfoExtractor):
def _real_initialize(self):
self._login()
- def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None):
+ def _download_webpage(self, url_or_request, *args, **kwargs):
request = (url_or_request if isinstance(url_or_request, compat_urllib_request.Request)
else sanitized_Request(url_or_request))
# Accept-Language must be set explicitly to accept any language to avoid issues
@@ -65,8 +65,7 @@ class CrunchyrollBaseIE(InfoExtractor):
# Crunchyroll to not work in georestriction cases in some browsers that don't place
# the locale lang first in header. However allowing any language seems to workaround the issue.
request.add_header('Accept-Language', '*')
- return super(CrunchyrollBaseIE, self)._download_webpage(
- request, video_id, note, errnote, fatal, tries, timeout, encoding)
+ return super(CrunchyrollBaseIE, self)._download_webpage(request, *args, **kwargs)
@staticmethod
def _add_skip_wall(url):
@@ -79,7 +78,7 @@ class CrunchyrollBaseIE(InfoExtractor):
# See https://github.com/rg3/youtube-dl/issues/7202.
qs['skip_wall'] = ['1']
return compat_urlparse.urlunparse(
- parsed_url._replace(query=compat_urllib_parse.urlencode(qs, True)))
+ parsed_url._replace(query=compat_urllib_parse_urlencode(qs, True)))
class CrunchyrollIE(CrunchyrollBaseIE):
@@ -307,28 +306,36 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
r'<a[^>]+href="/publisher/[^"]+"[^>]*>([^<]+)</a>', webpage,
'video_uploader', fatal=False)
- playerdata_url = compat_urllib_parse_unquote(self._html_search_regex(r'"config_url":"([^"]+)', webpage, 'playerdata_url'))
- playerdata_req = sanitized_Request(playerdata_url)
- playerdata_req.data = compat_urllib_parse.urlencode({'current_page': webpage_url})
- playerdata_req.add_header('Content-Type', 'application/x-www-form-urlencoded')
- playerdata = self._download_webpage(playerdata_req, video_id, note='Downloading media info')
-
- stream_id = self._search_regex(r'<media_id>([^<]+)', playerdata, 'stream_id')
- video_thumbnail = self._search_regex(r'<episode_image_url>([^<]+)', playerdata, 'thumbnail', fatal=False)
-
+ available_fmts = []
+ for a, fmt in re.findall(r'(<a[^>]+token=["\']showmedia\.([0-9]{3,4})p["\'][^>]+>)', webpage):
+ attrs = extract_attributes(a)
+ href = attrs.get('href')
+ if href and '/freetrial' in href:
+ continue
+ available_fmts.append(fmt)
+ if not available_fmts:
+ for p in (r'token=["\']showmedia\.([0-9]{3,4})p"', r'showmedia\.([0-9]{3,4})p'):
+ available_fmts = re.findall(p, webpage)
+ if available_fmts:
+ break
+ video_encode_ids = []
formats = []
- for fmt in re.findall(r'showmedia\.([0-9]{3,4})p', webpage):
+ for fmt in available_fmts:
stream_quality, stream_format = self._FORMAT_IDS[fmt]
video_format = fmt + 'p'
streamdata_req = sanitized_Request(
'http://www.crunchyroll.com/xml/?req=RpcApiVideoPlayer_GetStandardConfig&media_id=%s&video_format=%s&video_quality=%s'
- % (stream_id, stream_format, stream_quality),
- compat_urllib_parse.urlencode({'current_page': url}).encode('utf-8'))
+ % (video_id, stream_format, stream_quality),
+ compat_urllib_parse_urlencode({'current_page': url}).encode('utf-8'))
streamdata_req.add_header('Content-Type', 'application/x-www-form-urlencoded')
streamdata = self._download_xml(
streamdata_req, video_id,
note='Downloading media info for %s' % video_format)
stream_info = streamdata.find('./{default}preload/stream_info')
+ video_encode_id = xpath_text(stream_info, './video_encode_id')
+ if video_encode_id in video_encode_ids:
+ continue
+ video_encode_ids.append(video_encode_id)
video_url = xpath_text(stream_info, './host')
video_play_path = xpath_text(stream_info, './file')
if not video_url or not video_play_path:
@@ -360,6 +367,14 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
'ext': 'flv',
})
formats.append(format_info)
+ self._sort_formats(formats)
+
+ metadata = self._download_xml(
+ 'http://www.crunchyroll.com/xml', video_id,
+ note='Downloading media info', query={
+ 'req': 'RpcApiVideoPlayer_GetMediaMetadata',
+ 'media_id': video_id,
+ })
subtitles = self.extract_subtitles(video_id, webpage)
@@ -367,9 +382,12 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
'id': video_id,
'title': video_title,
'description': video_description,
- 'thumbnail': video_thumbnail,
+ 'thumbnail': xpath_text(metadata, 'episode_image_url'),
'uploader': video_uploader,
'upload_date': video_upload_date,
+ 'series': xpath_text(metadata, 'series_title'),
+ 'episode': xpath_text(metadata, 'episode_title'),
+ 'episode_number': int_or_none(xpath_text(metadata, 'episode_number')),
'subtitles': subtitles,
'formats': formats,
}
diff --git a/youtube_dl/extractor/cspan.py b/youtube_dl/extractor/cspan.py
index b8b9d05..84b36f4 100644
--- a/youtube_dl/extractor/cspan.py
+++ b/youtube_dl/extractor/cspan.py
@@ -15,7 +15,7 @@ from .senateisvp import SenateISVPIE
class CSpanIE(InfoExtractor):
- _VALID_URL = r'http://(?:www\.)?c-span\.org/video/\?(?P<id>[0-9a-f]+)'
+ _VALID_URL = r'https?://(?:www\.)?c-span\.org/video/\?(?P<id>[0-9a-f]+)'
IE_DESC = 'C-SPAN'
_TESTS = [{
'url': 'http://www.c-span.org/video/?313572-1/HolderonV',
diff --git a/youtube_dl/extractor/ctsnews.py b/youtube_dl/extractor/ctsnews.py
index 45049bf..1622fc8 100644
--- a/youtube_dl/extractor/ctsnews.py
+++ b/youtube_dl/extractor/ctsnews.py
@@ -8,7 +8,7 @@ from ..utils import parse_iso8601, ExtractorError
class CtsNewsIE(InfoExtractor):
IE_DESC = '華視新聞'
# https connection failed (Connection reset)
- _VALID_URL = r'http://news\.cts\.com\.tw/[a-z]+/[a-z]+/\d+/(?P<id>\d+)\.html'
+ _VALID_URL = r'https?://news\.cts\.com\.tw/[a-z]+/[a-z]+/\d+/(?P<id>\d+)\.html'
_TESTS = [{
'url': 'http://news.cts.com.tw/cts/international/201501/201501291578109.html',
'md5': 'a9875cb790252b08431186d741beaabe',
diff --git a/youtube_dl/extractor/cwtv.py b/youtube_dl/extractor/cwtv.py
index 36af670..ebd14cb 100644
--- a/youtube_dl/extractor/cwtv.py
+++ b/youtube_dl/extractor/cwtv.py
@@ -9,7 +9,7 @@ from ..utils import (
class CWTVIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?cw(?:tv|seed)\.com/shows/(?:[^/]+/){2}\?play=(?P<id>[a-z0-9]{8}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{12})'
+ _VALID_URL = r'https?://(?:www\.)?cw(?:tv|seed)\.com/(?:shows/)?(?:[^/]+/){2}\?.*\bplay=(?P<id>[a-z0-9]{8}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{12})'
_TESTS = [{
'url': 'http://cwtv.com/shows/arrow/legends-of-yesterday/?play=6b15e985-9345-4f60-baf8-56e96be57c63',
'info_dict': {
@@ -48,6 +48,9 @@ class CWTVIE(InfoExtractor):
# m3u8 download
'skip_download': True,
}
+ }, {
+ 'url': 'http://cwtv.com/thecw/chroniclesofcisco/?play=8adebe35-f447-465f-ab52-e863506ff6d6',
+ 'only_matching': True,
}]
def _real_extract(self, url):
@@ -57,6 +60,7 @@ class CWTVIE(InfoExtractor):
formats = self._extract_m3u8_formats(
video_data['videos']['variantplaylist']['uri'], video_id, 'mp4')
+ self._sort_formats(formats)
thumbnails = [{
'url': image['uri'],
diff --git a/youtube_dl/extractor/dailymail.py b/youtube_dl/extractor/dailymail.py
new file mode 100644
index 0000000..b60a1d8
--- /dev/null
+++ b/youtube_dl/extractor/dailymail.py
@@ -0,0 +1,61 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ int_or_none,
+ determine_protocol,
+)
+
+
+class DailyMailIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?dailymail\.co\.uk/video/[^/]+/video-(?P<id>[0-9]+)'
+ _TEST = {
+ 'url': 'http://www.dailymail.co.uk/video/sciencetech/video-1288527/Turn-video-impressionist-masterpiece.html',
+ 'md5': '2f639d446394f53f3a33658b518b6615',
+ 'info_dict': {
+ 'id': '1288527',
+ 'ext': 'mp4',
+ 'title': 'Turn any video into an impressionist masterpiece',
+ 'description': 'md5:88ddbcb504367987b2708bb38677c9d2',
+ }
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+ video_data = self._parse_json(self._search_regex(
+ r"data-opts='({.+?})'", webpage, 'video data'), video_id)
+ title = video_data['title']
+ video_sources = self._download_json(video_data.get(
+ 'sources', {}).get('url') or 'http://www.dailymail.co.uk/api/player/%s/video-sources.json' % video_id, video_id)
+
+ formats = []
+ for rendition in video_sources['renditions']:
+ rendition_url = rendition.get('url')
+ if not rendition_url:
+ continue
+ tbr = int_or_none(rendition.get('encodingRate'), 1000)
+ container = rendition.get('videoContainer')
+ is_hls = container == 'M2TS'
+ protocol = 'm3u8_native' if is_hls else determine_protocol({'url': rendition_url})
+ formats.append({
+ 'format_id': ('hls' if is_hls else protocol) + ('-%d' % tbr if tbr else ''),
+ 'url': rendition_url,
+ 'width': int_or_none(rendition.get('frameWidth')),
+ 'height': int_or_none(rendition.get('frameHeight')),
+ 'tbr': tbr,
+ 'vcodec': rendition.get('videoCodec'),
+ 'container': container,
+ 'protocol': protocol,
+ 'ext': 'mp4' if is_hls else None,
+ })
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': video_data.get('descr'),
+ 'thumbnail': video_data.get('poster') or video_data.get('thumbnail'),
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/daum.py b/youtube_dl/extractor/daum.py
index c84c510..86024a7 100644
--- a/youtube_dl/extractor/daum.py
+++ b/youtube_dl/extractor/daum.py
@@ -8,8 +8,8 @@ import itertools
from .common import InfoExtractor
from ..compat import (
compat_parse_qs,
- compat_urllib_parse,
compat_urllib_parse_unquote,
+ compat_urllib_parse_urlencode,
compat_urlparse,
)
from ..utils import (
@@ -70,7 +70,7 @@ class DaumIE(InfoExtractor):
def _real_extract(self, url):
video_id = compat_urllib_parse_unquote(self._match_id(url))
- query = compat_urllib_parse.urlencode({'vid': video_id})
+ query = compat_urllib_parse_urlencode({'vid': video_id})
movie_data = self._download_json(
'http://videofarm.daum.net/controller/api/closed/v1_2/IntegratedMovieData.json?' + query,
video_id, 'Downloading video formats info')
@@ -86,7 +86,7 @@ class DaumIE(InfoExtractor):
formats = []
for format_el in movie_data['output_list']['output_list']:
profile = format_el['profile']
- format_query = compat_urllib_parse.urlencode({
+ format_query = compat_urllib_parse_urlencode({
'vid': video_id,
'profile': profile,
})
diff --git a/youtube_dl/extractor/dcn.py b/youtube_dl/extractor/dcn.py
index 15a1c40..efb8585 100644
--- a/youtube_dl/extractor/dcn.py
+++ b/youtube_dl/extractor/dcn.py
@@ -6,7 +6,7 @@ import base64
from .common import InfoExtractor
from ..compat import (
- compat_urllib_parse,
+ compat_urllib_parse_urlencode,
compat_str,
)
from ..utils import (
@@ -15,11 +15,12 @@ from ..utils import (
sanitized_Request,
smuggle_url,
unsmuggle_url,
+ urlencode_postdata,
)
class DCNIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?show/(?P<show_id>\d+)/[^/]+(?:/(?P<video_id>\d+)/(?P<season_id>\d+))?'
+ _VALID_URL = r'https?://(?:www\.)?(?:awaan|dcndigital)\.ae/(?:#/)?show/(?P<show_id>\d+)/[^/]+(?:/(?P<video_id>\d+)/(?P<season_id>\d+))?'
def _real_extract(self, url):
show_id, video_id, season_id = re.match(self._VALID_URL, url).groups()
@@ -54,30 +55,32 @@ class DCNBaseIE(InfoExtractor):
'is_live': is_live,
}
- def _extract_video_formats(self, webpage, video_id, entry_protocol):
+ def _extract_video_formats(self, webpage, video_id, m3u8_entry_protocol):
formats = []
- m3u8_url = self._html_search_regex(
- r'file\s*:\s*"([^"]+)', webpage, 'm3u8 url', fatal=False)
- if m3u8_url:
- formats.extend(self._extract_m3u8_formats(
- m3u8_url, video_id, 'mp4', entry_protocol, m3u8_id='hls', fatal=None))
-
- rtsp_url = self._search_regex(
- r'<a[^>]+href="(rtsp://[^"]+)"', webpage, 'rtsp url', fatal=False)
- if rtsp_url:
- formats.append({
- 'url': rtsp_url,
- 'format_id': 'rtsp',
- })
-
+ format_url_base = 'http' + self._html_search_regex(
+ [
+ r'file\s*:\s*"https?(://[^"]+)/playlist.m3u8',
+ r'<a[^>]+href="rtsp(://[^"]+)"'
+ ], webpage, 'format url')
+ # TODO: Current DASH formats are broken - $Time$ pattern in
+ # <SegmentTemplate> not implemented yet
+ # formats.extend(self._extract_mpd_formats(
+ # format_url_base + '/manifest.mpd',
+ # video_id, mpd_id='dash', fatal=False))
+ formats.extend(self._extract_m3u8_formats(
+ format_url_base + '/playlist.m3u8', video_id, 'mp4',
+ m3u8_entry_protocol, m3u8_id='hls', fatal=False))
+ formats.extend(self._extract_f4m_formats(
+ format_url_base + '/manifest.f4m',
+ video_id, f4m_id='hds', fatal=False))
self._sort_formats(formats)
return formats
class DCNVideoIE(DCNBaseIE):
IE_NAME = 'dcn:video'
- _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?(?:video/[^/]+|media|catchup/[^/]+/[^/]+)/(?P<id>\d+)'
- _TEST = {
+ _VALID_URL = r'https?://(?:www\.)?(?:awaan|dcndigital)\.ae/(?:#/)?(?:video(?:/[^/]+)?|media|catchup/[^/]+/[^/]+)/(?P<id>\d+)'
+ _TESTS = [{
'url': 'http://www.dcndigital.ae/#/video/%D8%B1%D8%AD%D9%84%D8%A9-%D8%A7%D9%84%D8%B9%D9%85%D8%B1-%D8%A7%D9%84%D8%AD%D9%84%D9%82%D8%A9-1/17375',
'info_dict':
{
@@ -93,7 +96,10 @@ class DCNVideoIE(DCNBaseIE):
# m3u8 download
'skip_download': True,
},
- }
+ }, {
+ 'url': 'http://awaan.ae/video/26723981/%D8%AF%D8%A7%D8%B1-%D8%A7%D9%84%D8%B3%D9%84%D8%A7%D9%85:-%D8%AE%D9%8A%D8%B1-%D8%AF%D9%88%D8%B1-%D8%A7%D9%84%D8%A3%D9%86%D8%B5%D8%A7%D8%B1',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
video_id = self._match_id(url)
@@ -106,7 +112,7 @@ class DCNVideoIE(DCNBaseIE):
webpage = self._download_webpage(
'http://admin.mangomolo.com/analytics/index.php/customers/embed/video?' +
- compat_urllib_parse.urlencode({
+ compat_urllib_parse_urlencode({
'id': video_data['id'],
'user_id': video_data['user_id'],
'signature': video_data['signature'],
@@ -119,7 +125,7 @@ class DCNVideoIE(DCNBaseIE):
class DCNLiveIE(DCNBaseIE):
IE_NAME = 'dcn:live'
- _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?live/(?P<id>\d+)'
+ _VALID_URL = r'https?://(?:www\.)?(?:awaan|dcndigital)\.ae/(?:#/)?live/(?P<id>\d+)'
def _real_extract(self, url):
channel_id = self._match_id(url)
@@ -133,7 +139,7 @@ class DCNLiveIE(DCNBaseIE):
webpage = self._download_webpage(
'http://admin.mangomolo.com/analytics/index.php/customers/embed/index?' +
- compat_urllib_parse.urlencode({
+ compat_urllib_parse_urlencode({
'id': base64.b64encode(channel_data['user_id'].encode()).decode(),
'channelid': base64.b64encode(channel_data['id'].encode()).decode(),
'signature': channel_data['signature'],
@@ -146,7 +152,7 @@ class DCNLiveIE(DCNBaseIE):
class DCNSeasonIE(InfoExtractor):
IE_NAME = 'dcn:season'
- _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?program/(?:(?P<show_id>\d+)|season/(?P<season_id>\d+))'
+ _VALID_URL = r'https?://(?:www\.)?(?:awaan|dcndigital)\.ae/(?:#/)?program/(?:(?P<show_id>\d+)|season/(?P<season_id>\d+))'
_TEST = {
'url': 'http://dcndigital.ae/#/program/205024/%D9%85%D8%AD%D8%A7%D8%B6%D8%B1%D8%A7%D8%AA-%D8%A7%D9%84%D8%B4%D9%8A%D8%AE-%D8%A7%D9%84%D8%B4%D8%B9%D8%B1%D8%A7%D9%88%D9%8A',
'info_dict':
@@ -174,7 +180,7 @@ class DCNSeasonIE(InfoExtractor):
data['show_id'] = show_id
request = sanitized_Request(
'http://admin.mangomolo.com/analytics/index.php/plus/show',
- compat_urllib_parse.urlencode(data),
+ urlencode_postdata(data),
{
'Origin': 'http://www.dcndigital.ae',
'Content-Type': 'application/x-www-form-urlencoded'
diff --git a/youtube_dl/extractor/dctp.py b/youtube_dl/extractor/dctp.py
index aa2c09e..9099f50 100644
--- a/youtube_dl/extractor/dctp.py
+++ b/youtube_dl/extractor/dctp.py
@@ -6,7 +6,7 @@ from ..compat import compat_str
class DctpTvIE(InfoExtractor):
- _VALID_URL = r'http://www.dctp.tv/(#/)?filme/(?P<id>.+?)/$'
+ _VALID_URL = r'https?://www.dctp.tv/(#/)?filme/(?P<id>.+?)/$'
_TEST = {
'url': 'http://www.dctp.tv/filme/videoinstallation-fuer-eine-kaufhausfassade/',
'info_dict': {
diff --git a/youtube_dl/extractor/deezer.py b/youtube_dl/extractor/deezer.py
index c3205ff..7a07f32 100644
--- a/youtube_dl/extractor/deezer.py
+++ b/youtube_dl/extractor/deezer.py
@@ -41,7 +41,9 @@ class DeezerPlaylistIE(InfoExtractor):
'Deezer said: %s' % geoblocking_msg, expected=True)
data_json = self._search_regex(
- r'naboo\.display\(\'[^\']+\',\s*(.*?)\);\n', webpage, 'data JSON')
+ (r'__DZR_APP_STATE__\s*=\s*({.+?})\s*</script>',
+ r'naboo\.display\(\'[^\']+\',\s*(.*?)\);\n'),
+ webpage, 'data JSON')
data = json.loads(data_json)
playlist_title = data.get('DATA', {}).get('TITLE')
diff --git a/youtube_dl/extractor/defense.py b/youtube_dl/extractor/defense.py
index 98e3aed..9fe144e 100644
--- a/youtube_dl/extractor/defense.py
+++ b/youtube_dl/extractor/defense.py
@@ -5,7 +5,7 @@ from .common import InfoExtractor
class DefenseGouvFrIE(InfoExtractor):
IE_NAME = 'defense.gouv.fr'
- _VALID_URL = r'http://.*?\.defense\.gouv\.fr/layout/set/ligthboxvideo/base-de-medias/webtv/(?P<id>[^/?#]*)'
+ _VALID_URL = r'https?://.*?\.defense\.gouv\.fr/layout/set/ligthboxvideo/base-de-medias/webtv/(?P<id>[^/?#]*)'
_TEST = {
'url': 'http://www.defense.gouv.fr/layout/set/ligthboxvideo/base-de-medias/webtv/attaque-chimique-syrienne-du-21-aout-2013-1',
diff --git a/youtube_dl/extractor/democracynow.py b/youtube_dl/extractor/democracynow.py
index 6cd395e..65a98d7 100644
--- a/youtube_dl/extractor/democracynow.py
+++ b/youtube_dl/extractor/democracynow.py
@@ -17,37 +17,53 @@ class DemocracynowIE(InfoExtractor):
IE_NAME = 'democracynow'
_TESTS = [{
'url': 'http://www.democracynow.org/shows/2015/7/3',
- 'md5': 'fbb8fe3d7a56a5e12431ce2f9b2fab0d',
+ 'md5': '3757c182d3d84da68f5c8f506c18c196',
'info_dict': {
'id': '2015-0703-001',
'ext': 'mp4',
- 'title': 'July 03, 2015 - Democracy Now!',
- 'description': 'A daily independent global news hour with Amy Goodman & Juan González "What to the Slave is 4th of July?": James Earl Jones Reads Frederick Douglass\u2019 Historic Speech : "This Flag Comes Down Today": Bree Newsome Scales SC Capitol Flagpole, Takes Down Confederate Flag : "We Shall Overcome": Remembering Folk Icon, Activist Pete Seeger in His Own Words & Songs',
+ 'title': 'Daily Show',
},
}, {
'url': 'http://www.democracynow.org/2015/7/3/this_flag_comes_down_today_bree',
- 'md5': 'fbb8fe3d7a56a5e12431ce2f9b2fab0d',
'info_dict': {
'id': '2015-0703-001',
'ext': 'mp4',
'title': '"This Flag Comes Down Today": Bree Newsome Scales SC Capitol Flagpole, Takes Down Confederate Flag',
'description': 'md5:4d2bc4f0d29f5553c2210a4bc7761a21',
},
+ 'params': {
+ 'skip_download': True,
+ },
}]
def _real_extract(self, url):
display_id = self._match_id(url)
+
webpage = self._download_webpage(url, display_id)
- description = self._og_search_description(webpage)
json_data = self._parse_json(self._search_regex(
r'<script[^>]+type="text/json"[^>]*>\s*({[^>]+})', webpage, 'json'),
display_id)
- video_id = None
+
+ title = json_data['title']
formats = []
- default_lang = 'en'
+ video_id = None
+
+ for key in ('file', 'audio', 'video', 'high_res_video'):
+ media_url = json_data.get(key, '')
+ if not media_url:
+ continue
+ media_url = re.sub(r'\?.*', '', compat_urlparse.urljoin(url, media_url))
+ video_id = video_id or remove_start(os.path.splitext(url_basename(media_url))[0], 'dn')
+ formats.append({
+ 'url': media_url,
+ 'vcodec': 'none' if key == 'audio' else None,
+ })
+
+ self._sort_formats(formats)
+ default_lang = 'en'
subtitles = {}
def add_subtitle_item(lang, info_dict):
@@ -67,22 +83,13 @@ class DemocracynowIE(InfoExtractor):
'url': compat_urlparse.urljoin(url, subtitle_item['url']),
})
- for key in ('file', 'audio', 'video'):
- media_url = json_data.get(key, '')
- if not media_url:
- continue
- media_url = re.sub(r'\?.*', '', compat_urlparse.urljoin(url, media_url))
- video_id = video_id or remove_start(os.path.splitext(url_basename(media_url))[0], 'dn')
- formats.append({
- 'url': media_url,
- })
-
- self._sort_formats(formats)
+ description = self._og_search_description(webpage, default=None)
return {
'id': video_id or display_id,
- 'title': json_data['title'],
+ 'title': title,
'description': description,
+ 'thumbnail': json_data.get('image'),
'subtitles': subtitles,
'formats': formats,
}
diff --git a/youtube_dl/extractor/dfb.py b/youtube_dl/extractor/dfb.py
index 263532c..a4d0448 100644
--- a/youtube_dl/extractor/dfb.py
+++ b/youtube_dl/extractor/dfb.py
@@ -12,38 +12,46 @@ class DFBIE(InfoExtractor):
_TEST = {
'url': 'http://tv.dfb.de/video/u-19-em-stimmen-zum-spiel-gegen-russland/11633/',
- # The md5 is different each time
+ 'md5': 'ac0f98a52a330f700b4b3034ad240649',
'info_dict': {
'id': '11633',
'display_id': 'u-19-em-stimmen-zum-spiel-gegen-russland',
- 'ext': 'flv',
+ 'ext': 'mp4',
'title': 'U 19-EM: Stimmen zum Spiel gegen Russland',
'upload_date': '20150714',
},
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
- display_id = mobj.group('display_id')
+ display_id, video_id = re.match(self._VALID_URL, url).groups()
- webpage = self._download_webpage(url, display_id)
player_info = self._download_xml(
'http://tv.dfb.de/server/hd_video.php?play=%s' % video_id,
display_id)
video_info = player_info.find('video')
-
- f4m_info = self._download_xml(
- self._proto_relative_url(video_info.find('url').text.strip()), display_id)
- token_el = f4m_info.find('token')
- manifest_url = token_el.attrib['url'] + '?' + 'hdnea=' + token_el.attrib['auth'] + '&hdcore=3.2.0'
- formats = self._extract_f4m_formats(manifest_url, display_id)
+ stream_access_url = self._proto_relative_url(video_info.find('url').text.strip())
+
+ formats = []
+ # see http://tv.dfb.de/player/js/ajax.js for the method to extract m3u8 formats
+ for sa_url in (stream_access_url, stream_access_url + '&area=&format=iphone'):
+ stream_access_info = self._download_xml(sa_url, display_id)
+ token_el = stream_access_info.find('token')
+ manifest_url = token_el.attrib['url'] + '?' + 'hdnea=' + token_el.attrib['auth']
+ if '.f4m' in manifest_url:
+ formats.extend(self._extract_f4m_formats(
+ manifest_url + '&hdcore=3.2.0',
+ display_id, f4m_id='hds', fatal=False))
+ else:
+ formats.extend(self._extract_m3u8_formats(
+ manifest_url, display_id, 'mp4',
+ 'm3u8_native', m3u8_id='hls', fatal=False))
+ self._sort_formats(formats)
return {
'id': video_id,
'display_id': display_id,
'title': video_info.find('title').text,
- 'thumbnail': self._og_search_thumbnail(webpage),
+ 'thumbnail': 'http://tv.dfb.de/images/%s_640x360.jpg' % video_id,
'upload_date': unified_strdate(video_info.find('time_date').text),
'formats': formats,
}
diff --git a/youtube_dl/extractor/discovery.py b/youtube_dl/extractor/discovery.py
index ce680a9..55853f7 100644
--- a/youtube_dl/extractor/discovery.py
+++ b/youtube_dl/extractor/discovery.py
@@ -9,7 +9,7 @@ from ..compat import compat_str
class DiscoveryIE(InfoExtractor):
- _VALID_URL = r'''(?x)http://(?:www\.)?(?:
+ _VALID_URL = r'''(?x)https?://(?:www\.)?(?:
discovery|
investigationdiscovery|
discoverylife|
@@ -33,6 +33,7 @@ class DiscoveryIE(InfoExtractor):
'duration': 156,
'timestamp': 1302032462,
'upload_date': '20110405',
+ 'uploader_id': '103207',
},
'params': {
'skip_download': True, # requires ffmpeg
@@ -54,7 +55,11 @@ class DiscoveryIE(InfoExtractor):
'upload_date': '20140725',
'timestamp': 1406246400,
'duration': 116,
+ 'uploader_id': '103207',
},
+ 'params': {
+ 'skip_download': True, # requires ffmpeg
+ }
}]
def _real_extract(self, url):
@@ -63,18 +68,30 @@ class DiscoveryIE(InfoExtractor):
video_title = info.get('playlist_title') or info.get('video_title')
- entries = [{
- 'id': compat_str(video_info['id']),
- 'formats': self._extract_m3u8_formats(
- video_info['src'], display_id, 'mp4', 'm3u8_native', m3u8_id='hls',
- note='Download m3u8 information for video %d' % (idx + 1)),
- 'title': video_info['title'],
- 'description': video_info.get('description'),
- 'duration': parse_duration(video_info.get('video_length')),
- 'webpage_url': video_info.get('href') or video_info.get('url'),
- 'thumbnail': video_info.get('thumbnailURL'),
- 'alt_title': video_info.get('secondary_title'),
- 'timestamp': parse_iso8601(video_info.get('publishedDate')),
- } for idx, video_info in enumerate(info['playlist'])]
+ entries = []
+
+ for idx, video_info in enumerate(info['playlist']):
+ subtitles = {}
+ caption_url = video_info.get('captionsUrl')
+ if caption_url:
+ subtitles = {
+ 'en': [{
+ 'url': caption_url,
+ }]
+ }
+
+ entries.append({
+ '_type': 'url_transparent',
+ 'url': 'http://players.brightcove.net/103207/default_default/index.html?videoId=ref:%s' % video_info['referenceId'],
+ 'id': compat_str(video_info['id']),
+ 'title': video_info['title'],
+ 'description': video_info.get('description'),
+ 'duration': parse_duration(video_info.get('video_length')),
+ 'webpage_url': video_info.get('href') or video_info.get('url'),
+ 'thumbnail': video_info.get('thumbnailURL'),
+ 'alt_title': video_info.get('secondary_title'),
+ 'timestamp': parse_iso8601(video_info.get('publishedDate')),
+ 'subtitles': subtitles,
+ })
return self.playlist_result(entries, display_id, video_title)
diff --git a/youtube_dl/extractor/dispeak.py b/youtube_dl/extractor/dispeak.py
new file mode 100644
index 0000000..a78cb8a
--- /dev/null
+++ b/youtube_dl/extractor/dispeak.py
@@ -0,0 +1,114 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ int_or_none,
+ parse_duration,
+ remove_end,
+ xpath_element,
+ xpath_text,
+)
+
+
+class DigitallySpeakingIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:evt\.dispeak|events\.digitallyspeaking)\.com/(?:[^/]+/)+xml/(?P<id>[^.]+)\.xml'
+
+ _TESTS = [{
+ # From http://gdcvault.com/play/1023460/Tenacious-Design-and-The-Interface
+ 'url': 'http://evt.dispeak.com/ubm/gdc/sf16/xml/840376_BQRC.xml',
+ 'md5': 'a8efb6c31ed06ca8739294960b2dbabd',
+ 'info_dict': {
+ 'id': '840376_BQRC',
+ 'ext': 'mp4',
+ 'title': 'Tenacious Design and The Interface of \'Destiny\'',
+ },
+ }, {
+ # From http://www.gdcvault.com/play/1014631/Classic-Game-Postmortem-PAC
+ 'url': 'http://events.digitallyspeaking.com/gdc/sf11/xml/12396_1299111843500GMPX.xml',
+ 'only_matching': True,
+ }]
+
+ def _parse_mp4(self, metadata):
+ video_formats = []
+ video_root = None
+
+ mp4_video = xpath_text(metadata, './mp4video', default=None)
+ if mp4_video is not None:
+ mobj = re.match(r'(?P<root>https?://.*?/).*', mp4_video)
+ video_root = mobj.group('root')
+ if video_root is None:
+ http_host = xpath_text(metadata, 'httpHost', default=None)
+ if http_host:
+ video_root = 'http://%s/' % http_host
+ if video_root is None:
+ # Hard-coded in http://evt.dispeak.com/ubm/gdc/sf16/custom/player2.js
+ # Works for GPUTechConf, too
+ video_root = 'http://s3-2u.digitallyspeaking.com/'
+
+ formats = metadata.findall('./MBRVideos/MBRVideo')
+ if not formats:
+ return None
+ for a_format in formats:
+ stream_name = xpath_text(a_format, 'streamName', fatal=True)
+ video_path = re.match(r'mp4\:(?P<path>.*)', stream_name).group('path')
+ url = video_root + video_path
+ vbr = xpath_text(a_format, 'bitrate')
+ video_formats.append({
+ 'url': url,
+ 'vbr': int_or_none(vbr),
+ })
+ return video_formats
+
+ def _parse_flv(self, metadata):
+ formats = []
+ akamai_url = xpath_text(metadata, './akamaiHost', fatal=True)
+ audios = metadata.findall('./audios/audio')
+ for audio in audios:
+ formats.append({
+ 'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url,
+ 'play_path': remove_end(audio.get('url'), '.flv'),
+ 'ext': 'flv',
+ 'vcodec': 'none',
+ 'format_id': audio.get('code'),
+ })
+ slide_video_path = xpath_text(metadata, './slideVideo', fatal=True)
+ formats.append({
+ 'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url,
+ 'play_path': remove_end(slide_video_path, '.flv'),
+ 'ext': 'flv',
+ 'format_note': 'slide deck video',
+ 'quality': -2,
+ 'preference': -2,
+ 'format_id': 'slides',
+ })
+ speaker_video_path = xpath_text(metadata, './speakerVideo', fatal=True)
+ formats.append({
+ 'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url,
+ 'play_path': remove_end(speaker_video_path, '.flv'),
+ 'ext': 'flv',
+ 'format_note': 'speaker video',
+ 'quality': -1,
+ 'preference': -1,
+ 'format_id': 'speaker',
+ })
+ return formats
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ xml_description = self._download_xml(url, video_id)
+ metadata = xpath_element(xml_description, 'metadata')
+
+ video_formats = self._parse_mp4(metadata)
+ if video_formats is None:
+ video_formats = self._parse_flv(metadata)
+
+ return {
+ 'id': video_id,
+ 'formats': video_formats,
+ 'title': xpath_text(metadata, 'title', fatal=True),
+ 'duration': parse_duration(xpath_text(metadata, 'endTime')),
+ 'creator': xpath_text(metadata, 'speaker'),
+ }
diff --git a/youtube_dl/extractor/douyutv.py b/youtube_dl/extractor/douyutv.py
index 373b3b4..ce69627 100644
--- a/youtube_dl/extractor/douyutv.py
+++ b/youtube_dl/extractor/douyutv.py
@@ -10,7 +10,7 @@ from ..compat import (compat_str, compat_basestring)
class DouyuTVIE(InfoExtractor):
IE_DESC = '斗鱼'
- _VALID_URL = r'http://(?:www\.)?douyutv\.com/(?P<id>[A-Za-z0-9]+)'
+ _VALID_URL = r'https?://(?:www\.)?douyu(?:tv)?\.com/(?P<id>[A-Za-z0-9]+)'
_TESTS = [{
'url': 'http://www.douyutv.com/iseven',
'info_dict': {
@@ -18,7 +18,7 @@ class DouyuTVIE(InfoExtractor):
'display_id': 'iseven',
'ext': 'flv',
'title': 're:^清晨醒脑!T-ara根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
- 'description': 'md5:c93d6692dde6fe33809a46edcbecca44',
+ 'description': 're:.*m7show@163\.com.*',
'thumbnail': 're:^https?://.*\.jpg$',
'uploader': '7师傅',
'uploader_id': '431925',
@@ -26,7 +26,7 @@ class DouyuTVIE(InfoExtractor):
},
'params': {
'skip_download': True,
- }
+ },
}, {
'url': 'http://www.douyutv.com/85982',
'info_dict': {
@@ -42,7 +42,27 @@ class DouyuTVIE(InfoExtractor):
},
'params': {
'skip_download': True,
- }
+ },
+ 'skip': 'Room not found',
+ }, {
+ 'url': 'http://www.douyutv.com/17732',
+ 'info_dict': {
+ 'id': '17732',
+ 'display_id': '17732',
+ 'ext': 'flv',
+ 'title': 're:^清晨醒脑!T-ara根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+ 'description': 're:.*m7show@163\.com.*',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'uploader': '7师傅',
+ 'uploader_id': '431925',
+ 'is_live': True,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'http://www.douyu.com/xiaocang',
+ 'only_matching': True,
}]
def _real_extract(self, url):
@@ -55,13 +75,28 @@ class DouyuTVIE(InfoExtractor):
room_id = self._html_search_regex(
r'"room_id"\s*:\s*(\d+),', page, 'room id')
- prefix = 'room/%s?aid=android&client_sys=android&time=%d' % (
- room_id, int(time.time()))
-
- auth = hashlib.md5((prefix + '1231').encode('ascii')).hexdigest()
- config = self._download_json(
- 'http://www.douyutv.com/api/v1/%s&auth=%s' % (prefix, auth),
- video_id)
+ config = None
+ # Douyu API sometimes returns error "Unable to load the requested class: eticket_redis_cache"
+ # Retry with different parameters - same parameters cause same errors
+ for i in range(5):
+ prefix = 'room/%s?aid=android&client_sys=android&time=%d' % (
+ room_id, int(time.time()))
+ auth = hashlib.md5((prefix + '1231').encode('ascii')).hexdigest()
+
+ config_page = self._download_webpage(
+ 'http://www.douyutv.com/api/v1/%s&auth=%s' % (prefix, auth),
+ video_id)
+ try:
+ config = self._parse_json(config_page, video_id, fatal=False)
+ except ExtractorError:
+ # Wait some time before retrying to get a different time() value
+ self._sleep(1, video_id, msg_template='%(video_id)s: Error occurs. '
+ 'Waiting for %(timeout)s seconds before retrying')
+ continue
+ else:
+ break
+ if config is None:
+ raise ExtractorError('Unable to fetch API result')
data = config['data']
diff --git a/youtube_dl/extractor/dplay.py b/youtube_dl/extractor/dplay.py
index 6cda56a..5790553 100644
--- a/youtube_dl/extractor/dplay.py
+++ b/youtube_dl/extractor/dplay.py
@@ -1,51 +1,163 @@
-# encoding: utf-8
+# coding: utf-8
from __future__ import unicode_literals
+import json
+import re
import time
from .common import InfoExtractor
-from ..utils import int_or_none
+from ..compat import compat_urlparse
+from ..utils import (
+ int_or_none,
+ update_url_query,
+)
class DPlayIE(InfoExtractor):
- _VALID_URL = r'http://www\.dplay\.se/[^/]+/(?P<id>[^/?#]+)'
+ _VALID_URL = r'https?://(?P<domain>it\.dplay\.com|www\.dplay\.(?:dk|se|no))/[^/]+/(?P<id>[^/?#]+)'
- _TEST = {
+ _TESTS = [{
+ # geo restricted, via direct unsigned hls URL
+ 'url': 'http://it.dplay.com/take-me-out/stagione-1-episodio-25/',
+ 'info_dict': {
+ 'id': '1255600',
+ 'display_id': 'stagione-1-episodio-25',
+ 'ext': 'mp4',
+ 'title': 'Episodio 25',
+ 'description': 'md5:cae5f40ad988811b197d2d27a53227eb',
+ 'duration': 2761,
+ 'timestamp': 1454701800,
+ 'upload_date': '20160205',
+ 'creator': 'RTIT',
+ 'series': 'Take me out',
+ 'season_number': 1,
+ 'episode_number': 25,
+ 'age_limit': 0,
+ },
+ 'expected_warnings': ['Unable to download f4m manifest'],
+ }, {
+ # non geo restricted, via secure api, unsigned download hls URL
'url': 'http://www.dplay.se/nugammalt-77-handelser-som-format-sverige/season-1-svensken-lar-sig-njuta-av-livet/',
'info_dict': {
'id': '3172',
- 'ext': 'mp4',
'display_id': 'season-1-svensken-lar-sig-njuta-av-livet',
+ 'ext': 'mp4',
'title': 'Svensken lär sig njuta av livet',
+ 'description': 'md5:d3819c9bccffd0fe458ca42451dd50d8',
'duration': 2650,
+ 'timestamp': 1365454320,
+ 'upload_date': '20130408',
+ 'creator': 'Kanal 5 (Home)',
+ 'series': 'Nugammalt - 77 händelser som format Sverige',
+ 'season_number': 1,
+ 'episode_number': 1,
+ 'age_limit': 0,
},
- }
+ }, {
+ # geo restricted, via secure api, unsigned download hls URL
+ 'url': 'http://www.dplay.dk/mig-og-min-mor/season-6-episode-12/',
+ 'info_dict': {
+ 'id': '70816',
+ 'display_id': 'season-6-episode-12',
+ 'ext': 'mp4',
+ 'title': 'Episode 12',
+ 'description': 'md5:9c86e51a93f8a4401fc9641ef9894c90',
+ 'duration': 2563,
+ 'timestamp': 1429696800,
+ 'upload_date': '20150422',
+ 'creator': 'Kanal 4 (Home)',
+ 'series': 'Mig og min mor',
+ 'season_number': 6,
+ 'episode_number': 12,
+ 'age_limit': 0,
+ },
+ }, {
+ # geo restricted, via direct unsigned hls URL
+ 'url': 'http://www.dplay.no/pga-tour/season-1-hoydepunkter-18-21-februar/',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
- display_id = self._match_id(url)
+ mobj = re.match(self._VALID_URL, url)
+ display_id = mobj.group('id')
+ domain = mobj.group('domain')
+
webpage = self._download_webpage(url, display_id)
+
video_id = self._search_regex(
- r'data-video-id="(\d+)"', webpage, 'video id')
+ r'data-video-id=["\'](\d+)', webpage, 'video id')
info = self._download_json(
- 'http://www.dplay.se/api/v2/ajax/videos?video_id=' + video_id,
+ 'http://%s/api/v2/ajax/videos?video_id=%s' % (domain, video_id),
video_id)['data'][0]
- self._set_cookie(
- 'secure.dplay.se', 'dsc-geo',
- '{"countryCode":"NL","expiry":%d}' % ((time.time() + 20 * 60) * 1000))
- # TODO: consider adding support for 'stream_type=hds', it seems to
- # require setting some cookies
- manifest_url = self._download_json(
- 'https://secure.dplay.se/secure/api/v2/user/authorization/stream/%s?stream_type=hls' % video_id,
- video_id, 'Getting manifest url for hls stream')['hls']
- formats = self._extract_m3u8_formats(
- manifest_url, video_id, ext='mp4', entry_protocol='m3u8_native')
+ title = info['title']
+
+ PROTOCOLS = ('hls', 'hds')
+ formats = []
+
+ def extract_formats(protocol, manifest_url):
+ if protocol == 'hls':
+ m3u8_formats = self._extract_m3u8_formats(
+ manifest_url, video_id, ext='mp4',
+ entry_protocol='m3u8_native', m3u8_id=protocol, fatal=False)
+ # Sometimes final URLs inside m3u8 are unsigned, let's fix this
+ # ourselves
+ query = compat_urlparse.parse_qs(compat_urlparse.urlparse(manifest_url).query)
+ for m3u8_format in m3u8_formats:
+ m3u8_format['url'] = update_url_query(m3u8_format['url'], query)
+ formats.extend(m3u8_formats)
+ elif protocol == 'hds':
+ formats.extend(self._extract_f4m_formats(
+ manifest_url + '&hdcore=3.8.0&plugin=flowplayer-3.8.0.0',
+ video_id, f4m_id=protocol, fatal=False))
+
+ domain_tld = domain.split('.')[-1]
+ if domain_tld in ('se', 'dk', 'no'):
+ for protocol in PROTOCOLS:
+ # Providing dsc-geo allows to bypass geo restriction in some cases
+ self._set_cookie(
+ 'secure.dplay.%s' % domain_tld, 'dsc-geo',
+ json.dumps({
+ 'countryCode': domain_tld.upper(),
+ 'expiry': (time.time() + 20 * 60) * 1000,
+ }))
+ stream = self._download_json(
+ 'https://secure.dplay.%s/secure/api/v2/user/authorization/stream/%s?stream_type=%s'
+ % (domain_tld, video_id, protocol), video_id,
+ 'Downloading %s stream JSON' % protocol, fatal=False)
+ if stream and stream.get(protocol):
+ extract_formats(protocol, stream[protocol])
+
+ # The last resort is to try direct unsigned hls/hds URLs from info dictionary.
+ # Sometimes this does work even when secure API with dsc-geo has failed (e.g.
+ # http://www.dplay.no/pga-tour/season-1-hoydepunkter-18-21-februar/).
+ if not formats:
+ for protocol in PROTOCOLS:
+ if info.get(protocol):
+ extract_formats(protocol, info[protocol])
+
+ self._sort_formats(formats)
+
+ subtitles = {}
+ for lang in ('se', 'sv', 'da', 'nl', 'no'):
+ for format_id in ('web_vtt', 'vtt', 'srt'):
+ subtitle_url = info.get('subtitles_%s_%s' % (lang, format_id))
+ if subtitle_url:
+ subtitles.setdefault(lang, []).append({'url': subtitle_url})
return {
'id': video_id,
'display_id': display_id,
- 'title': info['title'],
- 'formats': formats,
+ 'title': title,
+ 'description': info.get('video_metadata_longDescription'),
'duration': int_or_none(info.get('video_metadata_length'), scale=1000),
+ 'timestamp': int_or_none(info.get('video_publish_date')),
+ 'creator': info.get('video_metadata_homeChannel'),
+ 'series': info.get('video_metadata_show'),
+ 'season_number': int_or_none(info.get('season')),
+ 'episode_number': int_or_none(info.get('episode')),
+ 'age_limit': int_or_none(info.get('minimum_age')),
+ 'formats': formats,
+ 'subtitles': subtitles,
}
diff --git a/youtube_dl/extractor/dramafever.py b/youtube_dl/extractor/dramafever.py
index d35e888..3b6529f 100644
--- a/youtube_dl/extractor/dramafever.py
+++ b/youtube_dl/extractor/dramafever.py
@@ -6,7 +6,6 @@ import itertools
from .amp import AMPIE
from ..compat import (
compat_HTTPError,
- compat_urllib_parse,
compat_urlparse,
)
from ..utils import (
@@ -14,6 +13,7 @@ from ..utils import (
clean_html,
int_or_none,
sanitized_Request,
+ urlencode_postdata
)
@@ -50,7 +50,7 @@ class DramaFeverBaseIE(AMPIE):
}
request = sanitized_Request(
- self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8'))
+ self._LOGIN_URL, urlencode_postdata(login_form))
response = self._download_webpage(
request, None, 'Logging in as %s' % username)
diff --git a/youtube_dl/extractor/dreisat.py b/youtube_dl/extractor/dreisat.py
index 028144f..0040e70 100644
--- a/youtube_dl/extractor/dreisat.py
+++ b/youtube_dl/extractor/dreisat.py
@@ -7,7 +7,7 @@ from .zdf import ZDFIE
class DreiSatIE(ZDFIE):
IE_NAME = '3sat'
- _VALID_URL = r'(?:http://)?(?:www\.)?3sat\.de/mediathek/(?:index\.php|mediathek\.php)?\?(?:(?:mode|display)=[^&]+&)*obj=(?P<id>[0-9]+)$'
+ _VALID_URL = r'(?:https?://)?(?:www\.)?3sat\.de/mediathek/(?:index\.php|mediathek\.php)?\?(?:(?:mode|display)=[^&]+&)*obj=(?P<id>[0-9]+)$'
_TESTS = [
{
'url': 'http://www.3sat.de/mediathek/index.php?mode=play&obj=45918',
diff --git a/youtube_dl/extractor/dump.py b/youtube_dl/extractor/dump.py
deleted file mode 100644
index ff78d4f..0000000
--- a/youtube_dl/extractor/dump.py
+++ /dev/null
@@ -1,39 +0,0 @@
-# encoding: utf-8
-from __future__ import unicode_literals
-
-import re
-
-from .common import InfoExtractor
-
-
-class DumpIE(InfoExtractor):
- _VALID_URL = r'^https?://(?:www\.)?dump\.com/(?P<id>[a-zA-Z0-9]+)/'
-
- _TEST = {
- 'url': 'http://www.dump.com/oneus/',
- 'md5': 'ad71704d1e67dfd9e81e3e8b42d69d99',
- 'info_dict': {
- 'id': 'oneus',
- 'ext': 'flv',
- 'title': "He's one of us.",
- 'thumbnail': 're:^https?://.*\.jpg$',
- },
- }
-
- def _real_extract(self, url):
- m = re.match(self._VALID_URL, url)
- video_id = m.group('id')
-
- webpage = self._download_webpage(url, video_id)
- video_url = self._search_regex(
- r's1.addVariable\("file",\s*"([^"]+)"', webpage, 'video URL')
-
- title = self._og_search_title(webpage)
- thumbnail = self._og_search_thumbnail(webpage)
-
- return {
- 'id': video_id,
- 'title': title,
- 'url': video_url,
- 'thumbnail': thumbnail,
- }
diff --git a/youtube_dl/extractor/dvtv.py b/youtube_dl/extractor/dvtv.py
index c1a4bc7..974c69d 100644
--- a/youtube_dl/extractor/dvtv.py
+++ b/youtube_dl/extractor/dvtv.py
@@ -15,7 +15,7 @@ class DVTVIE(InfoExtractor):
IE_NAME = 'dvtv'
IE_DESC = 'http://video.aktualne.cz/'
- _VALID_URL = r'http://video\.aktualne\.cz/(?:[^/]+/)+r~(?P<id>[0-9a-f]{32})'
+ _VALID_URL = r'https?://video\.aktualne\.cz/(?:[^/]+/)+r~(?P<id>[0-9a-f]{32})'
_TESTS = [{
'url': 'http://video.aktualne.cz/dvtv/vondra-o-ceskem-stoleti-pri-pohledu-na-havla-mi-bylo-trapne/r~e5efe9ca855511e4833a0025900fea04/',
diff --git a/youtube_dl/extractor/dw.py b/youtube_dl/extractor/dw.py
new file mode 100644
index 0000000..d740652
--- /dev/null
+++ b/youtube_dl/extractor/dw.py
@@ -0,0 +1,108 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ int_or_none,
+ unified_strdate,
+)
+from ..compat import compat_urlparse
+
+
+class DWIE(InfoExtractor):
+ IE_NAME = 'dw'
+ _VALID_URL = r'https?://(?:www\.)?dw\.com/(?:[^/]+/)+(?:av|e)-(?P<id>\d+)'
+ _TESTS = [{
+ # video
+ 'url': 'http://www.dw.com/en/intelligent-light/av-19112290',
+ 'md5': '7372046e1815c5a534b43f3c3c36e6e9',
+ 'info_dict': {
+ 'id': '19112290',
+ 'ext': 'mp4',
+ 'title': 'Intelligent light',
+ 'description': 'md5:90e00d5881719f2a6a5827cb74985af1',
+ 'upload_date': '20160311',
+ }
+ }, {
+ # audio
+ 'url': 'http://www.dw.com/en/worldlink-my-business/av-19111941',
+ 'md5': '2814c9a1321c3a51f8a7aeb067a360dd',
+ 'info_dict': {
+ 'id': '19111941',
+ 'ext': 'mp3',
+ 'title': 'WorldLink: My business',
+ 'description': 'md5:bc9ca6e4e063361e21c920c53af12405',
+ 'upload_date': '20160311',
+ }
+ }, {
+ # DW documentaries, only last for one or two weeks
+ 'url': 'http://www.dw.com/en/documentaries-welcome-to-the-90s-2016-05-21/e-19220158-9798',
+ 'md5': '56b6214ef463bfb9a3b71aeb886f3cf1',
+ 'info_dict': {
+ 'id': '19274438',
+ 'ext': 'mp4',
+ 'title': 'Welcome to the 90s – Hip Hop',
+ 'description': 'Welcome to the 90s - The Golden Decade of Hip Hop',
+ 'upload_date': '20160521',
+ },
+ 'skip': 'Video removed',
+ }]
+
+ def _real_extract(self, url):
+ media_id = self._match_id(url)
+ webpage = self._download_webpage(url, media_id)
+ hidden_inputs = self._hidden_inputs(webpage)
+ title = hidden_inputs['media_title']
+ media_id = hidden_inputs.get('media_id') or media_id
+
+ if hidden_inputs.get('player_type') == 'video' and hidden_inputs.get('stream_file') == '1':
+ formats = self._extract_smil_formats(
+ 'http://www.dw.com/smil/v-%s' % media_id, media_id,
+ transform_source=lambda s: s.replace(
+ 'rtmp://tv-od.dw.de/flash/',
+ 'http://tv-download.dw.de/dwtv_video/flv/'))
+ self._sort_formats(formats)
+ else:
+ formats = [{'url': hidden_inputs['file_name']}]
+
+ upload_date = hidden_inputs.get('display_date')
+ if not upload_date:
+ upload_date = self._html_search_regex(
+ r'<span[^>]+class="date">([0-9.]+)\s*\|', webpage,
+ 'upload date', default=None)
+ upload_date = unified_strdate(upload_date)
+
+ return {
+ 'id': media_id,
+ 'title': title,
+ 'description': self._og_search_description(webpage),
+ 'thumbnail': hidden_inputs.get('preview_image'),
+ 'duration': int_or_none(hidden_inputs.get('file_duration')),
+ 'upload_date': upload_date,
+ 'formats': formats,
+ }
+
+
+class DWArticleIE(InfoExtractor):
+ IE_NAME = 'dw:article'
+ _VALID_URL = r'https?://(?:www\.)?dw\.com/(?:[^/]+/)+a-(?P<id>\d+)'
+ _TEST = {
+ 'url': 'http://www.dw.com/en/no-hope-limited-options-for-refugees-in-idomeni/a-19111009',
+ 'md5': '8ca657f9d068bbef74d6fc38b97fc869',
+ 'info_dict': {
+ 'id': '19105868',
+ 'ext': 'mp4',
+ 'title': 'The harsh life of refugees in Idomeni',
+ 'description': 'md5:196015cc7e48ebf474db9399420043c7',
+ 'upload_date': '20160310',
+ }
+ }
+
+ def _real_extract(self, url):
+ article_id = self._match_id(url)
+ webpage = self._download_webpage(url, article_id)
+ hidden_inputs = self._hidden_inputs(webpage)
+ media_id = hidden_inputs['media_id']
+ media_path = self._search_regex(r'href="([^"]+av-%s)"\s+class="overlayLink"' % media_id, webpage, 'media url')
+ media_url = compat_urlparse.urljoin(url, media_path)
+ return self.url_result(media_url, 'DW', media_id)
diff --git a/youtube_dl/extractor/eagleplatform.py b/youtube_dl/extractor/eagleplatform.py
index 7bbf617..113a496 100644
--- a/youtube_dl/extractor/eagleplatform.py
+++ b/youtube_dl/extractor/eagleplatform.py
@@ -4,9 +4,11 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..compat import compat_HTTPError
from ..utils import (
ExtractorError,
int_or_none,
+ url_basename,
)
@@ -21,7 +23,7 @@ class EaglePlatformIE(InfoExtractor):
_TESTS = [{
# http://lenta.ru/news/2015/03/06/navalny/
'url': 'http://lentaru.media.eagleplatform.com/index/player?player=new&record_id=227304&player_template_id=5201',
- 'md5': '70f5187fb620f2c1d503b3b22fd4efe3',
+ # Not checking MD5 as sometimes the direct HTTP link results in 404 and HLS is used
'info_dict': {
'id': '227304',
'ext': 'mp4',
@@ -36,7 +38,7 @@ class EaglePlatformIE(InfoExtractor):
# http://muz-tv.ru/play/7129/
# http://media.clipyou.ru/index/player?record_id=12820&width=730&height=415&autoplay=true
'url': 'eagleplatform:media.clipyou.ru:12820',
- 'md5': '90b26344ba442c8e44aa4cf8f301164a',
+ 'md5': '358597369cf8ba56675c1df15e7af624',
'info_dict': {
'id': '12820',
'ext': 'mp4',
@@ -55,8 +57,13 @@ class EaglePlatformIE(InfoExtractor):
raise ExtractorError(' '.join(response['errors']), expected=True)
def _download_json(self, url_or_request, video_id, note='Downloading JSON metadata'):
- response = super(EaglePlatformIE, self)._download_json(url_or_request, video_id, note)
- self._handle_error(response)
+ try:
+ response = super(EaglePlatformIE, self)._download_json(url_or_request, video_id, note)
+ except ExtractorError as ee:
+ if isinstance(ee.cause, compat_HTTPError):
+ response = self._parse_json(ee.cause.read().decode('utf-8'), video_id)
+ self._handle_error(response)
+ raise
return response
def _get_video_url(self, url_or_request, video_id, note='Downloading JSON metadata'):
@@ -84,17 +91,33 @@ class EaglePlatformIE(InfoExtractor):
secure_m3u8 = self._proto_relative_url(media['sources']['secure_m3u8']['auto'], 'http:')
+ formats = []
+
m3u8_url = self._get_video_url(secure_m3u8, video_id, 'Downloading m3u8 JSON')
- formats = self._extract_m3u8_formats(
+ m3u8_formats = self._extract_m3u8_formats(
m3u8_url, video_id,
'mp4', entry_protocol='m3u8_native', m3u8_id='hls')
+ formats.extend(m3u8_formats)
mp4_url = self._get_video_url(
# Secure mp4 URL is constructed according to Player.prototype.mp4 from
# http://lentaru.media.eagleplatform.com/player/player.js
re.sub(r'm3u8|hlsvod|hls|f4m', 'mp4', secure_m3u8),
video_id, 'Downloading mp4 JSON')
- formats.append({'url': mp4_url, 'format_id': 'mp4'})
+ mp4_url_basename = url_basename(mp4_url)
+ for m3u8_format in m3u8_formats:
+ mobj = re.search('/([^/]+)/index\.m3u8', m3u8_format['url'])
+ if mobj:
+ http_format = m3u8_format.copy()
+ video_url = mp4_url.replace(mp4_url_basename, mobj.group(1))
+ if not self._is_valid_url(video_url, video_id):
+ continue
+ http_format.update({
+ 'url': video_url,
+ 'format_id': m3u8_format['format_id'].replace('hls', 'http'),
+ 'protocol': 'http',
+ })
+ formats.append(http_format)
self._sort_formats(formats)
diff --git a/youtube_dl/extractor/ebaumsworld.py b/youtube_dl/extractor/ebaumsworld.py
index b6bfd2b..c97682c 100644
--- a/youtube_dl/extractor/ebaumsworld.py
+++ b/youtube_dl/extractor/ebaumsworld.py
@@ -4,10 +4,10 @@ from .common import InfoExtractor
class EbaumsWorldIE(InfoExtractor):
- _VALID_URL = r'https?://www\.ebaumsworld\.com/video/watch/(?P<id>\d+)'
+ _VALID_URL = r'https?://(?:www\.)?ebaumsworld\.com/videos/[^/]+/(?P<id>\d+)'
_TEST = {
- 'url': 'http://www.ebaumsworld.com/video/watch/83367677/',
+ 'url': 'http://www.ebaumsworld.com/videos/a-giant-python-opens-the-door/83367677/',
'info_dict': {
'id': '83367677',
'ext': 'mp4',
diff --git a/youtube_dl/extractor/echomsk.py b/youtube_dl/extractor/echomsk.py
index d2d9404..6b7cc65 100644
--- a/youtube_dl/extractor/echomsk.py
+++ b/youtube_dl/extractor/echomsk.py
@@ -7,7 +7,7 @@ from .common import InfoExtractor
class EchoMskIE(InfoExtractor):
- _VALID_URL = r'http://(?:www\.)?echo\.msk\.ru/sounds/(?P<id>\d+)'
+ _VALID_URL = r'https?://(?:www\.)?echo\.msk\.ru/sounds/(?P<id>\d+)'
_TEST = {
'url': 'http://www.echo.msk.ru/sounds/1464134.html',
'md5': '2e44b3b78daff5b458e4dbc37f191f7c',
diff --git a/youtube_dl/extractor/elpais.py b/youtube_dl/extractor/elpais.py
index 00a69e6..8c725a4 100644
--- a/youtube_dl/extractor/elpais.py
+++ b/youtube_dl/extractor/elpais.py
@@ -9,7 +9,7 @@ class ElPaisIE(InfoExtractor):
_VALID_URL = r'https?://(?:[^.]+\.)?elpais\.com/.*/(?P<id>[^/#?]+)\.html(?:$|[?#])'
IE_DESC = 'El País'
- _TEST = {
+ _TESTS = [{
'url': 'http://blogs.elpais.com/la-voz-de-inaki/2014/02/tiempo-nuevo-recetas-viejas.html',
'md5': '98406f301f19562170ec071b83433d55',
'info_dict': {
@@ -19,30 +19,41 @@ class ElPaisIE(InfoExtractor):
'description': 'De lunes a viernes, a partir de las ocho de la mañana, Iñaki Gabilondo nos cuenta su visión de la actualidad nacional e internacional.',
'upload_date': '20140206',
}
- }
+ }, {
+ 'url': 'http://elcomidista.elpais.com/elcomidista/2016/02/24/articulo/1456340311_668921.html#?id_externo_nwl=newsletter_diaria20160303t',
+ 'md5': '3bd5b09509f3519d7d9e763179b013de',
+ 'info_dict': {
+ 'id': '1456340311_668921',
+ 'ext': 'mp4',
+ 'title': 'Cómo hacer el mejor café con cafetera italiana',
+ 'description': 'Que sí, que las cápsulas son cómodas. Pero si le pides algo más a la vida, quizá deberías aprender a usar bien la cafetera italiana. No tienes más que ver este vídeo y seguir sus siete normas básicas.',
+ 'upload_date': '20160303',
+ }
+ }]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
prefix = self._html_search_regex(
- r'var url_cache = "([^"]+)";', webpage, 'URL prefix')
+ r'var\s+url_cache\s*=\s*"([^"]+)";', webpage, 'URL prefix')
video_suffix = self._search_regex(
- r"URLMediaFile = url_cache \+ '([^']+)'", webpage, 'video URL')
+ r"(?:URLMediaFile|urlVideo_\d+)\s*=\s*url_cache\s*\+\s*'([^']+)'", webpage, 'video URL')
video_url = prefix + video_suffix
thumbnail_suffix = self._search_regex(
- r"URLMediaStill = url_cache \+ '([^']+)'", webpage, 'thumbnail URL',
- fatal=False)
+ r"(?:URLMediaStill|urlFotogramaFijo_\d+)\s*=\s*url_cache\s*\+\s*'([^']+)'",
+ webpage, 'thumbnail URL', fatal=False)
thumbnail = (
None if thumbnail_suffix is None
else prefix + thumbnail_suffix)
title = self._html_search_regex(
- '<h2 class="entry-header entry-title.*?>(.*?)</h2>',
+ (r"tituloVideo\s*=\s*'([^']+)'", webpage, 'title',
+ r'<h2 class="entry-header entry-title.*?>(.*?)</h2>'),
webpage, 'title')
- date_str = self._search_regex(
+ upload_date = unified_strdate(self._search_regex(
r'<p class="date-header date-int updated"\s+title="([^"]+)">',
- webpage, 'upload date', fatal=False)
- upload_date = (None if date_str is None else unified_strdate(date_str))
+ webpage, 'upload date', default=None) or self._html_search_meta(
+ 'datePublished', webpage, 'timestamp'))
return {
'id': video_id,
diff --git a/youtube_dl/extractor/engadget.py b/youtube_dl/extractor/engadget.py
index e418070..e5e57d4 100644
--- a/youtube_dl/extractor/engadget.py
+++ b/youtube_dl/extractor/engadget.py
@@ -1,21 +1,13 @@
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
-from ..utils import (
- url_basename,
-)
class EngadgetIE(InfoExtractor):
- _VALID_URL = r'''(?x)https?://www.engadget.com/
- (?:video(?:/5min)?/(?P<id>\d+)|
- [\d/]+/.*?)
- '''
+ _VALID_URL = r'https?://www.engadget.com/video/(?P<id>\d+)'
_TEST = {
- 'url': 'http://www.engadget.com/video/5min/518153925/',
+ 'url': 'http://www.engadget.com/video/518153925/',
'md5': 'c6820d4828a5064447a4d9fc73f312c9',
'info_dict': {
'id': '518153925',
@@ -27,15 +19,4 @@ class EngadgetIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
-
- if video_id is not None:
- return self.url_result('5min:%s' % video_id)
- else:
- title = url_basename(url)
- webpage = self._download_webpage(url, title)
- ids = re.findall(r'<iframe[^>]+?playList=(\d+)', webpage)
- return {
- '_type': 'playlist',
- 'title': title,
- 'entries': [self.url_result('5min:%s' % vid) for vid in ids]
- }
+ return self.url_result('5min:%s' % video_id)
diff --git a/youtube_dl/extractor/eporner.py b/youtube_dl/extractor/eporner.py
index e006921..ac5d0fe 100644
--- a/youtube_dl/extractor/eporner.py
+++ b/youtube_dl/extractor/eporner.py
@@ -11,8 +11,8 @@ from ..utils import (
class EpornerIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?eporner\.com/hd-porn/(?P<id>\d+)/(?P<display_id>[\w-]+)'
- _TEST = {
+ _VALID_URL = r'https?://(?:www\.)?eporner\.com/hd-porn/(?P<id>\w+)/(?P<display_id>[\w-]+)'
+ _TESTS = [{
'url': 'http://www.eporner.com/hd-porn/95008/Infamous-Tiffany-Teen-Strip-Tease-Video/',
'md5': '39d486f046212d8e1b911c52ab4691f8',
'info_dict': {
@@ -23,8 +23,12 @@ class EpornerIE(InfoExtractor):
'duration': 1838,
'view_count': int,
'age_limit': 18,
- }
- }
+ },
+ }, {
+ # New (May 2016) URL layout
+ 'url': 'http://www.eporner.com/hd-porn/3YRUtzMcWn0/Star-Wars-XXX-Parody/',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
diff --git a/youtube_dl/extractor/eroprofile.py b/youtube_dl/extractor/eroprofile.py
index 7fcd015..297f8a6 100644
--- a/youtube_dl/extractor/eroprofile.py
+++ b/youtube_dl/extractor/eroprofile.py
@@ -3,7 +3,7 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..compat import compat_urllib_parse
+from ..compat import compat_urllib_parse_urlencode
from ..utils import (
ExtractorError,
unescapeHTML
@@ -43,7 +43,7 @@ class EroProfileIE(InfoExtractor):
if username is None:
return
- query = compat_urllib_parse.urlencode({
+ query = compat_urllib_parse_urlencode({
'username': username,
'password': password,
'url': 'http://www.eroprofile.com/',
diff --git a/youtube_dl/extractor/espn.py b/youtube_dl/extractor/espn.py
index db4b263..66c08be 100644
--- a/youtube_dl/extractor/espn.py
+++ b/youtube_dl/extractor/espn.py
@@ -8,6 +8,7 @@ class ESPNIE(InfoExtractor):
_VALID_URL = r'https?://espn\.go\.com/(?:[^/]+/)*(?P<id>[^/]+)'
_TESTS = [{
'url': 'http://espn.go.com/video/clip?id=10365079',
+ 'md5': '60e5d097a523e767d06479335d1bdc58',
'info_dict': {
'id': 'FkYWtmazr6Ed8xmvILvKLWjd4QvYZpzG',
'ext': 'mp4',
@@ -15,21 +16,22 @@ class ESPNIE(InfoExtractor):
'description': None,
},
'params': {
- # m3u8 download
'skip_download': True,
},
+ 'add_ie': ['OoyalaExternal'],
}, {
# intl video, from http://www.espnfc.us/video/mls-highlights/150/video/2743663/must-see-moments-best-of-the-mls-season
'url': 'http://espn.go.com/video/clip?id=2743663',
+ 'md5': 'f4ac89b59afc7e2d7dbb049523df6768',
'info_dict': {
'id': '50NDFkeTqRHB0nXBOK-RGdSG5YQPuxHg',
'ext': 'mp4',
'title': 'Must-See Moments: Best of the MLS season',
},
'params': {
- # m3u8 download
'skip_download': True,
},
+ 'add_ie': ['OoyalaExternal'],
}, {
'url': 'https://espn.go.com/video/iframe/twitter/?cms=espn&id=10365079',
'only_matching': True,
diff --git a/youtube_dl/extractor/exfm.py b/youtube_dl/extractor/exfm.py
index 0c0fe6d..09ed4f2 100644
--- a/youtube_dl/extractor/exfm.py
+++ b/youtube_dl/extractor/exfm.py
@@ -8,7 +8,7 @@ from .common import InfoExtractor
class ExfmIE(InfoExtractor):
IE_NAME = 'exfm'
IE_DESC = 'ex.fm'
- _VALID_URL = r'http://(?:www\.)?ex\.fm/song/(?P<id>[^/]+)'
+ _VALID_URL = r'https?://(?:www\.)?ex\.fm/song/(?P<id>[^/]+)'
_SOUNDCLOUD_URL = r'http://(?:www\.)?api\.soundcloud\.com/tracks/([^/]+)/stream'
_TESTS = [
{
diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
new file mode 100644
index 0000000..6fc5a18
--- /dev/null
+++ b/youtube_dl/extractor/extractors.py
@@ -0,0 +1,1064 @@
+# flake8: noqa
+from __future__ import unicode_literals
+
+from .abc import ABCIE
+from .abc7news import Abc7NewsIE
+from .abcnews import (
+ AbcNewsIE,
+ AbcNewsVideoIE,
+)
+from .academicearth import AcademicEarthCourseIE
+from .acast import (
+ ACastIE,
+ ACastChannelIE,
+)
+from .addanime import AddAnimeIE
+from .adobetv import (
+ AdobeTVIE,
+ AdobeTVShowIE,
+ AdobeTVChannelIE,
+ AdobeTVVideoIE,
+)
+from .adultswim import AdultSwimIE
+from .aenetworks import AENetworksIE
+from .afreecatv import AfreecaTVIE
+from .aftonbladet import AftonbladetIE
+from .airmozilla import AirMozillaIE
+from .aljazeera import AlJazeeraIE
+from .alphaporno import AlphaPornoIE
+from .animeondemand import AnimeOnDemandIE
+from .anitube import AnitubeIE
+from .anysex import AnySexIE
+from .aol import (
+ AolIE,
+ AolFeaturesIE,
+)
+from .allocine import AllocineIE
+from .aparat import AparatIE
+from .appleconnect import AppleConnectIE
+from .appletrailers import (
+ AppleTrailersIE,
+ AppleTrailersSectionIE,
+)
+from .archiveorg import ArchiveOrgIE
+from .ard import (
+ ARDIE,
+ ARDMediathekIE,
+)
+from .arte import (
+ ArteTvIE,
+ ArteTVPlus7IE,
+ ArteTVCreativeIE,
+ ArteTVConcertIE,
+ ArteTVInfoIE,
+ ArteTVFutureIE,
+ ArteTVCinemaIE,
+ ArteTVDDCIE,
+ ArteTVMagazineIE,
+ ArteTVEmbedIE,
+ ArteTVPlaylistIE,
+)
+from .atresplayer import AtresPlayerIE
+from .atttechchannel import ATTTechChannelIE
+from .audimedia import AudiMediaIE
+from .audioboom import AudioBoomIE
+from .audiomack import AudiomackIE, AudiomackAlbumIE
+from .azubu import AzubuIE, AzubuLiveIE
+from .baidu import BaiduVideoIE
+from .bambuser import BambuserIE, BambuserChannelIE
+from .bandcamp import BandcampIE, BandcampAlbumIE
+from .bbc import (
+ BBCCoUkIE,
+ BBCCoUkArticleIE,
+ BBCCoUkIPlayerPlaylistIE,
+ BBCCoUkPlaylistIE,
+ BBCIE,
+)
+from .beeg import BeegIE
+from .behindkink import BehindKinkIE
+from .beatportpro import BeatportProIE
+from .bet import BetIE
+from .bigflix import BigflixIE
+from .bild import BildIE
+from .bilibili import BiliBiliIE
+from .biobiochiletv import BioBioChileTVIE
+from .biqle import BIQLEIE
+from .bleacherreport import (
+ BleacherReportIE,
+ BleacherReportCMSIE,
+)
+from .blinkx import BlinkxIE
+from .bloomberg import BloombergIE
+from .bokecc import BokeCCIE
+from .bpb import BpbIE
+from .br import BRIE
+from .bravotv import BravoTVIE
+from .breakcom import BreakIE
+from .brightcove import (
+ BrightcoveLegacyIE,
+ BrightcoveNewIE,
+)
+from .buzzfeed import BuzzFeedIE
+from .byutv import BYUtvIE
+from .c56 import C56IE
+from .camdemy import (
+ CamdemyIE,
+ CamdemyFolderIE
+)
+from .camwithher import CamWithHerIE
+from .canalplus import CanalplusIE
+from .canalc2 import Canalc2IE
+from .canvas import CanvasIE
+from .carambatv import (
+ CarambaTVIE,
+ CarambaTVPageIE,
+)
+from .cbc import (
+ CBCIE,
+ CBCPlayerIE,
+)
+from .cbs import CBSIE
+from .cbslocal import CBSLocalIE
+from .cbsinteractive import CBSInteractiveIE
+from .cbsnews import (
+ CBSNewsIE,
+ CBSNewsLiveVideoIE,
+)
+from .cbssports import CBSSportsIE
+from .ccc import CCCIE
+from .cda import CDAIE
+from .ceskatelevize import CeskaTelevizeIE
+from .channel9 import Channel9IE
+from .chaturbate import ChaturbateIE
+from .chilloutzone import ChilloutzoneIE
+from .chirbit import (
+ ChirbitIE,
+ ChirbitProfileIE,
+)
+from .cinchcast import CinchcastIE
+from .cliprs import ClipRsIE
+from .clipfish import ClipfishIE
+from .cliphunter import CliphunterIE
+from .clipsyndicate import ClipsyndicateIE
+from .closertotruth import CloserToTruthIE
+from .cloudy import CloudyIE
+from .clubic import ClubicIE
+from .clyp import ClypIE
+from .cmt import CMTIE
+from .cnbc import CNBCIE
+from .cnn import (
+ CNNIE,
+ CNNBlogsIE,
+ CNNArticleIE,
+)
+from .coub import CoubIE
+from .collegerama import CollegeRamaIE
+from .comedycentral import ComedyCentralIE, ComedyCentralShowsIE
+from .comcarcoff import ComCarCoffIE
+from .commonmistakes import CommonMistakesIE, UnicodeBOMIE
+from .commonprotocols import RtmpIE
+from .condenast import CondeNastIE
+from .cracked import CrackedIE
+from .crackle import CrackleIE
+from .criterion import CriterionIE
+from .crooksandliars import CrooksAndLiarsIE
+from .crunchyroll import (
+ CrunchyrollIE,
+ CrunchyrollShowPlaylistIE
+)
+from .cspan import CSpanIE
+from .ctsnews import CtsNewsIE
+from .cultureunplugged import CultureUnpluggedIE
+from .cwtv import CWTVIE
+from .dailymail import DailyMailIE
+from .dailymotion import (
+ DailymotionIE,
+ DailymotionPlaylistIE,
+ DailymotionUserIE,
+ DailymotionCloudIE,
+)
+from .daum import (
+ DaumIE,
+ DaumClipIE,
+ DaumPlaylistIE,
+ DaumUserIE,
+)
+from .dbtv import DBTVIE
+from .dcn import (
+ DCNIE,
+ DCNVideoIE,
+ DCNLiveIE,
+ DCNSeasonIE,
+)
+from .dctp import DctpTvIE
+from .deezer import DeezerPlaylistIE
+from .democracynow import DemocracynowIE
+from .dfb import DFBIE
+from .dhm import DHMIE
+from .dotsub import DotsubIE
+from .douyutv import DouyuTVIE
+from .dplay import DPlayIE
+from .dramafever import (
+ DramaFeverIE,
+ DramaFeverSeriesIE,
+)
+from .dreisat import DreiSatIE
+from .drbonanza import DRBonanzaIE
+from .drtuber import DrTuberIE
+from .drtv import DRTVIE
+from .dvtv import DVTVIE
+from .dumpert import DumpertIE
+from .defense import DefenseGouvFrIE
+from .discovery import DiscoveryIE
+from .dispeak import DigitallySpeakingIE
+from .dropbox import DropboxIE
+from .dw import (
+ DWIE,
+ DWArticleIE,
+)
+from .eagleplatform import EaglePlatformIE
+from .ebaumsworld import EbaumsWorldIE
+from .echomsk import EchoMskIE
+from .ehow import EHowIE
+from .eighttracks import EightTracksIE
+from .einthusan import EinthusanIE
+from .eitb import EitbIE
+from .ellentv import (
+ EllenTVIE,
+ EllenTVClipsIE,
+)
+from .elpais import ElPaisIE
+from .embedly import EmbedlyIE
+from .engadget import EngadgetIE
+from .eporner import EpornerIE
+from .eroprofile import EroProfileIE
+from .escapist import EscapistIE
+from .espn import ESPNIE
+from .esri import EsriVideoIE
+from .europa import EuropaIE
+from .everyonesmixtape import EveryonesMixtapeIE
+from .exfm import ExfmIE
+from .expotv import ExpoTVIE
+from .extremetube import ExtremeTubeIE
+from .eyedotv import EyedoTVIE
+from .facebook import FacebookIE
+from .faz import FazIE
+from .fc2 import FC2IE
+from .fczenit import FczenitIE
+from .firstpost import FirstpostIE
+from .firsttv import FirstTVIE
+from .fivemin import FiveMinIE
+from .fivetv import FiveTVIE
+from .fktv import FKTVIE
+from .flickr import FlickrIE
+from .folketinget import FolketingetIE
+from .footyroom import FootyRoomIE
+from .formula1 import Formula1IE
+from .fourtube import FourTubeIE
+from .fox import FOXIE
+from .foxgay import FoxgayIE
+from .foxnews import FoxNewsIE
+from .foxsports import FoxSportsIE
+from .franceculture import (
+ FranceCultureIE,
+ FranceCultureEmissionIE,
+)
+from .franceinter import FranceInterIE
+from .francetv import (
+ PluzzIE,
+ FranceTvInfoIE,
+ FranceTVIE,
+ GenerationQuoiIE,
+ CultureboxIE,
+)
+from .freesound import FreesoundIE
+from .freespeech import FreespeechIE
+from .freevideo import FreeVideoIE
+from .funimation import FunimationIE
+from .funnyordie import FunnyOrDieIE
+from .gameinformer import GameInformerIE
+from .gamekings import GamekingsIE
+from .gameone import (
+ GameOneIE,
+ GameOnePlaylistIE,
+)
+from .gamersyde import GamersydeIE
+from .gamespot import GameSpotIE
+from .gamestar import GameStarIE
+from .gazeta import GazetaIE
+from .gdcvault import GDCVaultIE
+from .generic import GenericIE
+from .gfycat import GfycatIE
+from .giantbomb import GiantBombIE
+from .giga import GigaIE
+from .glide import GlideIE
+from .globo import (
+ GloboIE,
+ GloboArticleIE,
+)
+from .godtube import GodTubeIE
+from .godtv import GodTVIE
+from .goldenmoustache import GoldenMoustacheIE
+from .golem import GolemIE
+from .googledrive import GoogleDriveIE
+from .googleplus import GooglePlusIE
+from .googlesearch import GoogleSearchIE
+from .goshgay import GoshgayIE
+from .gputechconf import GPUTechConfIE
+from .groupon import GrouponIE
+from .hark import HarkIE
+from .hbo import HBOIE
+from .hearthisat import HearThisAtIE
+from .heise import HeiseIE
+from .hellporno import HellPornoIE
+from .helsinki import HelsinkiIE
+from .hentaistigma import HentaiStigmaIE
+from .historicfilms import HistoricFilmsIE
+from .hitbox import HitboxIE, HitboxLiveIE
+from .hornbunny import HornBunnyIE
+from .hotnewhiphop import HotNewHipHopIE
+from .hotstar import HotStarIE
+from .howcast import HowcastIE
+from .howstuffworks import HowStuffWorksIE
+from .huffpost import HuffPostIE
+from .hypem import HypemIE
+from .iconosquare import IconosquareIE
+from .ign import (
+ IGNIE,
+ OneUPIE,
+ PCMagIE,
+)
+from .imdb import (
+ ImdbIE,
+ ImdbListIE
+)
+from .imgur import (
+ ImgurIE,
+ ImgurAlbumIE,
+)
+from .ina import InaIE
+from .indavideo import (
+ IndavideoIE,
+ IndavideoEmbedIE,
+)
+from .infoq import InfoQIE
+from .instagram import InstagramIE, InstagramUserIE
+from .internetvideoarchive import InternetVideoArchiveIE
+from .iprima import IPrimaIE
+from .iqiyi import IqiyiIE
+from .ir90tv import Ir90TvIE
+from .ivi import (
+ IviIE,
+ IviCompilationIE
+)
+from .ivideon import IvideonIE
+from .izlesene import IzleseneIE
+from .jeuxvideo import JeuxVideoIE
+from .jove import JoveIE
+from .jwplatform import JWPlatformIE
+from .jpopsukitv import JpopsukiIE
+from .kaltura import KalturaIE
+from .kanalplay import KanalPlayIE
+from .kankan import KankanIE
+from .karaoketv import KaraoketvIE
+from .karrierevideos import KarriereVideosIE
+from .keezmovies import KeezMoviesIE
+from .khanacademy import KhanAcademyIE
+from .kickstarter import KickStarterIE
+from .keek import KeekIE
+from .konserthusetplay import KonserthusetPlayIE
+from .kontrtube import KontrTubeIE
+from .krasview import KrasViewIE
+from .ku6 import Ku6IE
+from .kusi import KUSIIE
+from .kuwo import (
+ KuwoIE,
+ KuwoAlbumIE,
+ KuwoChartIE,
+ KuwoSingerIE,
+ KuwoCategoryIE,
+ KuwoMvIE,
+)
+from .la7 import LA7IE
+from .laola1tv import Laola1TvIE
+from .learnr import LearnrIE
+from .lecture2go import Lecture2GoIE
+from .lemonde import LemondeIE
+from .leeco import (
+ LeIE,
+ LePlaylistIE,
+ LetvCloudIE,
+)
+from .libraryofcongress import LibraryOfCongressIE
+from .libsyn import LibsynIE
+from .lifenews import (
+ LifeNewsIE,
+ LifeEmbedIE,
+)
+from .limelight import (
+ LimelightMediaIE,
+ LimelightChannelIE,
+ LimelightChannelListIE,
+)
+from .litv import LiTVIE
+from .liveleak import LiveLeakIE
+from .livestream import (
+ LivestreamIE,
+ LivestreamOriginalIE,
+ LivestreamShortenerIE,
+)
+from .lnkgo import LnkGoIE
+from .localnews8 import LocalNews8IE
+from .lovehomeporn import LoveHomePornIE
+from .lrt import LRTIE
+from .lynda import (
+ LyndaIE,
+ LyndaCourseIE
+)
+from .m6 import M6IE
+from .macgamestore import MacGameStoreIE
+from .mailru import MailRuIE
+from .makerschannel import MakersChannelIE
+from .makertv import MakerTVIE
+from .matchtv import MatchTVIE
+from .mdr import MDRIE
+from .metacafe import MetacafeIE
+from .metacritic import MetacriticIE
+from .mgoon import MgoonIE
+from .mgtv import MGTVIE
+from .microsoftvirtualacademy import (
+ MicrosoftVirtualAcademyIE,
+ MicrosoftVirtualAcademyCourseIE,
+)
+from .minhateca import MinhatecaIE
+from .ministrygrid import MinistryGridIE
+from .minoto import MinotoIE
+from .miomio import MioMioIE
+from .mit import TechTVMITIE, MITIE, OCWMITIE
+from .mitele import MiTeleIE
+from .mixcloud import (
+ MixcloudIE,
+ MixcloudUserIE,
+ MixcloudPlaylistIE,
+ MixcloudStreamIE,
+)
+from .mlb import MLBIE
+from .mnet import MnetIE
+from .mpora import MporaIE
+from .moevideo import MoeVideoIE
+from .mofosex import MofosexIE
+from .mojvideo import MojvideoIE
+from .moniker import MonikerIE
+from .morningstar import MorningstarIE
+from .motherless import MotherlessIE
+from .motorsport import MotorsportIE
+from .movieclips import MovieClipsIE
+from .moviezine import MoviezineIE
+from .mtv import (
+ MTVIE,
+ MTVServicesEmbeddedIE,
+ MTVIggyIE,
+ MTVDEIE,
+)
+from .muenchentv import MuenchenTVIE
+from .musicplayon import MusicPlayOnIE
+from .mwave import MwaveIE, MwaveMeetGreetIE
+from .myspace import MySpaceIE, MySpaceAlbumIE
+from .myspass import MySpassIE
+from .myvi import MyviIE
+from .myvideo import MyVideoIE
+from .myvidster import MyVidsterIE
+from .nationalgeographic import (
+ NationalGeographicIE,
+ NationalGeographicChannelIE,
+)
+from .naver import NaverIE
+from .nba import NBAIE
+from .nbc import (
+ CSNNEIE,
+ NBCIE,
+ NBCNewsIE,
+ NBCSportsIE,
+ NBCSportsVPlayerIE,
+)
+from .ndr import (
+ NDRIE,
+ NJoyIE,
+ NDREmbedBaseIE,
+ NDREmbedIE,
+ NJoyEmbedIE,
+)
+from .ndtv import NDTVIE
+from .netzkino import NetzkinoIE
+from .nerdcubed import NerdCubedFeedIE
+from .neteasemusic import (
+ NetEaseMusicIE,
+ NetEaseMusicAlbumIE,
+ NetEaseMusicSingerIE,
+ NetEaseMusicListIE,
+ NetEaseMusicMvIE,
+ NetEaseMusicProgramIE,
+ NetEaseMusicDjRadioIE,
+)
+from .newgrounds import NewgroundsIE
+from .newstube import NewstubeIE
+from .nextmedia import (
+ NextMediaIE,
+ NextMediaActionNewsIE,
+ AppleDailyIE,
+)
+from .nextmovie import NextMovieIE
+from .nfb import NFBIE
+from .nfl import NFLIE
+from .nhl import (
+ NHLVideocenterIE,
+ NHLNewsIE,
+ NHLVideocenterCategoryIE,
+ NHLIE,
+)
+from .nick import (
+ NickIE,
+ NickDeIE,
+)
+from .niconico import NiconicoIE, NiconicoPlaylistIE
+from .ninegag import NineGagIE
+from .noco import NocoIE
+from .normalboots import NormalbootsIE
+from .nosvideo import NosVideoIE
+from .nova import NovaIE
+from .novamov import (
+ AuroraVidIE,
+ CloudTimeIE,
+ NowVideoIE,
+ VideoWeedIE,
+ WholeCloudIE,
+)
+from .nowness import (
+ NownessIE,
+ NownessPlaylistIE,
+ NownessSeriesIE,
+)
+from .nowtv import (
+ NowTVIE,
+ NowTVListIE,
+)
+from .noz import NozIE
+from .npo import (
+ NPOIE,
+ NPOLiveIE,
+ NPORadioIE,
+ NPORadioFragmentIE,
+ SchoolTVIE,
+ VPROIE,
+ WNLIE
+)
+from .npr import NprIE
+from .nrk import (
+ NRKIE,
+ NRKPlaylistIE,
+ NRKSkoleIE,
+ NRKTVIE,
+)
+from .ntvde import NTVDeIE
+from .ntvru import NTVRuIE
+from .nytimes import (
+ NYTimesIE,
+ NYTimesArticleIE,
+)
+from .nuvid import NuvidIE
+from .odnoklassniki import OdnoklassnikiIE
+from .oktoberfesttv import OktoberfestTVIE
+from .onionstudios import OnionStudiosIE
+from .ooyala import (
+ OoyalaIE,
+ OoyalaExternalIE,
+)
+from .openload import OpenloadIE
+from .ora import OraTVIE
+from .orf import (
+ ORFTVthekIE,
+ ORFOE1IE,
+ ORFFM4IE,
+ ORFIPTVIE,
+)
+from .pandoratv import PandoraTVIE
+from .parliamentliveuk import ParliamentLiveUKIE
+from .patreon import PatreonIE
+from .pbs import PBSIE
+from .people import PeopleIE
+from .periscope import (
+ PeriscopeIE,
+ PeriscopeUserIE,
+)
+from .philharmoniedeparis import PhilharmonieDeParisIE
+from .phoenix import PhoenixIE
+from .photobucket import PhotobucketIE
+from .pinkbike import PinkbikeIE
+from .pladform import PladformIE
+from .played import PlayedIE
+from .playfm import PlayFMIE
+from .plays import PlaysTVIE
+from .playtvak import PlaytvakIE
+from .playvid import PlayvidIE
+from .playwire import PlaywireIE
+from .pluralsight import (
+ PluralsightIE,
+ PluralsightCourseIE,
+)
+from .podomatic import PodomaticIE
+from .porn91 import Porn91IE
+from .pornhd import PornHdIE
+from .pornhub import (
+ PornHubIE,
+ PornHubPlaylistIE,
+ PornHubUserVideosIE,
+)
+from .pornotube import PornotubeIE
+from .pornovoisines import PornoVoisinesIE
+from .pornoxo import PornoXOIE
+from .presstv import PressTVIE
+from .primesharetv import PrimeShareTVIE
+from .promptfile import PromptFileIE
+from .prosiebensat1 import ProSiebenSat1IE
+from .puls4 import Puls4IE
+from .pyvideo import PyvideoIE
+from .qqmusic import (
+ QQMusicIE,
+ QQMusicSingerIE,
+ QQMusicAlbumIE,
+ QQMusicToplistIE,
+ QQMusicPlaylistIE,
+)
+from .r7 import (
+ R7IE,
+ R7ArticleIE,
+)
+from .radiocanada import (
+ RadioCanadaIE,
+ RadioCanadaAudioVideoIE,
+)
+from .radiode import RadioDeIE
+from .radiojavan import RadioJavanIE
+from .radiobremen import RadioBremenIE
+from .radiofrance import RadioFranceIE
+from .rai import (
+ RaiTVIE,
+ RaiIE,
+)
+from .rbmaradio import RBMARadioIE
+from .rds import RDSIE
+from .redtube import RedTubeIE
+from .regiotv import RegioTVIE
+from .restudy import RestudyIE
+from .reuters import ReutersIE
+from .reverbnation import ReverbNationIE
+from .revision3 import (
+ Revision3EmbedIE,
+ Revision3IE,
+)
+from .rice import RICEIE
+from .ringtv import RingTVIE
+from .ro220 import Ro220IE
+from .rockstargames import RockstarGamesIE
+from .rottentomatoes import RottenTomatoesIE
+from .roxwel import RoxwelIE
+from .rtbf import RTBFIE
+from .rte import RteIE, RteRadioIE
+from .rtlnl import RtlNlIE
+from .rtl2 import RTL2IE
+from .rtp import RTPIE
+from .rts import RTSIE
+from .rtve import RTVEALaCartaIE, RTVELiveIE, RTVEInfantilIE
+from .rtvnh import RTVNHIE
+from .ruhd import RUHDIE
+from .ruleporn import RulePornIE
+from .rutube import (
+ RutubeIE,
+ RutubeChannelIE,
+ RutubeEmbedIE,
+ RutubeMovieIE,
+ RutubePersonIE,
+)
+from .rutv import RUTVIE
+from .ruutu import RuutuIE
+from .sandia import SandiaIE
+from .safari import (
+ SafariIE,
+ SafariApiIE,
+ SafariCourseIE,
+)
+from .sapo import SapoIE
+from .savefrom import SaveFromIE
+from .sbs import SBSIE
+from .scivee import SciVeeIE
+from .screencast import ScreencastIE
+from .screencastomatic import ScreencastOMaticIE
+from .screenjunkies import ScreenJunkiesIE
+from .screenwavemedia import ScreenwaveMediaIE, TeamFourIE
+from .seeker import SeekerIE
+from .senateisvp import SenateISVPIE
+from .sendtonews import SendtoNewsIE
+from .servingsys import ServingSysIE
+from .sexu import SexuIE
+from .shahid import ShahidIE
+from .shared import SharedIE
+from .sharesix import ShareSixIE
+from .sina import SinaIE
+from .skynewsarabia import (
+ SkyNewsArabiaIE,
+ SkyNewsArabiaArticleIE,
+)
+from .slideshare import SlideshareIE
+from .slutload import SlutloadIE
+from .smotri import (
+ SmotriIE,
+ SmotriCommunityIE,
+ SmotriUserIE,
+ SmotriBroadcastIE,
+)
+from .snotr import SnotrIE
+from .sohu import SohuIE
+from .soundcloud import (
+ SoundcloudIE,
+ SoundcloudSetIE,
+ SoundcloudUserIE,
+ SoundcloudPlaylistIE,
+ SoundcloudSearchIE
+)
+from .soundgasm import (
+ SoundgasmIE,
+ SoundgasmProfileIE
+)
+from .southpark import (
+ SouthParkIE,
+ SouthParkDeIE,
+ SouthParkDkIE,
+ SouthParkEsIE,
+ SouthParkNlIE
+)
+from .spankbang import SpankBangIE
+from .spankwire import SpankwireIE
+from .spiegel import SpiegelIE, SpiegelArticleIE
+from .spiegeltv import SpiegeltvIE
+from .spike import SpikeIE
+from .stitcher import StitcherIE
+from .sport5 import Sport5IE
+from .sportbox import (
+ SportBoxIE,
+ SportBoxEmbedIE,
+)
+from .sportdeutschland import SportDeutschlandIE
+from .sportschau import SportschauIE
+from .srgssr import (
+ SRGSSRIE,
+ SRGSSRPlayIE,
+)
+from .srmediathek import SRMediathekIE
+from .ssa import SSAIE
+from .stanfordoc import StanfordOpenClassroomIE
+from .steam import SteamIE
+from .streamcloud import StreamcloudIE
+from .streamcz import StreamCZIE
+from .streetvoice import StreetVoiceIE
+from .sunporno import SunPornoIE
+from .svt import (
+ SVTIE,
+ SVTPlayIE,
+)
+from .swrmediathek import SWRMediathekIE
+from .syfy import SyfyIE
+from .sztvhu import SztvHuIE
+from .tagesschau import (
+ TagesschauPlayerIE,
+ TagesschauIE,
+)
+from .tapely import TapelyIE
+from .tass import TassIE
+from .tdslifeway import TDSLifewayIE
+from .teachertube import (
+ TeacherTubeIE,
+ TeacherTubeUserIE,
+)
+from .teachingchannel import TeachingChannelIE
+from .teamcoco import TeamcocoIE
+from .techtalks import TechTalksIE
+from .ted import TEDIE
+from .tele13 import Tele13IE
+from .telebruxelles import TeleBruxellesIE
+from .telecinco import TelecincoIE
+from .telegraaf import TelegraafIE
+from .telemb import TeleMBIE
+from .teletask import TeleTaskIE
+from .telewebion import TelewebionIE
+from .testurl import TestURLIE
+from .tf1 import TF1IE
+from .theintercept import TheInterceptIE
+from .theplatform import (
+ ThePlatformIE,
+ ThePlatformFeedIE,
+)
+from .thescene import TheSceneIE
+from .thesixtyone import TheSixtyOneIE
+from .thestar import TheStarIE
+from .thisamericanlife import ThisAmericanLifeIE
+from .thisav import ThisAVIE
+from .threeqsdn import ThreeQSDNIE
+from .tinypic import TinyPicIE
+from .tlc import TlcDeIE
+from .tmz import (
+ TMZIE,
+ TMZArticleIE,
+)
+from .tnaflix import (
+ TNAFlixNetworkEmbedIE,
+ TNAFlixIE,
+ EMPFlixIE,
+ MovieFapIE,
+)
+from .toggle import ToggleIE
+from .thvideo import (
+ THVideoIE,
+ THVideoPlaylistIE
+)
+from .toutv import TouTvIE
+from .toypics import ToypicsUserIE, ToypicsIE
+from .traileraddict import TrailerAddictIE
+from .trilulilu import TriluliluIE
+from .trollvids import TrollvidsIE
+from .trutube import TruTubeIE
+from .tube8 import Tube8IE
+from .tubitv import TubiTvIE
+from .tudou import (
+ TudouIE,
+ TudouPlaylistIE,
+ TudouAlbumIE,
+)
+from .tumblr import TumblrIE
+from .tunein import (
+ TuneInClipIE,
+ TuneInStationIE,
+ TuneInProgramIE,
+ TuneInTopicIE,
+ TuneInShortenerIE,
+)
+from .turbo import TurboIE
+from .tutv import TutvIE
+from .tv2 import (
+ TV2IE,
+ TV2ArticleIE,
+)
+from .tv3 import TV3IE
+from .tv4 import TV4IE
+from .tvc import (
+ TVCIE,
+ TVCArticleIE,
+)
+from .tvigle import TvigleIE
+from .tvland import TVLandIE
+from .tvp import (
+ TVPIE,
+ TVPSeriesIE,
+)
+from .tvplay import TVPlayIE
+from .tweakers import TweakersIE
+from .twentyfourvideo import TwentyFourVideoIE
+from .twentymin import TwentyMinutenIE
+from .twentytwotracks import (
+ TwentyTwoTracksIE,
+ TwentyTwoTracksGenreIE
+)
+from .twitch import (
+ TwitchVideoIE,
+ TwitchChapterIE,
+ TwitchVodIE,
+ TwitchProfileIE,
+ TwitchPastBroadcastsIE,
+ TwitchStreamIE,
+ TwitchClipsIE,
+)
+from .twitter import (
+ TwitterCardIE,
+ TwitterIE,
+ TwitterAmplifyIE,
+)
+from .udemy import (
+ UdemyIE,
+ UdemyCourseIE
+)
+from .udn import UDNEmbedIE
+from .digiteka import DigitekaIE
+from .unistra import UnistraIE
+from .urort import UrortIE
+from .usatoday import USATodayIE
+from .ustream import UstreamIE, UstreamChannelIE
+from .ustudio import (
+ UstudioIE,
+ UstudioEmbedIE,
+)
+from .varzesh3 import Varzesh3IE
+from .vbox7 import Vbox7IE
+from .veehd import VeeHDIE
+from .veoh import VeohIE
+from .vessel import VesselIE
+from .vesti import VestiIE
+from .vevo import (
+ VevoIE,
+ VevoPlaylistIE,
+)
+from .vgtv import (
+ BTArticleIE,
+ BTVestlendingenIE,
+ VGTVIE,
+)
+from .vh1 import VH1IE
+from .vice import (
+ ViceIE,
+ ViceShowIE,
+)
+from .viddler import ViddlerIE
+from .videodetective import VideoDetectiveIE
+from .videofyme import VideofyMeIE
+from .videomega import VideoMegaIE
+from .videomore import (
+ VideomoreIE,
+ VideomoreVideoIE,
+ VideomoreSeasonIE,
+)
+from .videopremium import VideoPremiumIE
+from .videott import VideoTtIE
+from .vidio import VidioIE
+from .vidme import (
+ VidmeIE,
+ VidmeUserIE,
+ VidmeUserLikesIE,
+)
+from .vidzi import VidziIE
+from .vier import VierIE, VierVideosIE
+from .viewlift import (
+ ViewLiftIE,
+ ViewLiftEmbedIE,
+)
+from .viewster import ViewsterIE
+from .viidea import ViideaIE
+from .vimeo import (
+ VimeoIE,
+ VimeoAlbumIE,
+ VimeoChannelIE,
+ VimeoGroupsIE,
+ VimeoLikesIE,
+ VimeoOndemandIE,
+ VimeoReviewIE,
+ VimeoUserIE,
+ VimeoWatchLaterIE,
+)
+from .vimple import VimpleIE
+from .vine import (
+ VineIE,
+ VineUserIE,
+)
+from .viki import (
+ VikiIE,
+ VikiChannelIE,
+)
+from .vk import (
+ VKIE,
+ VKUserVideosIE,
+)
+from .vlive import VLiveIE
+from .vodlocker import VodlockerIE
+from .voicerepublic import VoiceRepublicIE
+from .voxmedia import VoxMediaIE
+from .vporn import VpornIE
+from .vrt import VRTIE
+from .vube import VubeIE
+from .vuclip import VuClipIE
+from .walla import WallaIE
+from .washingtonpost import (
+ WashingtonPostIE,
+ WashingtonPostArticleIE,
+)
+from .wat import WatIE
+from .watchindianporn import WatchIndianPornIE
+from .wdr import (
+ WDRIE,
+ WDRMobileIE,
+)
+from .webofstories import (
+ WebOfStoriesIE,
+ WebOfStoriesPlaylistIE,
+)
+from .weiqitv import WeiqiTVIE
+from .wimp import WimpIE
+from .wistia import WistiaIE
+from .worldstarhiphop import WorldStarHipHopIE
+from .wrzuta import (
+ WrzutaIE,
+ WrzutaPlaylistIE,
+)
+from .wsj import WSJIE
+from .xbef import XBefIE
+from .xboxclips import XboxClipsIE
+from .xfileshare import XFileShareIE
+from .xhamster import (
+ XHamsterIE,
+ XHamsterEmbedIE,
+)
+from .xiami import (
+ XiamiSongIE,
+ XiamiAlbumIE,
+ XiamiArtistIE,
+ XiamiCollectionIE
+)
+from .xminus import XMinusIE
+from .xnxx import XNXXIE
+from .xstream import XstreamIE
+from .xtube import XTubeUserIE, XTubeIE
+from .xuite import XuiteIE
+from .xvideos import XVideosIE
+from .xxxymovies import XXXYMoviesIE
+from .yahoo import (
+ YahooIE,
+ YahooSearchIE,
+)
+from .yam import YamIE
+from .yandexmusic import (
+ YandexMusicTrackIE,
+ YandexMusicAlbumIE,
+ YandexMusicPlaylistIE,
+)
+from .yesjapan import YesJapanIE
+from .yinyuetai import YinYueTaiIE
+from .ynet import YnetIE
+from .youjizz import YouJizzIE
+from .youku import (
+ YoukuIE,
+ YoukuShowIE,
+)
+from .youporn import YouPornIE
+from .yourupload import YourUploadIE
+from .youtube import (
+ YoutubeIE,
+ YoutubeChannelIE,
+ YoutubeFavouritesIE,
+ YoutubeHistoryIE,
+ YoutubeLiveIE,
+ YoutubePlaylistIE,
+ YoutubePlaylistsIE,
+ YoutubeRecommendedIE,
+ YoutubeSearchDateIE,
+ YoutubeSearchIE,
+ YoutubeSearchURLIE,
+ YoutubeShowIE,
+ YoutubeSubscriptionsIE,
+ YoutubeTruncatedIDIE,
+ YoutubeTruncatedURLIE,
+ YoutubeUserIE,
+ YoutubeWatchLaterIE,
+)
+from .zapiks import ZapiksIE
+from .zdf import ZDFIE, ZDFChannelIE
+from .zingmp3 import (
+ ZingMp3SongIE,
+ ZingMp3AlbumIE,
+)
+from .zippcast import ZippCastIE
diff --git a/youtube_dl/extractor/eyedotv.py b/youtube_dl/extractor/eyedotv.py
new file mode 100644
index 0000000..2f30351
--- /dev/null
+++ b/youtube_dl/extractor/eyedotv.py
@@ -0,0 +1,64 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ xpath_text,
+ parse_duration,
+ ExtractorError,
+)
+
+
+class EyedoTVIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?eyedo\.tv/[^/]+/(?:#!/)?Live/Detail/(?P<id>[0-9]+)'
+ _TEST = {
+ 'url': 'https://www.eyedo.tv/en-US/#!/Live/Detail/16301',
+ 'md5': 'ba14f17995cdfc20c36ba40e21bf73f7',
+ 'info_dict': {
+ 'id': '16301',
+ 'ext': 'mp4',
+ 'title': 'Journée du conseil scientifique de l\'Afnic 2015',
+ 'description': 'md5:4abe07293b2f73efc6e1c37028d58c98',
+ 'uploader': 'Afnic Live',
+ 'uploader_id': '8023',
+ }
+ }
+ _ROOT_URL = 'http://live.eyedo.net:1935/'
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ video_data = self._download_xml('http://eyedo.tv/api/live/GetLive/%s' % video_id, video_id)
+
+ def _add_ns(path):
+ return self._xpath_ns(path, 'http://schemas.datacontract.org/2004/07/EyeDo.Core.Implementation.Web.ViewModels.Api')
+
+ title = xpath_text(video_data, _add_ns('Titre'), 'title', True)
+ state_live_code = xpath_text(video_data, _add_ns('StateLiveCode'), 'title', True)
+ if state_live_code == 'avenir':
+ raise ExtractorError(
+ '%s said: We\'re sorry, but this video is not yet available.' % self.IE_NAME,
+ expected=True)
+
+ is_live = state_live_code == 'live'
+ m3u8_url = None
+ # http://eyedo.tv/Content/Html5/Scripts/html5view.js
+ if is_live:
+ if xpath_text(video_data, 'Cdn') == 'true':
+ m3u8_url = 'http://rrr.sz.xlcdn.com/?account=eyedo&file=A%s&type=live&service=wowza&protocol=http&output=playlist.m3u8' % video_id
+ else:
+ m3u8_url = self._ROOT_URL + 'w/%s/eyedo_720p/playlist.m3u8' % video_id
+ else:
+ m3u8_url = self._ROOT_URL + 'replay-w/%s/mp4:%s.mp4/playlist.m3u8' % (video_id, video_id)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'formats': self._extract_m3u8_formats(
+ m3u8_url, video_id, 'mp4', 'm3u8' if is_live else 'm3u8_native'),
+ 'description': xpath_text(video_data, _add_ns('Description')),
+ 'duration': parse_duration(xpath_text(video_data, _add_ns('Duration'))),
+ 'uploader': xpath_text(video_data, _add_ns('Createur')),
+ 'uploader_id': xpath_text(video_data, _add_ns('CreateurId')),
+ 'chapter': xpath_text(video_data, _add_ns('ChapitreTitre')),
+ 'chapter_id': xpath_text(video_data, _add_ns('ChapitreId')),
+ }
diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py
index 0a9a5ca..9b87b37 100644
--- a/youtube_dl/extractor/facebook.py
+++ b/youtube_dl/extractor/facebook.py
@@ -34,9 +34,12 @@ class FacebookIE(InfoExtractor):
video/video\.php|
photo\.php|
video\.php|
- video/embed
- )\?(?:.*?)(?:v|video_id)=|
- [^/]+/videos/(?:[^/]+/)?
+ video/embed|
+ story\.php
+ )\?(?:.*?)(?:v|video_id|story_fbid)=|
+ [^/]+/videos/(?:[^/]+/)?|
+ [^/]+/posts/|
+ groups/[^/]+/permalink/
)|
facebook:
)
@@ -49,6 +52,8 @@ class FacebookIE(InfoExtractor):
_CHROME_USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.97 Safari/537.36'
+ _VIDEO_PAGE_TEMPLATE = 'https://www.facebook.com/video/video.php?v=%s'
+
_TESTS = [{
'url': 'https://www.facebook.com/video.php?v=637842556329505&fref=nf',
'md5': '6a40d33c0eccbb1af76cf0485a052659',
@@ -81,6 +86,33 @@ class FacebookIE(InfoExtractor):
'uploader': 'Demy de Zeeuw',
},
}, {
+ 'url': 'https://www.facebook.com/maxlayn/posts/10153807558977570',
+ 'md5': '037b1fa7f3c2d02b7a0d7bc16031ecc6',
+ 'info_dict': {
+ 'id': '544765982287235',
+ 'ext': 'mp4',
+ 'title': '"What are you doing running in the snow?"',
+ 'uploader': 'FailArmy',
+ }
+ }, {
+ 'url': 'https://m.facebook.com/story.php?story_fbid=1035862816472149&id=116132035111903',
+ 'md5': '1deb90b6ac27f7efcf6d747c8a27f5e3',
+ 'info_dict': {
+ 'id': '1035862816472149',
+ 'ext': 'mp4',
+ 'title': 'What the Flock Is Going On In New Zealand Credit: ViralHog',
+ 'uploader': 'S. Saint',
+ },
+ }, {
+ 'note': 'swf params escaped',
+ 'url': 'https://www.facebook.com/barackobama/posts/10153664894881749',
+ 'md5': '97ba073838964d12c70566e0085c2b91',
+ 'info_dict': {
+ 'id': '10153664894881749',
+ 'ext': 'mp4',
+ 'title': 'Facebook video #10153664894881749',
+ },
+ }, {
'url': 'https://www.facebook.com/video.php?v=10204634152394104',
'only_matching': True,
}, {
@@ -92,6 +124,9 @@ class FacebookIE(InfoExtractor):
}, {
'url': 'facebook:544765982287235',
'only_matching': True,
+ }, {
+ 'url': 'https://www.facebook.com/groups/164828000315060/permalink/764967300301124/',
+ 'only_matching': True,
}]
def _login(self):
@@ -160,19 +195,19 @@ class FacebookIE(InfoExtractor):
def _real_initialize(self):
self._login()
- def _real_extract(self, url):
- video_id = self._match_id(url)
- req = sanitized_Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
+ def _extract_from_url(self, url, video_id, fatal_if_no_video=True):
+ req = sanitized_Request(url)
req.add_header('User-Agent', self._CHROME_USER_AGENT)
webpage = self._download_webpage(req, video_id)
video_data = None
- BEFORE = '{swf.addParam(param[0], param[1]);});\n'
+ BEFORE = '{swf.addParam(param[0], param[1]);});'
AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
- m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
+ m = re.search(re.escape(BEFORE) + '(?:\n|\\\\n)(.*?)' + re.escape(AFTER), webpage)
if m:
- data = dict(json.loads(m.group(1)))
+ swf_params = m.group(1).replace('\\\\', '\\').replace('\\"', '"')
+ data = dict(json.loads(swf_params))
params_raw = compat_urllib_parse_unquote(data['params'])
video_data = json.loads(params_raw)['video_data']
@@ -185,13 +220,15 @@ class FacebookIE(InfoExtractor):
if not video_data:
server_js_data = self._parse_json(self._search_regex(
- r'handleServerJS\(({.+})\);', webpage, 'server js data'), video_id)
+ r'handleServerJS\(({.+})\);', webpage, 'server js data', default='{}'), video_id)
for item in server_js_data.get('instances', []):
if item[1][0] == 'VideoConfig':
video_data = video_data_list2dict(item[2][0]['videoData'])
break
if not video_data:
+ if not fatal_if_no_video:
+ return webpage, False
m_msg = re.search(r'class="[^"]*uiInterstitialContent[^"]*"><div>(.*?)</div>', webpage)
if m_msg is not None:
raise ExtractorError(
@@ -202,16 +239,21 @@ class FacebookIE(InfoExtractor):
formats = []
for format_id, f in video_data.items():
+ if f and isinstance(f, dict):
+ f = [f]
if not f or not isinstance(f, list):
continue
for quality in ('sd', 'hd'):
for src_type in ('src', 'src_no_ratelimit'):
src = f[0].get('%s_%s' % (quality, src_type))
if src:
+ preference = -10 if format_id == 'progressive' else 0
+ if quality == 'hd':
+ preference += 5
formats.append({
'format_id': '%s_%s_%s' % (format_id, quality, src_type),
'url': src,
- 'preference': -10 if format_id == 'progressive' else 0,
+ 'preference': preference,
})
dash_manifest = f[0].get('dash_manifest')
if dash_manifest:
@@ -234,39 +276,36 @@ class FacebookIE(InfoExtractor):
video_title = 'Facebook video #%s' % video_id
uploader = clean_html(get_element_by_id('fbPhotoPageAuthorName', webpage))
- return {
+ info_dict = {
'id': video_id,
'title': video_title,
'formats': formats,
'uploader': uploader,
}
-
-class FacebookPostIE(InfoExtractor):
- IE_NAME = 'facebook:post'
- _VALID_URL = r'https?://(?:\w+\.)?facebook\.com/[^/]+/posts/(?P<id>\d+)'
- _TEST = {
- 'url': 'https://www.facebook.com/maxlayn/posts/10153807558977570',
- 'md5': '037b1fa7f3c2d02b7a0d7bc16031ecc6',
- 'info_dict': {
- 'id': '544765982287235',
- 'ext': 'mp4',
- 'title': '"What are you doing running in the snow?"',
- 'uploader': 'FailArmy',
- }
- }
+ return webpage, info_dict
def _real_extract(self, url):
- post_id = self._match_id(url)
+ video_id = self._match_id(url)
+
+ real_url = self._VIDEO_PAGE_TEMPLATE % video_id if url.startswith('facebook:') else url
+ webpage, info_dict = self._extract_from_url(real_url, video_id, fatal_if_no_video=False)
- webpage = self._download_webpage(url, post_id)
+ if info_dict:
+ return info_dict
- entries = [
- self.url_result('facebook:%s' % video_id, FacebookIE.ie_key())
- for video_id in self._parse_json(
- self._search_regex(
- r'(["\'])video_ids\1\s*:\s*(?P<ids>\[.+?\])',
- webpage, 'video ids', group='ids'),
- post_id)]
+ if '/posts/' in url:
+ entries = [
+ self.url_result('facebook:%s' % vid, FacebookIE.ie_key())
+ for vid in self._parse_json(
+ self._search_regex(
+ r'(["\'])video_ids\1\s*:\s*(?P<ids>\[.+?\])',
+ webpage, 'video ids', group='ids'),
+ video_id)]
- return self.playlist_result(entries, post_id)
+ return self.playlist_result(entries, video_id)
+ else:
+ _, info_dict = self._extract_from_url(
+ self._VIDEO_PAGE_TEMPLATE % video_id,
+ video_id, fatal_if_no_video=True)
+ return info_dict
diff --git a/youtube_dl/extractor/fc2.py b/youtube_dl/extractor/fc2.py
index 9580f5c..c7d69ff 100644
--- a/youtube_dl/extractor/fc2.py
+++ b/youtube_dl/extractor/fc2.py
@@ -5,19 +5,18 @@ import hashlib
from .common import InfoExtractor
from ..compat import (
- compat_urllib_parse,
compat_urllib_request,
compat_urlparse,
)
from ..utils import (
- encode_dict,
ExtractorError,
sanitized_Request,
+ urlencode_postdata,
)
class FC2IE(InfoExtractor):
- _VALID_URL = r'^http://video\.fc2\.com/(?:[^/]+/)*content/(?P<id>[^/]+)'
+ _VALID_URL = r'^https?://video\.fc2\.com/(?:[^/]+/)*content/(?P<id>[^/]+)'
IE_NAME = 'fc2'
_NETRC_MACHINE = 'fc2'
_TESTS = [{
@@ -57,7 +56,7 @@ class FC2IE(InfoExtractor):
'Submit': ' Login ',
}
- login_data = compat_urllib_parse.urlencode(encode_dict(login_form_strs)).encode('utf-8')
+ login_data = urlencode_postdata(login_form_strs)
request = sanitized_Request(
'https://secure.id.fc2.com/index.php?mode=login&switch_language=en', login_data)
diff --git a/youtube_dl/extractor/fczenit.py b/youtube_dl/extractor/fczenit.py
index f1f150e..8d1010b 100644
--- a/youtube_dl/extractor/fczenit.py
+++ b/youtube_dl/extractor/fczenit.py
@@ -1,20 +1,19 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
+from ..compat import compat_urlparse
class FczenitIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?fc-zenit\.ru/video/gl(?P<id>[0-9]+)'
+ _VALID_URL = r'https?://(?:www\.)?fc-zenit\.ru/video/(?P<id>[0-9]+)'
_TEST = {
- 'url': 'http://fc-zenit.ru/video/gl6785/',
- 'md5': '458bacc24549173fe5a5aa29174a5606',
+ 'url': 'http://fc-zenit.ru/video/41044/',
+ 'md5': '0e3fab421b455e970fa1aa3891e57df0',
'info_dict': {
- 'id': '6785',
+ 'id': '41044',
'ext': 'mp4',
- 'title': '«Зенит-ТВ»: как Олег Шатов играл против «Урала»',
+ 'title': 'Так пишется история: казанский разгром ЦСКА на «Зенит-ТВ»',
},
}
@@ -22,15 +21,23 @@ class FczenitIE(InfoExtractor):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- video_title = self._html_search_regex(r'<div class=\"photoalbum__title\">([^<]+)', webpage, 'title')
+ video_title = self._html_search_regex(
+ r'<[^>]+class=\"photoalbum__title\">([^<]+)', webpage, 'title')
+
+ video_items = self._parse_json(self._search_regex(
+ r'arrPath\s*=\s*JSON\.parse\(\'(.+)\'\)', webpage, 'video items'),
+ video_id)
- bitrates_raw = self._html_search_regex(r'bitrates:.*\n(.*)\]', webpage, 'video URL')
- bitrates = re.findall(r'url:.?\'(.+?)\'.*?bitrate:.?([0-9]{3}?)', bitrates_raw)
+ def merge_dicts(*dicts):
+ ret = {}
+ for a_dict in dicts:
+ ret.update(a_dict)
+ return ret
formats = [{
- 'url': furl,
- 'tbr': tbr,
- } for furl, tbr in bitrates]
+ 'url': compat_urlparse.urljoin(url, video_url),
+ 'tbr': int(tbr),
+ } for tbr, video_url in merge_dicts(*video_items).items()]
self._sort_formats(formats)
diff --git a/youtube_dl/extractor/firstpost.py b/youtube_dl/extractor/firstpost.py
index 298227d..e8936cb 100644
--- a/youtube_dl/extractor/firstpost.py
+++ b/youtube_dl/extractor/firstpost.py
@@ -4,7 +4,7 @@ from .common import InfoExtractor
class FirstpostIE(InfoExtractor):
- _VALID_URL = r'http://(?:www\.)?firstpost\.com/[^/]+/.*-(?P<id>[0-9]+)\.html'
+ _VALID_URL = r'https?://(?:www\.)?firstpost\.com/[^/]+/.*-(?P<id>[0-9]+)\.html'
_TEST = {
'url': 'http://www.firstpost.com/india/india-to-launch-indigenous-aircraft-carrier-monday-1025403.html',
diff --git a/youtube_dl/extractor/firsttv.py b/youtube_dl/extractor/firsttv.py
index 510d4b1..88bca10 100644
--- a/youtube_dl/extractor/firsttv.py
+++ b/youtube_dl/extractor/firsttv.py
@@ -2,78 +2,133 @@
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..utils import int_or_none
+from ..compat import compat_xpath
+from ..utils import (
+ int_or_none,
+ qualities,
+ unified_strdate,
+ xpath_attr,
+ xpath_element,
+ xpath_text,
+ xpath_with_ns,
+)
class FirstTVIE(InfoExtractor):
IE_NAME = '1tv'
IE_DESC = 'Первый канал'
- _VALID_URL = r'http://(?:www\.)?1tv\.ru/(?:[^/]+/)+(?P<id>.+)'
+ _VALID_URL = r'https?://(?:www\.)?1tv\.ru/(?:[^/]+/)+p?(?P<id>\d+)'
_TESTS = [{
- 'url': 'http://www.1tv.ru/videoarchive/73390',
- 'md5': '777f525feeec4806130f4f764bc18a4f',
+ # single format via video_materials.json API
+ 'url': 'http://www.1tv.ru/prj/inprivate/vypusk/35930',
+ 'md5': '82a2777648acae812d58b3f5bd42882b',
'info_dict': {
- 'id': '73390',
+ 'id': '35930',
'ext': 'mp4',
- 'title': 'Олимпийские канатные дороги',
- 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
+ 'title': 'Гость Людмила Сенчина. Наедине со всеми. Выпуск от 12.02.2015',
+ 'description': 'md5:357933adeede13b202c7c21f91b871b2',
'thumbnail': 're:^https?://.*\.(?:jpg|JPG)$',
- 'duration': 149,
- 'like_count': int,
- 'dislike_count': int,
+ 'upload_date': '20150212',
+ 'duration': 2694,
},
- 'skip': 'Only works from Russia',
}, {
- 'url': 'http://www.1tv.ru/prj/inprivate/vypusk/35930',
- 'md5': 'a1b6b60d530ebcf8daacf4565762bbaf',
+ # multiple formats via video_materials.json API
+ 'url': 'http://www.1tv.ru/video_archive/projects/dobroeutro/p113641',
'info_dict': {
- 'id': '35930',
+ 'id': '113641',
'ext': 'mp4',
- 'title': 'Наедине со всеми. Людмила Сенчина',
- 'description': 'md5:89553aed1d641416001fe8d450f06cb9',
+ 'title': 'Весенняя аллергия. Доброе утро. Фрагмент выпуска от 07.04.2016',
+ 'description': 'md5:8dcebb3dded0ff20fade39087fd1fee2',
'thumbnail': 're:^https?://.*\.(?:jpg|JPG)$',
- 'duration': 2694,
+ 'upload_date': '20160407',
+ 'duration': 179,
+ 'formats': 'mincount:3',
+ },
+ 'params': {
+ 'skip_download': True,
},
- 'skip': 'Only works from Russia',
+ }, {
+ # single format only available via ONE_ONLINE_VIDEOS.archive_single_xml API
+ 'url': 'http://www.1tv.ru/video_archive/series/f7552/p47038',
+ 'md5': '519d306c5b5669761fd8906c39dbee23',
+ 'info_dict': {
+ 'id': '47038',
+ 'ext': 'mp4',
+ 'title': '"Побег". Второй сезон. 3 серия',
+ 'description': 'md5:3abf8f6b9bce88201c33e9a3d794a00b',
+ 'thumbnail': 're:^https?://.*\.(?:jpg|JPG)$',
+ 'upload_date': '20120516',
+ 'duration': 3080,
+ },
+ }, {
+ 'url': 'http://www.1tv.ru/videoarchive/9967',
+ 'only_matching': True,
}]
def _real_extract(self, url):
video_id = self._match_id(url)
- webpage = self._download_webpage(url, video_id, 'Downloading page')
+ # Videos with multiple formats only available via this API
+ video = self._download_json(
+ 'http://www.1tv.ru/video_materials.json?legacy_id=%s' % video_id,
+ video_id, fatal=False)
- video_url = self._html_search_regex(
- r'''(?s)(?:jwplayer\('flashvideoportal_1'\)\.setup\({|var\s+playlistObj\s*=).*?'file'\s*:\s*'([^']+)'.*?}\);''',
- webpage, 'video URL')
+ description, thumbnail, upload_date, duration = [None] * 4
- title = self._html_search_regex(
- [r'<div class="tv_translation">\s*<h1><a href="[^"]+">([^<]*)</a>',
- r"'title'\s*:\s*'([^']+)'"], webpage, 'title')
- description = self._html_search_regex(
- r'<div class="descr">\s*<div>&nbsp;</div>\s*<p>([^<]*)</p></div>',
- webpage, 'description', default=None) or self._html_search_meta(
- 'description', webpage, 'description')
+ if video:
+ item = video[0]
+ title = item['title']
+ quality = qualities(('ld', 'sd', 'hd', ))
+ formats = [{
+ 'url': f['src'],
+ 'format_id': f.get('name'),
+ 'quality': quality(f.get('name')),
+ } for f in item['mbr'] if f.get('src')]
+ thumbnail = item.get('poster')
+ else:
+ # Some videos are not available via video_materials.json
+ video = self._download_xml(
+ 'http://www.1tv.ru/owa/win/ONE_ONLINE_VIDEOS.archive_single_xml?pid=%s' % video_id,
+ video_id)
+
+ NS_MAP = {
+ 'media': 'http://search.yahoo.com/mrss/',
+ }
- thumbnail = self._og_search_thumbnail(webpage)
- duration = self._og_search_property(
- 'video:duration', webpage,
- 'video duration', fatal=False)
+ item = xpath_element(video, './channel/item', fatal=True)
+ title = xpath_text(item, './title', fatal=True)
+ formats = [{
+ 'url': content.attrib['url'],
+ } for content in item.findall(
+ compat_xpath(xpath_with_ns('./media:content', NS_MAP))) if content.attrib.get('url')]
+ thumbnail = xpath_attr(
+ item, xpath_with_ns('./media:thumbnail', NS_MAP), 'url')
- like_count = self._html_search_regex(
- r'title="Понравилось".*?/></label> \[(\d+)\]',
- webpage, 'like count', default=None)
- dislike_count = self._html_search_regex(
- r'title="Не понравилось".*?/></label> \[(\d+)\]',
- webpage, 'dislike count', default=None)
+ self._sort_formats(formats)
+
+ webpage = self._download_webpage(url, video_id, 'Downloading page', fatal=False)
+ if webpage:
+ title = self._html_search_regex(
+ (r'<div class="tv_translation">\s*<h1><a href="[^"]+">([^<]*)</a>',
+ r"'title'\s*:\s*'([^']+)'"),
+ webpage, 'title', default=None) or title
+ description = self._html_search_regex(
+ r'<div class="descr">\s*<div>&nbsp;</div>\s*<p>([^<]*)</p></div>',
+ webpage, 'description', default=None) or self._html_search_meta(
+ 'description', webpage, 'description')
+ thumbnail = thumbnail or self._og_search_thumbnail(webpage)
+ duration = int_or_none(self._html_search_meta(
+ 'video:duration', webpage, 'video duration', fatal=False))
+ upload_date = unified_strdate(self._html_search_meta(
+ 'ya:ovs:upload_date', webpage, 'upload date', fatal=False))
return {
'id': video_id,
- 'url': video_url,
'thumbnail': thumbnail,
'title': title,
'description': description,
+ 'upload_date': upload_date,
'duration': int_or_none(duration),
- 'like_count': int_or_none(like_count),
- 'dislike_count': int_or_none(dislike_count),
+ 'formats': formats
}
diff --git a/youtube_dl/extractor/fivemin.py b/youtube_dl/extractor/fivemin.py
index 2955965..6b83454 100644
--- a/youtube_dl/extractor/fivemin.py
+++ b/youtube_dl/extractor/fivemin.py
@@ -1,9 +1,11 @@
from __future__ import unicode_literals
+import re
+
from .common import InfoExtractor
from ..compat import (
- compat_urllib_parse,
compat_parse_qs,
+ compat_urllib_parse_urlencode,
compat_urllib_parse_urlparse,
compat_urlparse,
)
@@ -16,12 +18,7 @@ from ..utils import (
class FiveMinIE(InfoExtractor):
IE_NAME = '5min'
- _VALID_URL = r'''(?x)
- (?:https?://[^/]*?5min\.com/Scripts/PlayerSeed\.js\?(?:.*?&)?playList=|
- https?://(?:(?:massively|www)\.)?joystiq\.com/video/|
- 5min:)
- (?P<id>\d+)
- '''
+ _VALID_URL = r'(?:5min:(?P<id>\d+)(?::(?P<sid>\d+))?|https?://[^/]*?5min\.com/Scripts/PlayerSeed\.js\?(?P<query>.*))'
_TESTS = [
{
@@ -45,6 +42,7 @@ class FiveMinIE(InfoExtractor):
'title': 'How to Make a Next-Level Fruit Salad',
'duration': 184,
},
+ 'skip': 'no longer available',
},
]
_ERRORS = {
@@ -91,20 +89,33 @@ class FiveMinIE(InfoExtractor):
}
def _real_extract(self, url):
- video_id = self._match_id(url)
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+ sid = mobj.group('sid')
+
+ if mobj.group('query'):
+ qs = compat_parse_qs(mobj.group('query'))
+ if not qs.get('playList'):
+ raise ExtractorError('Invalid URL', expected=True)
+ video_id = qs['playList'][0]
+ if qs.get('sid'):
+ sid = qs['sid'][0]
+
embed_url = 'https://embed.5min.com/playerseed/?playList=%s' % video_id
- embed_page = self._download_webpage(embed_url, video_id,
- 'Downloading embed page')
- sid = self._search_regex(r'sid=(\d+)', embed_page, 'sid')
- query = compat_urllib_parse.urlencode({
- 'func': 'GetResults',
- 'playlist': video_id,
- 'sid': sid,
- 'isPlayerSeed': 'true',
- 'url': embed_url,
- })
+ if not sid:
+ embed_page = self._download_webpage(embed_url, video_id,
+ 'Downloading embed page')
+ sid = self._search_regex(r'sid=(\d+)', embed_page, 'sid')
+
response = self._download_json(
- 'https://syn.5min.com/handlers/SenseHandler.ashx?' + query,
+ 'https://syn.5min.com/handlers/SenseHandler.ashx?' +
+ compat_urllib_parse_urlencode({
+ 'func': 'GetResults',
+ 'playlist': video_id,
+ 'sid': sid,
+ 'isPlayerSeed': 'true',
+ 'url': embed_url,
+ }),
video_id)
if not response['success']:
raise ExtractorError(
@@ -118,9 +129,7 @@ class FiveMinIE(InfoExtractor):
parsed_video_url = compat_urllib_parse_urlparse(compat_parse_qs(
compat_urllib_parse_urlparse(info['EmbededURL']).query)['videoUrl'][0])
for rendition in info['Renditions']:
- if rendition['RenditionType'] == 'm3u8':
- formats.extend(self._extract_m3u8_formats(rendition['Url'], video_id, m3u8_id='hls'))
- elif rendition['RenditionType'] == 'aac':
+ if rendition['RenditionType'] == 'aac' or rendition['RenditionType'] == 'm3u8':
continue
else:
rendition_url = compat_urlparse.urlunparse(parsed_video_url._replace(path=replace_extension(parsed_video_url.path.replace('//', '/%s/' % rendition['ID']), rendition['RenditionType'])))
diff --git a/youtube_dl/extractor/fktv.py b/youtube_dl/extractor/fktv.py
index 5f6e65d..a3a2915 100644
--- a/youtube_dl/extractor/fktv.py
+++ b/youtube_dl/extractor/fktv.py
@@ -10,7 +10,7 @@ from ..utils import (
class FKTVIE(InfoExtractor):
IE_NAME = 'fernsehkritik.tv'
- _VALID_URL = r'http://(?:www\.)?fernsehkritik\.tv/folge-(?P<id>[0-9]+)(?:/.*)?'
+ _VALID_URL = r'https?://(?:www\.)?fernsehkritik\.tv/folge-(?P<id>[0-9]+)(?:/.*)?'
_TEST = {
'url': 'http://fernsehkritik.tv/folge-1',
diff --git a/youtube_dl/extractor/flickr.py b/youtube_dl/extractor/flickr.py
index 18f439d..a8e1bf4 100644
--- a/youtube_dl/extractor/flickr.py
+++ b/youtube_dl/extractor/flickr.py
@@ -1,7 +1,7 @@
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..compat import compat_urllib_parse
+from ..compat import compat_urllib_parse_urlencode
from ..utils import (
ExtractorError,
int_or_none,
@@ -24,13 +24,28 @@ class FlickrIE(InfoExtractor):
'upload_date': '20110423',
'uploader_id': '10922353@N03',
'uploader': 'Forest Wander',
+ 'uploader_url': 'https://www.flickr.com/photos/forestwander-nature-pictures/',
'comment_count': int,
'view_count': int,
'tags': list,
+ 'license': 'Attribution-ShareAlike',
}
}
-
_API_BASE_URL = 'https://api.flickr.com/services/rest?'
+ # https://help.yahoo.com/kb/flickr/SLN25525.html
+ _LICENSES = {
+ '0': 'All Rights Reserved',
+ '1': 'Attribution-NonCommercial-ShareAlike',
+ '2': 'Attribution-NonCommercial',
+ '3': 'Attribution-NonCommercial-NoDerivs',
+ '4': 'Attribution',
+ '5': 'Attribution-ShareAlike',
+ '6': 'Attribution-NoDerivs',
+ '7': 'No known copyright restrictions',
+ '8': 'United States government work',
+ '9': 'Public Domain Dedication (CC0)',
+ '10': 'Public Domain Work',
+ }
def _call_api(self, method, video_id, api_key, note, secret=None):
query = {
@@ -42,7 +57,7 @@ class FlickrIE(InfoExtractor):
}
if secret:
query['secret'] = secret
- data = self._download_json(self._API_BASE_URL + compat_urllib_parse.urlencode(query), video_id, note)
+ data = self._download_json(self._API_BASE_URL + compat_urllib_parse_urlencode(query), video_id, note)
if data['stat'] != 'ok':
raise ExtractorError(data['message'])
return data
@@ -75,6 +90,9 @@ class FlickrIE(InfoExtractor):
self._sort_formats(formats)
owner = video_info.get('owner', {})
+ uploader_id = owner.get('nsid')
+ uploader_path = owner.get('path_alias') or uploader_id
+ uploader_url = 'https://www.flickr.com/photos/%s/' % uploader_path if uploader_path else None
return {
'id': video_id,
@@ -83,11 +101,13 @@ class FlickrIE(InfoExtractor):
'formats': formats,
'timestamp': int_or_none(video_info.get('dateuploaded')),
'duration': int_or_none(video_info.get('video', {}).get('duration')),
- 'uploader_id': owner.get('nsid'),
+ 'uploader_id': uploader_id,
'uploader': owner.get('realname'),
+ 'uploader_url': uploader_url,
'comment_count': int_or_none(video_info.get('comments', {}).get('_content')),
'view_count': int_or_none(video_info.get('views')),
- 'tags': [tag.get('_content') for tag in video_info.get('tags', {}).get('tag', [])]
+ 'tags': [tag.get('_content') for tag in video_info.get('tags', {}).get('tag', [])],
+ 'license': self._LICENSES.get(video_info.get('license')),
}
else:
raise ExtractorError('not a video', expected=True)
diff --git a/youtube_dl/extractor/footyroom.py b/youtube_dl/extractor/footyroom.py
index 370fd00..d2503ae 100644
--- a/youtube_dl/extractor/footyroom.py
+++ b/youtube_dl/extractor/footyroom.py
@@ -5,7 +5,7 @@ from .common import InfoExtractor
class FootyRoomIE(InfoExtractor):
- _VALID_URL = r'http://footyroom\.com/(?P<id>[^/]+)'
+ _VALID_URL = r'https?://footyroom\.com/(?P<id>[^/]+)'
_TESTS = [{
'url': 'http://footyroom.com/schalke-04-0-2-real-madrid-2015-02/',
'info_dict': {
diff --git a/youtube_dl/extractor/formula1.py b/youtube_dl/extractor/formula1.py
new file mode 100644
index 0000000..322c41e
--- /dev/null
+++ b/youtube_dl/extractor/formula1.py
@@ -0,0 +1,26 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class Formula1IE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?formula1\.com/content/fom-website/en/video/\d{4}/\d{1,2}/(?P<id>.+?)\.html'
+ _TEST = {
+ 'url': 'http://www.formula1.com/content/fom-website/en/video/2016/5/Race_highlights_-_Spain_2016.html',
+ 'md5': '8c79e54be72078b26b89e0e111c0502b',
+ 'info_dict': {
+ 'id': 'JvYXJpMzE6pArfHWm5ARp5AiUmD-gibV',
+ 'ext': 'flv',
+ 'title': 'Race highlights - Spain 2016',
+ },
+ 'add_ie': ['Ooyala'],
+ }
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
+ ooyala_embed_code = self._search_regex(
+ r'data-videoid="([^"]+)"', webpage, 'ooyala embed code')
+ return self.url_result(
+ 'ooyala:%s' % ooyala_embed_code, 'Ooyala', ooyala_embed_code)
diff --git a/youtube_dl/extractor/fox.py b/youtube_dl/extractor/fox.py
index fa05af5..95c1abf 100644
--- a/youtube_dl/extractor/fox.py
+++ b/youtube_dl/extractor/fox.py
@@ -16,6 +16,9 @@ class FOXIE(InfoExtractor):
'title': 'Official Trailer: Gotham',
'description': 'Tracing the rise of the great DC Comics Super-Villains and vigilantes, Gotham reveals an entirely new chapter that has never been told.',
'duration': 129,
+ 'timestamp': 1400020798,
+ 'upload_date': '20140513',
+ 'uploader': 'NEWA-FNG-FOXCOM',
},
'add_ie': ['ThePlatform'],
}
diff --git a/youtube_dl/extractor/foxgay.py b/youtube_dl/extractor/foxgay.py
index 08b8ea3..70c1a81 100644
--- a/youtube_dl/extractor/foxgay.py
+++ b/youtube_dl/extractor/foxgay.py
@@ -4,7 +4,7 @@ from .common import InfoExtractor
class FoxgayIE(InfoExtractor):
- _VALID_URL = r'http://(?:www\.)?foxgay\.com/videos/(?:\S+-)?(?P<id>\d+)\.shtml'
+ _VALID_URL = r'https?://(?:www\.)?foxgay\.com/videos/(?:\S+-)?(?P<id>\d+)\.shtml'
_TEST = {
'url': 'http://foxgay.com/videos/fuck-turkish-style-2582.shtml',
'md5': '80d72beab5d04e1655a56ad37afe6841',
diff --git a/youtube_dl/extractor/foxnews.py b/youtube_dl/extractor/foxnews.py
index 318ac01..b04da24 100644
--- a/youtube_dl/extractor/foxnews.py
+++ b/youtube_dl/extractor/foxnews.py
@@ -18,8 +18,8 @@ class FoxNewsIE(AMPIE):
'title': 'Frozen in Time',
'description': '16-year-old girl is size of toddler',
'duration': 265,
- # 'timestamp': 1304411491,
- # 'upload_date': '20110503',
+ 'timestamp': 1304411491,
+ 'upload_date': '20110503',
'thumbnail': 're:^https?://.*\.jpg$',
},
},
@@ -32,10 +32,14 @@ class FoxNewsIE(AMPIE):
'title': "Rep. Luis Gutierrez on if Obama's immigration plan is legal",
'description': "Congressman discusses president's plan",
'duration': 292,
- # 'timestamp': 1417662047,
- # 'upload_date': '20141204',
+ 'timestamp': 1417662047,
+ 'upload_date': '20141204',
'thumbnail': 're:^https?://.*\.jpg$',
},
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
},
{
'url': 'http://video.foxnews.com/v/video-embed.html?video_id=3937480&d=video.foxnews.com',
diff --git a/youtube_dl/extractor/foxsports.py b/youtube_dl/extractor/foxsports.py
index df76651..a3bb983 100644
--- a/youtube_dl/extractor/foxsports.py
+++ b/youtube_dl/extractor/foxsports.py
@@ -1,7 +1,10 @@
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..utils import smuggle_url
+from ..utils import (
+ smuggle_url,
+ update_url_query,
+)
class FoxSportsIE(InfoExtractor):
@@ -9,11 +12,15 @@ class FoxSportsIE(InfoExtractor):
_TEST = {
'url': 'http://www.foxsports.com/video?vid=432609859715',
+ 'md5': 'b49050e955bebe32c301972e4012ac17',
'info_dict': {
- 'id': 'gA0bHB3Ladz3',
- 'ext': 'flv',
+ 'id': 'i0qKWsk3qJaM',
+ 'ext': 'mp4',
'title': 'Courtney Lee on going up 2-0 in series vs. Blazers',
'description': 'Courtney Lee talks about Memphis being focused.',
+ 'upload_date': '20150423',
+ 'timestamp': 1429761109,
+ 'uploader': 'NEWA-FNG-FOXSPORTS',
},
'add_ie': ['ThePlatform'],
}
@@ -28,5 +35,8 @@ class FoxSportsIE(InfoExtractor):
r"data-player-config='([^']+)'", webpage, 'data player config'),
video_id)
- return self.url_result(smuggle_url(
- config['releaseURL'] + '&manifest=f4m', {'force_smil_url': True}))
+ return self.url_result(smuggle_url(update_url_query(
+ config['releaseURL'], {
+ 'mbr': 'true',
+ 'switch': 'http',
+ }), {'force_smil_url': True}))
diff --git a/youtube_dl/extractor/franceinter.py b/youtube_dl/extractor/franceinter.py
index 0388ba0..2369f86 100644
--- a/youtube_dl/extractor/franceinter.py
+++ b/youtube_dl/extractor/franceinter.py
@@ -6,7 +6,7 @@ from ..utils import int_or_none
class FranceInterIE(InfoExtractor):
- _VALID_URL = r'http://(?:www\.)?franceinter\.fr/player/reecouter\?play=(?P<id>[0-9]+)'
+ _VALID_URL = r'https?://(?:www\.)?franceinter\.fr/player/reecouter\?play=(?P<id>[0-9]+)'
_TEST = {
'url': 'http://www.franceinter.fr/player/reecouter?play=793962',
'md5': '4764932e466e6f6c79c317d2e74f6884',
diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py
index 3f4ac30..ad94e31 100644
--- a/youtube_dl/extractor/francetv.py
+++ b/youtube_dl/extractor/francetv.py
@@ -60,28 +60,31 @@ class FranceTVBaseInfoExtractor(InfoExtractor):
video_id, 'Downloading f4m manifest token', fatal=False)
if f4m_url:
formats.extend(self._extract_f4m_formats(
- f4m_url + '&hdcore=3.7.0&plugin=aasp-3.7.0.39.44', video_id, 1, format_id))
+ f4m_url + '&hdcore=3.7.0&plugin=aasp-3.7.0.39.44',
+ video_id, f4m_id=format_id, fatal=False))
elif ext == 'm3u8':
- formats.extend(self._extract_m3u8_formats(video_url, video_id, 'mp4', m3u8_id=format_id))
+ formats.extend(self._extract_m3u8_formats(
+ video_url, video_id, 'mp4', entry_protocol='m3u8_native',
+ m3u8_id=format_id, fatal=False))
elif video_url.startswith('rtmp'):
formats.append({
'url': video_url,
'format_id': 'rtmp-%s' % format_id,
'ext': 'flv',
- 'preference': 1,
})
else:
- formats.append({
- 'url': video_url,
- 'format_id': format_id,
- 'preference': -1,
- })
+ if self._is_valid_url(video_url, video_id, format_id):
+ formats.append({
+ 'url': video_url,
+ 'format_id': format_id,
+ })
self._sort_formats(formats)
title = info['titre']
subtitle = info.get('sous_titre')
if subtitle:
title += ' - %s' % subtitle
+ title = title.strip()
subtitles = {}
subtitles_list = [{
@@ -125,13 +128,13 @@ class PluzzIE(FranceTVBaseInfoExtractor):
class FranceTvInfoIE(FranceTVBaseInfoExtractor):
IE_NAME = 'francetvinfo.fr'
- _VALID_URL = r'https?://(?:www|mobile)\.francetvinfo\.fr/.*/(?P<title>.+)\.html'
+ _VALID_URL = r'https?://(?:www|mobile|france3-regions)\.francetvinfo\.fr/.*/(?P<title>.+)\.html'
_TESTS = [{
'url': 'http://www.francetvinfo.fr/replay-jt/france-3/soir-3/jt-grand-soir-3-lundi-26-aout-2013_393427.html',
'info_dict': {
'id': '84981923',
- 'ext': 'flv',
+ 'ext': 'mp4',
'title': 'Soir 3',
'upload_date': '20130826',
'timestamp': 1377548400,
@@ -139,6 +142,10 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor):
'fr': 'mincount:2',
},
},
+ 'params': {
+ # m3u8 downloads
+ 'skip_download': True,
+ },
}, {
'url': 'http://www.francetvinfo.fr/elections/europeennes/direct-europeennes-regardez-le-debat-entre-les-candidats-a-la-presidence-de-la-commission_600639.html',
'info_dict': {
@@ -155,11 +162,32 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor):
'url': 'http://www.francetvinfo.fr/economie/entreprises/les-entreprises-familiales-le-secret-de-la-reussite_933271.html',
'md5': 'f485bda6e185e7d15dbc69b72bae993e',
'info_dict': {
- 'id': '556e03339473995ee145930c',
+ 'id': 'NI_173343',
'ext': 'mp4',
'title': 'Les entreprises familiales : le secret de la réussite',
'thumbnail': 're:^https?://.*\.jpe?g$',
- }
+ 'timestamp': 1433273139,
+ 'upload_date': '20150602',
+ },
+ 'params': {
+ # m3u8 downloads
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'http://france3-regions.francetvinfo.fr/bretagne/cotes-d-armor/thalassa-echappee-breizh-ce-venredi-dans-les-cotes-d-armor-954961.html',
+ 'md5': 'f485bda6e185e7d15dbc69b72bae993e',
+ 'info_dict': {
+ 'id': 'NI_657393',
+ 'ext': 'mp4',
+ 'title': 'Olivier Monthus, réalisateur de "Bretagne, le choix de l’Armor"',
+ 'description': 'md5:a3264114c9d29aeca11ced113c37b16c',
+ 'thumbnail': 're:^https?://.*\.jpe?g$',
+ 'timestamp': 1458300695,
+ 'upload_date': '20160318',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
}]
def _real_extract(self, url):
@@ -172,7 +200,9 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor):
return self.url_result(dmcloud_url, 'DailymotionCloud')
video_id, catalogue = self._search_regex(
- r'id-video=([^@]+@[^"]+)', webpage, 'video id').split('@')
+ (r'id-video=([^@]+@[^"]+)',
+ r'<a[^>]+href="(?:https?:)?//videos\.francetv\.fr/video/([^@]+@[^"]+)"'),
+ webpage, 'video id').split('@')
return self._extract_video(video_id, catalogue)
diff --git a/youtube_dl/extractor/freespeech.py b/youtube_dl/extractor/freespeech.py
index c210177..1477708 100644
--- a/youtube_dl/extractor/freespeech.py
+++ b/youtube_dl/extractor/freespeech.py
@@ -14,7 +14,7 @@ class FreespeechIE(InfoExtractor):
'url': 'https://www.freespeech.org/video/obama-romney-campaign-colorado-ahead-debate-0',
'info_dict': {
'id': 'poKsVCZ64uU',
- 'ext': 'mp4',
+ 'ext': 'webm',
'title': 'Obama, Romney Campaign in Colorado Ahead of Debate',
'description': 'Obama, Romney Campaign in Colorado Ahead of Debate',
'uploader': 'freespeechtv',
diff --git a/youtube_dl/extractor/freevideo.py b/youtube_dl/extractor/freevideo.py
index c7bec02..cd8423a 100644
--- a/youtube_dl/extractor/freevideo.py
+++ b/youtube_dl/extractor/freevideo.py
@@ -5,7 +5,7 @@ from ..utils import ExtractorError
class FreeVideoIE(InfoExtractor):
- _VALID_URL = r'^http://www.freevideo.cz/vase-videa/(?P<id>[^.]+)\.html(?:$|[?#])'
+ _VALID_URL = r'^https?://www.freevideo.cz/vase-videa/(?P<id>[^.]+)\.html(?:$|[?#])'
_TEST = {
'url': 'http://www.freevideo.cz/vase-videa/vysukany-zadecek-22033.html',
diff --git a/youtube_dl/extractor/funimation.py b/youtube_dl/extractor/funimation.py
index 0f37ed7..0ad0d9b 100644
--- a/youtube_dl/extractor/funimation.py
+++ b/youtube_dl/extractor/funimation.py
@@ -2,10 +2,13 @@
from __future__ import unicode_literals
from .common import InfoExtractor
+from ..compat import (
+ compat_HTTPError,
+ compat_urllib_parse_unquote_plus,
+)
from ..utils import (
clean_html,
determine_ext,
- encode_dict,
int_or_none,
sanitized_Request,
ExtractorError,
@@ -28,6 +31,7 @@ class FunimationIE(InfoExtractor):
'description': 'md5:1769f43cd5fc130ace8fd87232207892',
'thumbnail': 're:https?://.*\.jpg',
},
+ 'skip': 'Access without user interaction is forbidden by CloudFlare, and video removed',
}, {
'url': 'http://www.funimation.com/shows/hacksign/videos/official/role-play',
'info_dict': {
@@ -38,6 +42,7 @@ class FunimationIE(InfoExtractor):
'description': 'md5:b602bdc15eef4c9bbb201bb6e6a4a2dd',
'thumbnail': 're:https?://.*\.jpg',
},
+ 'skip': 'Access without user interaction is forbidden by CloudFlare',
}, {
'url': 'http://www.funimation.com/shows/attack-on-titan-junior-high/videos/promotional/broadcast-dub-preview',
'info_dict': {
@@ -48,18 +53,49 @@ class FunimationIE(InfoExtractor):
'description': 'md5:f8ec49c0aff702a7832cd81b8a44f803',
'thumbnail': 're:https?://.*\.(?:jpg|png)',
},
+ 'skip': 'Access without user interaction is forbidden by CloudFlare',
}]
+ _LOGIN_URL = 'http://www.funimation.com/login'
+
+ def _download_webpage(self, *args, **kwargs):
+ try:
+ return super(FunimationIE, self)._download_webpage(*args, **kwargs)
+ except ExtractorError as ee:
+ if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403:
+ response = ee.cause.read()
+ if b'>Please complete the security check to access<' in response:
+ raise ExtractorError(
+ 'Access to funimation.com is blocked by CloudFlare. '
+ 'Please browse to http://www.funimation.com/, solve '
+ 'the reCAPTCHA, export browser cookies to a text file,'
+ ' and then try again with --cookies YOUR_COOKIE_FILE.',
+ expected=True)
+ raise
+
+ def _extract_cloudflare_session_ua(self, url):
+ ci_session_cookie = self._get_cookies(url).get('ci_session')
+ if ci_session_cookie:
+ ci_session = compat_urllib_parse_unquote_plus(ci_session_cookie.value)
+ # ci_session is a string serialized by PHP function serialize()
+ # This case is simple enough to use regular expressions only
+ return self._search_regex(
+ r'"user_agent";s:\d+:"([^"]+)"', ci_session, 'user agent',
+ default=None)
+
def _login(self):
(username, password) = self._get_login_info()
if username is None:
return
- data = urlencode_postdata(encode_dict({
+ data = urlencode_postdata({
'email_field': username,
'password_field': password,
- }))
- login_request = sanitized_Request('http://www.funimation.com/login', data, headers={
- 'User-Agent': 'Mozilla/5.0 (Windows NT 5.2; WOW64; rv:42.0) Gecko/20100101 Firefox/42.0',
+ })
+ user_agent = self._extract_cloudflare_session_ua(self._LOGIN_URL)
+ if not user_agent:
+ user_agent = 'Mozilla/5.0 (Windows NT 5.2; WOW64; rv:42.0) Gecko/20100101 Firefox/42.0'
+ login_request = sanitized_Request(self._LOGIN_URL, data, headers={
+ 'User-Agent': user_agent,
'Content-Type': 'application/x-www-form-urlencoded'
})
login_page = self._download_webpage(
@@ -104,11 +140,16 @@ class FunimationIE(InfoExtractor):
('mobile', 'Mozilla/5.0 (Linux; Android 4.4.2; Nexus 4 Build/KOT49H) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.114 Mobile Safari/537.36'),
)
+ user_agent = self._extract_cloudflare_session_ua(url)
+ if user_agent:
+ USER_AGENTS = ((None, user_agent),)
+
for kind, user_agent in USER_AGENTS:
request = sanitized_Request(url)
request.add_header('User-Agent', user_agent)
webpage = self._download_webpage(
- request, display_id, 'Downloading %s webpage' % kind)
+ request, display_id,
+ 'Downloading %s webpage' % kind if kind else 'Downloading webpage')
playlist = self._parse_json(
self._search_regex(
diff --git a/youtube_dl/extractor/funnyordie.py b/youtube_dl/extractor/funnyordie.py
index 4c4a87e..8c5ffc9 100644
--- a/youtube_dl/extractor/funnyordie.py
+++ b/youtube_dl/extractor/funnyordie.py
@@ -46,8 +46,8 @@ class FunnyOrDieIE(InfoExtractor):
links.sort(key=lambda link: 1 if link[1] == 'mp4' else 0)
m3u8_url = self._search_regex(
- r'<source[^>]+src=(["\'])(?P<url>.+?/master\.m3u8)\1',
- webpage, 'm3u8 url', default=None, group='url')
+ r'<source[^>]+src=(["\'])(?P<url>.+?/master\.m3u8[^"\']*)\1',
+ webpage, 'm3u8 url', group='url')
formats = []
diff --git a/youtube_dl/extractor/gameinformer.py b/youtube_dl/extractor/gameinformer.py
index 25870c1..a66e309 100644
--- a/youtube_dl/extractor/gameinformer.py
+++ b/youtube_dl/extractor/gameinformer.py
@@ -2,42 +2,27 @@
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..compat import compat_str
-from ..utils import int_or_none
class GameInformerIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?gameinformer\.com/(?:[^/]+/)*(?P<id>.+)\.aspx'
_TEST = {
'url': 'http://www.gameinformer.com/b/features/archive/2015/09/26/replay-animal-crossing.aspx',
+ 'md5': '292f26da1ab4beb4c9099f1304d2b071',
'info_dict': {
'id': '4515472681001',
- 'ext': 'm3u8',
+ 'ext': 'mp4',
'title': 'Replay - Animal Crossing',
'description': 'md5:2e211891b215c85d061adc7a4dd2d930',
- 'timestamp': 1443457610706,
- },
- 'params': {
- # m3u8 download
- 'skip_download': True,
+ 'timestamp': 1443457610,
+ 'upload_date': '20150928',
+ 'uploader_id': '694940074001',
},
}
+ BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/694940074001/default_default/index.html?videoId=%s'
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
-
- bc_api_url = self._search_regex(r"getVideo\('([^']+)'", webpage, 'brightcove api url')
- json_data = self._download_json(
- bc_api_url + '&video_fields=id,name,shortDescription,publishedDate,videoStillURL,length,IOSRenditions',
- display_id)
-
- return {
- 'id': compat_str(json_data['id']),
- 'display_id': display_id,
- 'url': json_data['IOSRenditions'][0]['url'],
- 'title': json_data['name'],
- 'description': json_data.get('shortDescription'),
- 'timestamp': int_or_none(json_data.get('publishedDate')),
- 'duration': int_or_none(json_data.get('length')),
- }
+ brightcove_id = self._search_regex(r"getVideo\('[^']+video_id=(\d+)", webpage, 'brightcove id')
+ return self.url_result(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', brightcove_id)
diff --git a/youtube_dl/extractor/gamekings.py b/youtube_dl/extractor/gamekings.py
index f6b9046..cbcddcb 100644
--- a/youtube_dl/extractor/gamekings.py
+++ b/youtube_dl/extractor/gamekings.py
@@ -10,7 +10,7 @@ from .youtube import YoutubeIE
class GamekingsIE(InfoExtractor):
- _VALID_URL = r'http://www\.gamekings\.nl/(?:videos|nieuws)/(?P<id>[^/]+)'
+ _VALID_URL = r'https?://www\.gamekings\.nl/(?:videos|nieuws)/(?P<id>[^/]+)'
_TESTS = [{
# YouTube embed video
'url': 'http://www.gamekings.nl/videos/phoenix-wright-ace-attorney-dual-destinies-review/',
diff --git a/youtube_dl/extractor/gamespot.py b/youtube_dl/extractor/gamespot.py
index b3f1baf..621257c 100644
--- a/youtube_dl/extractor/gamespot.py
+++ b/youtube_dl/extractor/gamespot.py
@@ -1,20 +1,20 @@
from __future__ import unicode_literals
import re
-import json
-from .common import InfoExtractor
+from .once import OnceIE
from ..compat import (
compat_urllib_parse_unquote,
- compat_urlparse,
)
from ..utils import (
unescapeHTML,
+ url_basename,
+ dict_get,
)
-class GameSpotIE(InfoExtractor):
- _VALID_URL = r'http://(?:www\.)?gamespot\.com/.*-(?P<id>\d+)/?'
+class GameSpotIE(OnceIE):
+ _VALID_URL = r'https?://(?:www\.)?gamespot\.com/.*-(?P<id>\d+)/?'
_TESTS = [{
'url': 'http://www.gamespot.com/videos/arma-3-community-guide-sitrep-i/2300-6410818/',
'md5': 'b2a30deaa8654fcccd43713a6b6a4825',
@@ -39,29 +39,73 @@ class GameSpotIE(InfoExtractor):
webpage = self._download_webpage(url, page_id)
data_video_json = self._search_regex(
r'data-video=["\'](.*?)["\']', webpage, 'data video')
- data_video = json.loads(unescapeHTML(data_video_json))
+ data_video = self._parse_json(unescapeHTML(data_video_json), page_id)
streams = data_video['videoStreams']
+ manifest_url = None
formats = []
f4m_url = streams.get('f4m_stream')
- if f4m_url is not None:
- # Transform the manifest url to a link to the mp4 files
- # they are used in mobile devices.
- f4m_path = compat_urlparse.urlparse(f4m_url).path
- QUALITIES_RE = r'((,\d+)+,?)'
- qualities = self._search_regex(QUALITIES_RE, f4m_path, 'qualities').strip(',').split(',')
- http_path = f4m_path[1:].split('/', 1)[1]
- http_template = re.sub(QUALITIES_RE, r'%s', http_path)
- http_template = http_template.replace('.csmil/manifest.f4m', '')
- http_template = compat_urlparse.urljoin(
- 'http://video.gamespotcdn.com/', http_template)
- for q in qualities:
- formats.append({
- 'url': http_template % q,
- 'ext': 'mp4',
- 'format_id': q,
- })
- else:
+ if f4m_url:
+ manifest_url = f4m_url
+ formats.extend(self._extract_f4m_formats(
+ f4m_url + '?hdcore=3.7.0', page_id, f4m_id='hds', fatal=False))
+ m3u8_url = streams.get('m3u8_stream')
+ if m3u8_url:
+ manifest_url = m3u8_url
+ m3u8_formats = self._extract_m3u8_formats(
+ m3u8_url, page_id, 'mp4', 'm3u8_native',
+ m3u8_id='hls', fatal=False)
+ formats.extend(m3u8_formats)
+ progressive_url = dict_get(
+ streams, ('progressive_hd', 'progressive_high', 'progressive_low'))
+ if progressive_url and manifest_url:
+ qualities_basename = self._search_regex(
+ '/([^/]+)\.csmil/',
+ manifest_url, 'qualities basename', default=None)
+ if qualities_basename:
+ QUALITIES_RE = r'((,\d+)+,?)'
+ qualities = self._search_regex(
+ QUALITIES_RE, qualities_basename,
+ 'qualities', default=None)
+ if qualities:
+ qualities = list(map(lambda q: int(q), qualities.strip(',').split(',')))
+ qualities.sort()
+ http_template = re.sub(QUALITIES_RE, r'%d', qualities_basename)
+ http_url_basename = url_basename(progressive_url)
+ if m3u8_formats:
+ self._sort_formats(m3u8_formats)
+ m3u8_formats = list(filter(
+ lambda f: f.get('vcodec') != 'none' and f.get('resolution') != 'multiple',
+ m3u8_formats))
+ if len(qualities) == len(m3u8_formats):
+ for q, m3u8_format in zip(qualities, m3u8_formats):
+ f = m3u8_format.copy()
+ f.update({
+ 'url': progressive_url.replace(
+ http_url_basename, http_template % q),
+ 'format_id': f['format_id'].replace('hls', 'http'),
+ 'protocol': 'http',
+ })
+ formats.append(f)
+ else:
+ for q in qualities:
+ formats.append({
+ 'url': progressive_url.replace(
+ http_url_basename, http_template % q),
+ 'ext': 'mp4',
+ 'format_id': 'http-%d' % q,
+ 'tbr': q,
+ })
+
+ onceux_json = self._search_regex(
+ r'data-onceux-options=["\'](.*?)["\']', webpage, 'data video', default=None)
+ if onceux_json:
+ onceux_url = self._parse_json(unescapeHTML(onceux_json), page_id).get('metadataUri')
+ if onceux_url:
+ formats.extend(self._extract_once_formats(re.sub(
+ r'https?://[^/]+', 'http://once.unicornmedia.com', onceux_url).replace('ads/vmap/', '')))
+
+ if not formats:
for quality in ['sd', 'hd']:
# It's actually a link to a flv file
flv_url = streams.get('f4m_{0}'.format(quality))
@@ -71,6 +115,7 @@ class GameSpotIE(InfoExtractor):
'ext': 'flv',
'format_id': quality,
})
+ self._sort_formats(formats)
return {
'id': data_video['guid'],
diff --git a/youtube_dl/extractor/gamestar.py b/youtube_dl/extractor/gamestar.py
index 590ccf5..69058a5 100644
--- a/youtube_dl/extractor/gamestar.py
+++ b/youtube_dl/extractor/gamestar.py
@@ -13,7 +13,7 @@ from ..utils import (
class GameStarIE(InfoExtractor):
- _VALID_URL = r'http://www\.gamestar\.de/videos/.*,(?P<id>[0-9]+)\.html'
+ _VALID_URL = r'https?://www\.gamestar\.de/videos/.*,(?P<id>[0-9]+)\.html'
_TEST = {
'url': 'http://www.gamestar.de/videos/trailer,3/hobbit-3-die-schlacht-der-fuenf-heere,76110.html',
'md5': '96974ecbb7fd8d0d20fca5a00810cea7',
diff --git a/youtube_dl/extractor/gametrailers.py b/youtube_dl/extractor/gametrailers.py
deleted file mode 100644
index c3f031d..0000000
--- a/youtube_dl/extractor/gametrailers.py
+++ /dev/null
@@ -1,62 +0,0 @@
-from __future__ import unicode_literals
-
-from .common import InfoExtractor
-from ..utils import (
- int_or_none,
- parse_age_limit,
- url_basename,
-)
-
-
-class GametrailersIE(InfoExtractor):
- _VALID_URL = r'http://www\.gametrailers\.com/videos/view/[^/]+/(?P<id>.+)'
-
- _TEST = {
- 'url': 'http://www.gametrailers.com/videos/view/gametrailers-com/116437-Just-Cause-3-Review',
- 'md5': 'f28c4efa0bdfaf9b760f6507955b6a6a',
- 'info_dict': {
- 'id': '2983958',
- 'ext': 'mp4',
- 'display_id': '116437-Just-Cause-3-Review',
- 'title': 'Just Cause 3 - Review',
- 'description': 'It\'s a lot of fun to shoot at things and then watch them explode in Just Cause 3, but should there be more to the experience than that?',
- },
- }
-
- def _real_extract(self, url):
- display_id = self._match_id(url)
- webpage = self._download_webpage(url, display_id)
- title = self._html_search_regex(
- r'<title>(.+?)\|', webpage, 'title').strip()
- embed_url = self._proto_relative_url(
- self._search_regex(
- r'src=\'(//embed.gametrailers.com/embed/[^\']+)\'', webpage,
- 'embed url'),
- scheme='http:')
- video_id = url_basename(embed_url)
- embed_page = self._download_webpage(embed_url, video_id)
- embed_vars_json = self._search_regex(
- r'(?s)var embedVars = (\{.*?\})\s*</script>', embed_page,
- 'embed vars')
- info = self._parse_json(embed_vars_json, video_id)
-
- formats = []
- for media in info['media']:
- if media['mediaPurpose'] == 'play':
- formats.append({
- 'url': media['uri'],
- 'height': media['height'],
- 'width:': media['width'],
- })
- self._sort_formats(formats)
-
- return {
- 'id': video_id,
- 'display_id': display_id,
- 'title': title,
- 'formats': formats,
- 'thumbnail': info.get('thumbUri'),
- 'description': self._og_search_description(webpage),
- 'duration': int_or_none(info.get('videoLengthInSeconds')),
- 'age_limit': parse_age_limit(info.get('audienceRating')),
- }
diff --git a/youtube_dl/extractor/gazeta.py b/youtube_dl/extractor/gazeta.py
index ea32b62..18ef5c2 100644
--- a/youtube_dl/extractor/gazeta.py
+++ b/youtube_dl/extractor/gazeta.py
@@ -7,7 +7,7 @@ from .common import InfoExtractor
class GazetaIE(InfoExtractor):
- _VALID_URL = r'(?P<url>https?://(?:www\.)?gazeta\.ru/(?:[^/]+/)?video/(?:(?:main|\d{4}/\d{2}/\d{2})/)?(?P<id>[A-Za-z0-9-_.]+)\.s?html)'
+ _VALID_URL = r'(?P<url>https?://(?:www\.)?gazeta\.ru/(?:[^/]+/)?video/(?:main/)*(?:\d{4}/\d{2}/\d{2}/)?(?P<id>[A-Za-z0-9-_.]+)\.s?html)'
_TESTS = [{
'url': 'http://www.gazeta.ru/video/main/zadaite_vopros_vladislavu_yurevichu.shtml',
'md5': 'd49c9bdc6e5a7888f27475dc215ee789',
@@ -18,9 +18,19 @@ class GazetaIE(InfoExtractor):
'description': 'md5:38617526050bd17b234728e7f9620a71',
'thumbnail': 're:^https?://.*\.jpg',
},
+ 'skip': 'video not found',
}, {
'url': 'http://www.gazeta.ru/lifestyle/video/2015/03/08/master-klass_krasivoi_byt._delaem_vesennii_makiyazh.shtml',
'only_matching': True,
+ }, {
+ 'url': 'http://www.gazeta.ru/video/main/main/2015/06/22/platit_ili_ne_platit_po_isku_yukosa.shtml',
+ 'md5': '37f19f78355eb2f4256ee1688359f24c',
+ 'info_dict': {
+ 'id': '252048',
+ 'ext': 'mp4',
+ 'title': '"Если по иску ЮКОСа придется платить, это будет большой удар по бюджету"',
+ },
+ 'add_ie': ['EaglePlatform'],
}]
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/gdcvault.py b/youtube_dl/extractor/gdcvault.py
index 3befd3e..3136427 100644
--- a/youtube_dl/extractor/gdcvault.py
+++ b/youtube_dl/extractor/gdcvault.py
@@ -3,11 +3,10 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..compat import compat_urllib_parse
from ..utils import (
- remove_end,
HEADRequest,
sanitized_Request,
+ urlencode_postdata,
)
@@ -51,63 +50,33 @@ class GDCVaultIE(InfoExtractor):
{
'url': 'http://gdcvault.com/play/1020791/',
'only_matching': True,
- }
+ },
+ {
+ # Hard-coded hostname
+ 'url': 'http://gdcvault.com/play/1023460/Tenacious-Design-and-The-Interface',
+ 'md5': 'a8efb6c31ed06ca8739294960b2dbabd',
+ 'info_dict': {
+ 'id': '1023460',
+ 'ext': 'mp4',
+ 'display_id': 'Tenacious-Design-and-The-Interface',
+ 'title': 'Tenacious Design and The Interface of \'Destiny\'',
+ },
+ },
+ {
+ # Multiple audios
+ 'url': 'http://www.gdcvault.com/play/1014631/Classic-Game-Postmortem-PAC',
+ 'info_dict': {
+ 'id': '1014631',
+ 'ext': 'flv',
+ 'title': 'How to Create a Good Game - From My Experience of Designing Pac-Man',
+ },
+ 'params': {
+ 'skip_download': True, # Requires rtmpdump
+ 'format': 'jp', # The japanese audio
+ }
+ },
]
- def _parse_mp4(self, xml_description):
- video_formats = []
- mp4_video = xml_description.find('./metadata/mp4video')
- if mp4_video is None:
- return None
-
- mobj = re.match(r'(?P<root>https?://.*?/).*', mp4_video.text)
- video_root = mobj.group('root')
- formats = xml_description.findall('./metadata/MBRVideos/MBRVideo')
- for format in formats:
- mobj = re.match(r'mp4\:(?P<path>.*)', format.find('streamName').text)
- url = video_root + mobj.group('path')
- vbr = format.find('bitrate').text
- video_formats.append({
- 'url': url,
- 'vbr': int(vbr),
- })
- return video_formats
-
- def _parse_flv(self, xml_description):
- formats = []
- akamai_url = xml_description.find('./metadata/akamaiHost').text
- audios = xml_description.find('./metadata/audios')
- if audios is not None:
- for audio in audios:
- formats.append({
- 'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url,
- 'play_path': remove_end(audio.get('url'), '.flv'),
- 'ext': 'flv',
- 'vcodec': 'none',
- 'format_id': audio.get('code'),
- })
- slide_video_path = xml_description.find('./metadata/slideVideo').text
- formats.append({
- 'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url,
- 'play_path': remove_end(slide_video_path, '.flv'),
- 'ext': 'flv',
- 'format_note': 'slide deck video',
- 'quality': -2,
- 'preference': -2,
- 'format_id': 'slides',
- })
- speaker_video_path = xml_description.find('./metadata/speakerVideo').text
- formats.append({
- 'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url,
- 'play_path': remove_end(speaker_video_path, '.flv'),
- 'ext': 'flv',
- 'format_note': 'speaker video',
- 'quality': -1,
- 'preference': -1,
- 'format_id': 'speaker',
- })
- return formats
-
def _login(self, webpage_url, display_id):
(username, password) = self._get_login_info()
if username is None or password is None:
@@ -123,7 +92,7 @@ class GDCVaultIE(InfoExtractor):
'password': password,
}
- request = sanitized_Request(login_url, compat_urllib_parse.urlencode(login_form))
+ request = sanitized_Request(login_url, urlencode_postdata(login_form))
request.add_header('Content-Type', 'application/x-www-form-urlencoded')
self._download_webpage(request, display_id, 'Logging in')
start_page = self._download_webpage(webpage_url, display_id, 'Getting authenticated video page')
@@ -159,9 +128,10 @@ class GDCVaultIE(InfoExtractor):
'title': title,
}
+ PLAYER_REGEX = r'<iframe src="(?P<xml_root>.+?)/player.*?\.html.*?".*?</iframe>'
+
xml_root = self._html_search_regex(
- r'<iframe src="(?P<xml_root>.*?)player.html.*?".*?</iframe>',
- start_page, 'xml root', default=None)
+ PLAYER_REGEX, start_page, 'xml root', default=None)
if xml_root is None:
# Probably need to authenticate
login_res = self._login(webpage_url, display_id)
@@ -171,27 +141,21 @@ class GDCVaultIE(InfoExtractor):
start_page = login_res
# Grab the url from the authenticated page
xml_root = self._html_search_regex(
- r'<iframe src="(.*?)player.html.*?".*?</iframe>',
- start_page, 'xml root')
+ PLAYER_REGEX, start_page, 'xml root')
xml_name = self._html_search_regex(
r'<iframe src=".*?\?xml=(.+?\.xml).*?".*?</iframe>',
start_page, 'xml filename', default=None)
if xml_name is None:
# Fallback to the older format
- xml_name = self._html_search_regex(r'<iframe src=".*?\?xmlURL=xml/(?P<xml_file>.+?\.xml).*?".*?</iframe>', start_page, 'xml filename')
-
- xml_description_url = xml_root + 'xml/' + xml_name
- xml_description = self._download_xml(xml_description_url, display_id)
-
- video_title = xml_description.find('./metadata/title').text
- video_formats = self._parse_mp4(xml_description)
- if video_formats is None:
- video_formats = self._parse_flv(xml_description)
+ xml_name = self._html_search_regex(
+ r'<iframe src=".*?\?xmlURL=xml/(?P<xml_file>.+?\.xml).*?".*?</iframe>',
+ start_page, 'xml filename')
return {
+ '_type': 'url_transparent',
'id': video_id,
'display_id': display_id,
- 'title': video_title,
- 'formats': video_formats,
+ 'url': '%s/xml/%s' % (xml_root, xml_name),
+ 'ie_key': 'DigitallySpeaking',
}
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index c6bf8d2..4aa2406 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -47,10 +47,11 @@ from .senateisvp import SenateISVPIE
from .svt import SVTIE
from .pornhub import PornHubIE
from .xhamster import XHamsterEmbedIE
+from .tnaflix import TNAFlixNetworkEmbedIE
from .vimeo import VimeoIE
from .dailymotion import DailymotionCloudIE
from .onionstudios import OnionStudiosIE
-from .snagfilms import SnagFilmsEmbedIE
+from .viewlift import ViewLiftEmbedIE
from .screenwavemedia import ScreenwaveMediaIE
from .mtv import MTVServicesEmbeddedIE
from .pladform import PladformIE
@@ -58,6 +59,11 @@ from .videomore import VideomoreIE
from .googledrive import GoogleDriveIE
from .jwplatform import JWPlatformIE
from .digiteka import DigitekaIE
+from .instagram import InstagramIE
+from .liveleak import LiveLeakIE
+from .threeqsdn import ThreeQSDNIE
+from .theplatform import ThePlatformIE
+from .vessel import VesselIE
class GenericIE(InfoExtractor):
@@ -102,7 +108,8 @@ class GenericIE(InfoExtractor):
'skip_download': True, # infinite live stream
},
'expected_warnings': [
- r'501.*Not Implemented'
+ r'501.*Not Implemented',
+ r'400.*Bad Request',
],
},
# Direct link with incorrect MIME type
@@ -233,11 +240,41 @@ class GenericIE(InfoExtractor):
'ext': 'mp4',
'title': 'car-20120827-manifest',
'formats': 'mincount:9',
+ 'upload_date': '20130904',
},
'params': {
'format': 'bestvideo',
},
},
+ # m3u8 served with Content-Type: audio/x-mpegURL; charset=utf-8
+ {
+ 'url': 'http://once.unicornmedia.com/now/master/playlist/bb0b18ba-64f5-4b1b-a29f-0ac252f06b68/77a785f3-5188-4806-b788-0893a61634ed/93677179-2d99-4ef4-9e17-fe70d49abfbf/content.m3u8',
+ 'info_dict': {
+ 'id': 'content',
+ 'ext': 'mp4',
+ 'title': 'content',
+ 'formats': 'mincount:8',
+ },
+ 'params': {
+ # m3u8 downloads
+ 'skip_download': True,
+ }
+ },
+ # m3u8 served with Content-Type: text/plain
+ {
+ 'url': 'http://www.nacentapps.com/m3u8/index.m3u8',
+ 'info_dict': {
+ 'id': 'index',
+ 'ext': 'mp4',
+ 'title': 'index',
+ 'upload_date': '20140720',
+ 'formats': 'mincount:11',
+ },
+ 'params': {
+ # m3u8 downloads
+ 'skip_download': True,
+ }
+ },
# google redirect
{
'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE',
@@ -375,19 +412,6 @@ class GenericIE(InfoExtractor):
'skip_download': True,
},
},
- # multiple ooyala embeds on SBN network websites
- {
- 'url': 'http://www.sbnation.com/college-football-recruiting/2015/2/3/7970291/national-signing-day-rationalizations-itll-be-ok-itll-be-ok',
- 'info_dict': {
- 'id': 'national-signing-day-rationalizations-itll-be-ok-itll-be-ok',
- 'title': '25 lies you will tell yourself on National Signing Day - SBNation.com',
- },
- 'playlist_mincount': 3,
- 'params': {
- 'skip_download': True,
- },
- 'add_ie': ['Ooyala'],
- },
# embed.ly video
{
'url': 'http://www.tested.com/science/weird/460206-tested-grinding-coffee-2000-frames-second/',
@@ -576,7 +600,11 @@ class GenericIE(InfoExtractor):
'id': 'k2mm4bCdJ6CQ2i7c8o2',
'ext': 'mp4',
'title': 'Le Zap de Spi0n n°216 - Zapping du Web',
+ 'description': 'md5:faf028e48a461b8b7fad38f1e104b119',
'uploader': 'Spi0n',
+ 'uploader_id': 'xgditw',
+ 'upload_date': '20140425',
+ 'timestamp': 1398441542,
},
'add_ie': ['Dailymotion'],
},
@@ -599,13 +627,13 @@ class GenericIE(InfoExtractor):
},
# MTVSercices embed
{
- 'url': 'http://www.gametrailers.com/news-post/76093/north-america-europe-is-getting-that-mario-kart-8-mercedes-dlc-too',
- 'md5': '35727f82f58c76d996fc188f9755b0d5',
+ 'url': 'http://www.vulture.com/2016/06/new-key-peele-sketches-released.html',
+ 'md5': 'ca1aef97695ef2c1d6973256a57e5252',
'info_dict': {
- 'id': '0306a69b-8adf-4fb5-aace-75f8e8cbfca9',
+ 'id': '769f7ec0-0692-4d62-9b45-0d88074bffc1',
'ext': 'mp4',
- 'title': 'Review',
- 'description': 'Mario\'s life in the fast lane has never looked so good.',
+ 'title': 'Key and Peele|October 10, 2012|2|203|Liam Neesons - Uncensored',
+ 'description': 'Two valets share their love for movie star Liam Neesons.',
},
},
# YouTube embed via <data-embed-url="">
@@ -691,15 +719,18 @@ class GenericIE(InfoExtractor):
},
# Wistia embed
{
- 'url': 'http://education-portal.com/academy/lesson/north-american-exploration-failed-colonies-of-spain-france-england.html#lesson',
- 'md5': '8788b683c777a5cf25621eaf286d0c23',
+ 'url': 'http://study.com/academy/lesson/north-american-exploration-failed-colonies-of-spain-france-england.html#lesson',
+ 'md5': '1953f3a698ab51cfc948ed3992a0b7ff',
'info_dict': {
- 'id': '1cfaf6b7ea',
+ 'id': '6e2wtrbdaf',
'ext': 'mov',
- 'title': 'md5:51364a8d3d009997ba99656004b5e20d',
- 'duration': 643.0,
- 'filesize': 182808282,
- 'uploader': 'education-portal.com',
+ 'title': 'paywall_north-american-exploration-failed-colonies-of-spain-france-england',
+ 'description': 'a Paywall Videos video from Remilon',
+ 'duration': 644.072,
+ 'uploader': 'study.com',
+ 'timestamp': 1459678540,
+ 'upload_date': '20160403',
+ 'filesize': 24687186,
},
},
{
@@ -708,10 +739,29 @@ class GenericIE(InfoExtractor):
'info_dict': {
'id': 'uxjb0lwrcz',
'ext': 'mp4',
- 'title': 'Conversation about Hexagonal Rails Part 1 - ThoughtWorks',
+ 'title': 'Conversation about Hexagonal Rails Part 1',
+ 'description': 'a Martin Fowler video from ThoughtWorks',
'duration': 1715.0,
'uploader': 'thoughtworks.wistia.com',
+ 'timestamp': 1401832161,
+ 'upload_date': '20140603',
+ },
+ },
+ # Wistia standard embed (async)
+ {
+ 'url': 'https://www.getdrip.com/university/brennan-dunn-drip-workshop/',
+ 'info_dict': {
+ 'id': '807fafadvk',
+ 'ext': 'mp4',
+ 'title': 'Drip Brennan Dunn Workshop',
+ 'description': 'a JV Webinars video from getdrip-1',
+ 'duration': 4986.95,
+ 'timestamp': 1463607249,
+ 'upload_date': '20160518',
},
+ 'params': {
+ 'skip_download': True,
+ }
},
# Soundcloud embed
{
@@ -735,6 +785,19 @@ class GenericIE(InfoExtractor):
'title': 'Rosetta #CometLanding webcast HL 10',
}
},
+ # Another Livestream embed, without 'new.' in URL
+ {
+ 'url': 'https://www.freespeech.org/',
+ 'info_dict': {
+ 'id': '123537347',
+ 'ext': 'mp4',
+ 'title': 're:^FSTV [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+ },
+ 'params': {
+ # Live stream
+ 'skip_download': True,
+ },
+ },
# LazyYT
{
'url': 'http://discourse.ubuntu.com/t/unity-8-desktop-mode-windows-on-mir/1986',
@@ -819,18 +882,6 @@ class GenericIE(InfoExtractor):
'title': 'EP3S5 - Bon Appétit - Baqueira Mi Corazon !',
}
},
- # Kaltura embed
- {
- 'url': 'http://www.monumentalnetwork.com/videos/john-carlson-postgame-2-25-15',
- 'info_dict': {
- 'id': '1_eergr3h1',
- 'ext': 'mp4',
- 'upload_date': '20150226',
- 'uploader_id': 'MonumentalSports-Kaltura@perfectsensedigital.com',
- 'timestamp': int,
- 'title': 'John Carlson Postgame 2/25/15',
- },
- },
# Kaltura embed (different embed code)
{
'url': 'http://www.premierchristianradio.com/Shows/Saturday/Unbelievable/Conference-Videos/Os-Guinness-Is-It-Fools-Talk-Unbelievable-Conference-2014',
@@ -856,9 +907,23 @@ class GenericIE(InfoExtractor):
'uploader_id': 'echojecka',
},
},
+ # Kaltura embed with single quotes
+ {
+ 'url': 'http://fod.infobase.com/p_ViewPlaylist.aspx?AssignmentID=NUN8ZY',
+ 'info_dict': {
+ 'id': '0_izeg5utt',
+ 'ext': 'mp4',
+ 'title': '35871',
+ 'timestamp': 1355743100,
+ 'upload_date': '20121217',
+ 'uploader_id': 'batchUser',
+ },
+ 'add_ie': ['Kaltura'],
+ },
# Eagle.Platform embed (generic URL)
{
'url': 'http://lenta.ru/news/2015/03/06/navalny/',
+ # Not checking MD5 as sometimes the direct HTTP link results in 404 and HLS is used
'info_dict': {
'id': '227304',
'ext': 'mp4',
@@ -873,6 +938,7 @@ class GenericIE(InfoExtractor):
# ClipYou (Eagle.Platform) embed (custom URL)
{
'url': 'http://muz-tv.ru/play/7129/',
+ # Not checking MD5 as sometimes the direct HTTP link results in 404 and HLS is used
'info_dict': {
'id': '12820',
'ext': 'mp4',
@@ -961,18 +1027,36 @@ class GenericIE(InfoExtractor):
'ext': 'flv',
'title': "PFT Live: New leader in the 'new-look' defense",
'description': 'md5:65a19b4bbfb3b0c0c5768bed1dfad74e',
+ 'uploader': 'NBCU-SPORTS',
+ 'upload_date': '20140107',
+ 'timestamp': 1389118457,
+ },
+ },
+ # NBC News embed
+ {
+ 'url': 'http://www.vulture.com/2016/06/letterman-couldnt-care-less-about-late-night.html',
+ 'md5': '1aa589c675898ae6d37a17913cf68d66',
+ 'info_dict': {
+ 'id': '701714499682',
+ 'ext': 'mp4',
+ 'title': 'PREVIEW: On Assignment: David Letterman',
+ 'description': 'A preview of Tom Brokaw\'s interview with David Letterman as part of the On Assignment series powered by Dateline. Airs Sunday June 12 at 7/6c.',
},
},
# UDN embed
{
- 'url': 'http://www.udn.com/news/story/7314/822787',
+ 'url': 'https://video.udn.com/news/300346',
'md5': 'fd2060e988c326991037b9aff9df21a6',
'info_dict': {
'id': '300346',
'ext': 'mp4',
'title': '中一中男師變性 全校師生力挺',
'thumbnail': 're:^https?://.*\.jpg$',
- }
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
},
# Ooyala embed
{
@@ -989,20 +1073,6 @@ class GenericIE(InfoExtractor):
'skip_download': True,
}
},
- # Contains a SMIL manifest
- {
- 'url': 'http://www.telewebion.com/fa/1263668/%D9%82%D8%B1%D8%B9%D9%87%E2%80%8C%DA%A9%D8%B4%DB%8C-%D9%84%DB%8C%DA%AF-%D9%82%D9%87%D8%B1%D9%85%D8%A7%D9%86%D8%A7%D9%86-%D8%A7%D8%B1%D9%88%D9%BE%D8%A7/%2B-%D9%81%D9%88%D8%AA%D8%A8%D8%A7%D9%84.html',
- 'info_dict': {
- 'id': 'file',
- 'ext': 'flv',
- 'title': '+ Football: Lottery Champions League Europe',
- 'uploader': 'www.telewebion.com',
- },
- 'params': {
- # rtmpe downloads
- 'skip_download': True,
- }
- },
# Brightcove URL in single quotes
{
'url': 'http://www.sportsnet.ca/baseball/mlb/sn-presents-russell-martin-world-citizen/',
@@ -1013,6 +1083,9 @@ class GenericIE(InfoExtractor):
'title': 'SN Presents: Russell Martin, World Citizen',
'description': 'To understand why he was the Toronto Blue Jays’ top off-season priority is to appreciate his background and upbringing in Montreal, where he first developed his baseball skills. Written and narrated by Stephen Brunt.',
'uploader': 'Rogers Sportsnet',
+ 'uploader_id': '1704050871',
+ 'upload_date': '20150525',
+ 'timestamp': 1432570283,
},
},
# Dailymotion Cloud video
@@ -1093,7 +1166,60 @@ class GenericIE(InfoExtractor):
# m3u8 downloads
'skip_download': True,
}
- }
+ },
+ # Brightcove embed, with no valid 'renditions' but valid 'IOSRenditions'
+ # This video can't be played in browsers if Flash disabled and UA set to iPhone, which is actually a false alarm
+ {
+ 'url': 'https://dl.dropboxusercontent.com/u/29092637/interview.html',
+ 'info_dict': {
+ 'id': '4785848093001',
+ 'ext': 'mp4',
+ 'title': 'The Cardinal Pell Interview',
+ 'description': 'Sky News Contributor Andrew Bolt interviews George Pell in Rome, following the Cardinal\'s evidence before the Royal Commission into Child Abuse. ',
+ 'uploader': 'GlobeCast Australia - GlobeStream',
+ 'uploader_id': '2733773828001',
+ 'upload_date': '20160304',
+ 'timestamp': 1457083087,
+ },
+ 'params': {
+ # m3u8 downloads
+ 'skip_download': True,
+ },
+ },
+ # Another form of arte.tv embed
+ {
+ 'url': 'http://www.tv-replay.fr/redirection/09-04-16/arte-reportage-arte-11508975.html',
+ 'md5': '850bfe45417ddf221288c88a0cffe2e2',
+ 'info_dict': {
+ 'id': '030273-562_PLUS7-F',
+ 'ext': 'mp4',
+ 'title': 'ARTE Reportage - Nulle part, en France',
+ 'description': 'md5:e3a0e8868ed7303ed509b9e3af2b870d',
+ 'upload_date': '20160409',
+ },
+ },
+ # LiveLeak embed
+ {
+ 'url': 'http://www.wykop.pl/link/3088787/',
+ 'md5': 'ace83b9ed19b21f68e1b50e844fdf95d',
+ 'info_dict': {
+ 'id': '874_1459135191',
+ 'ext': 'mp4',
+ 'title': 'Man shows poor quality of new apartment building',
+ 'description': 'The wall is like a sand pile.',
+ 'uploader': 'Lake8737',
+ }
+ },
+ # Duplicated embedded video URLs
+ {
+ 'url': 'http://www.hudl.com/athlete/2538180/highlights/149298443',
+ 'info_dict': {
+ 'id': '149298443_480_16c25b74_2',
+ 'ext': 'mp4',
+ 'title': 'vs. Blue Orange Spring Game',
+ 'uploader': 'www.hudl.com',
+ },
+ },
]
def report_following_redirect(self, new_url):
@@ -1241,28 +1367,31 @@ class GenericIE(InfoExtractor):
full_response = self._request_webpage(request, video_id)
head_response = full_response
+ info_dict = {
+ 'id': video_id,
+ 'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]),
+ 'upload_date': unified_strdate(head_response.headers.get('Last-Modified'))
+ }
+
# Check for direct link to a video
- content_type = head_response.headers.get('Content-Type', '')
- m = re.match(r'^(?P<type>audio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P<format_id>.+)$', content_type)
+ content_type = head_response.headers.get('Content-Type', '').lower()
+ m = re.match(r'^(?P<type>audio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P<format_id>[^;\s]+)', content_type)
if m:
- upload_date = unified_strdate(
- head_response.headers.get('Last-Modified'))
- formats = []
- if m.group('format_id').endswith('mpegurl'):
+ format_id = m.group('format_id')
+ if format_id.endswith('mpegurl'):
formats = self._extract_m3u8_formats(url, video_id, 'mp4')
+ elif format_id == 'f4m':
+ formats = self._extract_f4m_formats(url, video_id)
else:
formats = [{
'format_id': m.group('format_id'),
'url': url,
'vcodec': 'none' if m.group('type') == 'audio' else None
}]
- return {
- 'id': video_id,
- 'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]),
- 'direct': True,
- 'formats': formats,
- 'upload_date': upload_date,
- }
+ info_dict['direct'] = True
+ self._sort_formats(formats)
+ info_dict['formats'] = formats
+ return info_dict
if not self._downloader.params.get('test', False) and not is_intentional:
force = self._downloader.params.get('force_generic_extractor', False)
@@ -1282,21 +1411,24 @@ class GenericIE(InfoExtractor):
request.add_header('Accept-Encoding', '*')
full_response = self._request_webpage(request, video_id)
+ first_bytes = full_response.read(512)
+
+ # Is it an M3U playlist?
+ if first_bytes.startswith(b'#EXTM3U'):
+ info_dict['formats'] = self._extract_m3u8_formats(url, video_id, 'mp4')
+ self._sort_formats(info_dict['formats'])
+ return info_dict
+
# Maybe it's a direct link to a video?
# Be careful not to download the whole thing!
- first_bytes = full_response.read(512)
if not is_html(first_bytes):
self._downloader.report_warning(
'URL could be a direct video link, returning it as such.')
- upload_date = unified_strdate(
- head_response.headers.get('Last-Modified'))
- return {
- 'id': video_id,
- 'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]),
+ info_dict.update({
'direct': True,
'url': url,
- 'upload_date': upload_date,
- }
+ })
+ return info_dict
webpage = self._webpage_read_content(
full_response, url, video_id, prefix=first_bytes)
@@ -1309,16 +1441,20 @@ class GenericIE(InfoExtractor):
if doc.tag == 'rss':
return self._extract_rss(url, video_id, doc)
elif re.match(r'^(?:{[^}]+})?smil$', doc.tag):
- return self._parse_smil(doc, url, video_id)
+ smil = self._parse_smil(doc, url, video_id)
+ self._sort_formats(smil['formats'])
+ return smil
elif doc.tag == '{http://xspf.org/ns/0/}playlist':
return self.playlist_result(self._parse_xspf(doc, video_id), video_id)
elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag):
- return {
- 'id': video_id,
- 'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]),
- 'formats': self._parse_mpd_formats(
- doc, video_id, mpd_base_url=url.rpartition('/')[0]),
- }
+ info_dict['formats'] = self._parse_mpd_formats(
+ doc, video_id, mpd_base_url=url.rpartition('/')[0])
+ self._sort_formats(info_dict['formats'])
+ return info_dict
+ elif re.match(r'^{http://ns\.adobe\.com/f4m/[12]\.0}manifest$', doc.tag):
+ info_dict['formats'] = self._parse_f4m_formats(doc, url, video_id)
+ self._sort_formats(info_dict['formats'])
+ return info_dict
except compat_xml_parse_error:
pass
@@ -1338,7 +1474,8 @@ class GenericIE(InfoExtractor):
# Site Name | Video Title
# Video Title - Tagline | Site Name
# and so on and so forth; it's just not practical
- video_title = self._html_search_regex(
+ video_title = self._og_search_title(
+ webpage, default=None) or self._html_search_regex(
r'(?s)<title>(.*?)</title>', webpage, 'video title',
default='video')
@@ -1356,6 +1493,9 @@ class GenericIE(InfoExtractor):
video_uploader = self._search_regex(
r'^(?:https?://)?([^/]*)/.*', url, 'video uploader')
+ video_description = self._og_search_description(webpage, default=None)
+ video_thumbnail = self._og_search_thumbnail(webpage, default=None)
+
# Helper method
def _playlist_from_matches(matches, getter=None, ie=None):
urlrs = orderedSet(
@@ -1386,6 +1526,16 @@ class GenericIE(InfoExtractor):
if bc_urls:
return _playlist_from_matches(bc_urls, ie='BrightcoveNew')
+ # Look for ThePlatform embeds
+ tp_urls = ThePlatformIE._extract_urls(webpage)
+ if tp_urls:
+ return _playlist_from_matches(tp_urls, ie='ThePlatform')
+
+ # Look for Vessel embeds
+ vessel_urls = VesselIE._extract_urls(webpage)
+ if vessel_urls:
+ return _playlist_from_matches(vessel_urls, ie=VesselIE.ie_key())
+
# Look for embedded rtl.nl player
matches = re.findall(
r'<iframe[^>]+?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+(?:video_)?embed[^"]+)"',
@@ -1454,21 +1604,26 @@ class GenericIE(InfoExtractor):
'url': embed_url,
'ie_key': 'Wistia',
'uploader': video_uploader,
- 'title': video_title,
- 'id': video_id,
}
match = re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage)
if match:
return {
'_type': 'url_transparent',
- 'url': 'http://fast.wistia.net/embed/iframe/{0:}'.format(match.group('id')),
+ 'url': 'wistia:%s' % match.group('id'),
'ie_key': 'Wistia',
'uploader': video_uploader,
- 'title': video_title,
- 'id': match.group('id')
}
+ match = re.search(
+ r'''(?sx)
+ <script[^>]+src=(["'])(?:https?:)?//fast\.wistia\.com/assets/external/E-v1\.js\1[^>]*>.*?
+ <div[^>]+class=(["']).*?\bwistia_async_(?P<id>[a-z0-9]+)\b.*?\2
+ ''', webpage)
+ if match:
+ return self.url_result(self._proto_relative_url(
+ 'wistia:%s' % match.group('id')), 'Wistia')
+
# Look for SVT player
svt_url = SVTIE._extract_url(webpage)
if svt_url:
@@ -1633,6 +1788,11 @@ class GenericIE(InfoExtractor):
if xhamster_urls:
return _playlist_from_matches(xhamster_urls, ie='XHamsterEmbed')
+ # Look for embedded TNAFlixNetwork player
+ tnaflix_urls = TNAFlixNetworkEmbedIE._extract_urls(webpage)
+ if tnaflix_urls:
+ return _playlist_from_matches(tnaflix_urls, ie=TNAFlixNetworkEmbedIE.ie_key())
+
# Look for embedded Tvigle player
mobj = re.search(
r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//cloud\.tvigle\.ru/video/.+?)\1', webpage)
@@ -1653,7 +1813,7 @@ class GenericIE(InfoExtractor):
# Look for embedded arte.tv player
mobj = re.search(
- r'<script [^>]*?src="(?P<url>http://www\.arte\.tv/playerv2/embed[^"]+)"',
+ r'<(?:script|iframe) [^>]*?src="(?P<url>http://www\.arte\.tv/(?:playerv2/embed|arte_vp/index)[^"]+)"',
webpage)
if mobj is not None:
return self.url_result(mobj.group('url'), 'ArteTVEmbed')
@@ -1683,14 +1843,6 @@ class GenericIE(InfoExtractor):
url = unescapeHTML(mobj.group('url'))
return self.url_result(url)
- # Look for embedded vulture.com player
- mobj = re.search(
- r'<iframe src="(?P<url>https?://video\.vulture\.com/[^"]+)"',
- webpage)
- if mobj is not None:
- url = unescapeHTML(mobj.group('url'))
- return self.url_result(url, ie='Vulture')
-
# Look for embedded mtvservices player
mtvservices_url = MTVServicesEmbeddedIE._extract_url(webpage)
if mtvservices_url:
@@ -1739,7 +1891,7 @@ class GenericIE(InfoExtractor):
return self.url_result(self._proto_relative_url(mobj.group('url'), scheme='http:'), 'CondeNast')
mobj = re.search(
- r'<iframe[^>]+src="(?P<url>https?://new\.livestream\.com/[^"]+/player[^"]+)"',
+ r'<iframe[^>]+src="(?P<url>https?://(?:new\.)?livestream\.com/[^"]+/player[^"]+)"',
webpage)
if mobj is not None:
return self.url_result(mobj.group('url'), 'Livestream')
@@ -1751,7 +1903,7 @@ class GenericIE(InfoExtractor):
return self.url_result(mobj.group('url'), 'Zapiks')
# Look for Kaltura embeds
- mobj = (re.search(r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?'wid'\s*:\s*'_?(?P<partner_id>[^']+)',.*?'entry_?[Ii]d'\s*:\s*'(?P<id>[^']+)',", webpage) or
+ mobj = (re.search(r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?(?P<q1>['\"])wid(?P=q1)\s*:\s*(?P<q2>['\"])_?(?P<partner_id>[^'\"]+)(?P=q2),.*?(?P<q3>['\"])entry_?[Ii]d(?P=q3)\s*:\s*(?P<q4>['\"])(?P<id>[^'\"]+)(?P=q4),", webpage) or
re.search(r'(?s)(?P<q1>["\'])(?:https?:)?//cdnapi(?:sec)?\.kaltura\.com/.*?(?:p|partner_id)/(?P<partner_id>\d+).*?(?P=q1).*?entry_?[Ii]d\s*:\s*(?P<q2>["\'])(?P<id>.+?)(?P=q2)', webpage))
if mobj is not None:
return self.url_result(smuggle_url(
@@ -1803,6 +1955,12 @@ class GenericIE(InfoExtractor):
if nbc_sports_url:
return self.url_result(nbc_sports_url, 'NBCSportsVPlayer')
+ # Look for NBC News embeds
+ nbc_news_embed_url = re.search(
+ r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//www\.nbcnews\.com/widget/video-embed/[^"\']+)\1', webpage)
+ if nbc_news_embed_url:
+ return self.url_result(nbc_news_embed_url.group('url'), 'NBCNews')
+
# Look for Google Drive embeds
google_drive_url = GoogleDriveIE._extract_url(webpage)
if google_drive_url:
@@ -1830,10 +1988,10 @@ class GenericIE(InfoExtractor):
if onionstudios_url:
return self.url_result(onionstudios_url)
- # Look for SnagFilms embeds
- snagfilms_url = SnagFilmsEmbedIE._extract_url(webpage)
- if snagfilms_url:
- return self.url_result(snagfilms_url)
+ # Look for ViewLift embeds
+ viewlift_url = ViewLiftEmbedIE._extract_url(webpage)
+ if viewlift_url:
+ return self.url_result(viewlift_url)
# Look for JWPlatform embeds
jwplatform_url = JWPlatformIE._extract_url(webpage)
@@ -1870,6 +2028,38 @@ class GenericIE(InfoExtractor):
self._proto_relative_url(unescapeHTML(mobj.group(1))),
'AdobeTVVideo')
+ # Look for Vine embeds
+ mobj = re.search(
+ r'<iframe[^>]+src=[\'"]((?:https?:)?//(?:www\.)?vine\.co/v/[^/]+/embed/(?:simple|postcard))',
+ webpage)
+ if mobj is not None:
+ return self.url_result(
+ self._proto_relative_url(unescapeHTML(mobj.group(1))), 'Vine')
+
+ # Look for Instagram embeds
+ instagram_embed_url = InstagramIE._extract_embed_url(webpage)
+ if instagram_embed_url is not None:
+ return self.url_result(
+ self._proto_relative_url(instagram_embed_url), InstagramIE.ie_key())
+
+ # Look for LiveLeak embeds
+ liveleak_url = LiveLeakIE._extract_url(webpage)
+ if liveleak_url:
+ return self.url_result(liveleak_url, 'LiveLeak')
+
+ # Look for 3Q SDN embeds
+ threeqsdn_url = ThreeQSDNIE._extract_url(webpage)
+ if threeqsdn_url:
+ return {
+ '_type': 'url_transparent',
+ 'ie_key': ThreeQSDNIE.ie_key(),
+ 'url': self._proto_relative_url(threeqsdn_url),
+ 'title': video_title,
+ 'description': video_description,
+ 'thumbnail': video_thumbnail,
+ 'uploader': video_uploader,
+ }
+
def check_video(vurl):
if YoutubeIE.suitable(vurl):
return True
@@ -1950,7 +2140,8 @@ class GenericIE(InfoExtractor):
raise UnsupportedError(url)
entries = []
- for video_url in found:
+ for video_url in orderedSet(found):
+ video_url = unescapeHTML(video_url)
video_url = video_url.replace('\\/', '/')
video_url = compat_urlparse.urljoin(url, video_url)
video_id = compat_urllib_parse_unquote(os.path.basename(video_url))
@@ -1979,9 +2170,14 @@ class GenericIE(InfoExtractor):
entry_info_dict['formats'] = self._extract_m3u8_formats(video_url, video_id, ext='mp4')
elif ext == 'mpd':
entry_info_dict['formats'] = self._extract_mpd_formats(video_url, video_id)
+ elif ext == 'f4m':
+ entry_info_dict['formats'] = self._extract_f4m_formats(video_url, video_id)
else:
entry_info_dict['url'] = video_url
+ if entry_info_dict.get('formats'):
+ self._sort_formats(entry_info_dict['formats'])
+
entries.append(entry_info_dict)
if len(entries) == 1:
diff --git a/youtube_dl/extractor/glide.py b/youtube_dl/extractor/glide.py
index 9561ed5..62ff848 100644
--- a/youtube_dl/extractor/glide.py
+++ b/youtube_dl/extractor/glide.py
@@ -2,6 +2,7 @@
from __future__ import unicode_literals
from .common import InfoExtractor
+from ..utils import unified_strdate
class GlideIE(InfoExtractor):
@@ -15,26 +16,38 @@ class GlideIE(InfoExtractor):
'ext': 'mp4',
'title': 'Damon Timm\'s Glide message',
'thumbnail': 're:^https?://.*?\.cloudfront\.net/.*\.jpg$',
+ 'uploader': 'Damon Timm',
+ 'upload_date': '20140919',
}
}
def _real_extract(self, url):
video_id = self._match_id(url)
+
webpage = self._download_webpage(url, video_id)
+
title = self._html_search_regex(
- r'<title>(.*?)</title>', webpage, 'title')
- video_url = self.http_scheme() + self._search_regex(
- r'<source src="(.*?)" type="video/mp4">', webpage, 'video URL')
- thumbnail_url = self._search_regex(
- r'<img id="video-thumbnail" src="(.*?)"',
- webpage, 'thumbnail url', fatal=False)
- thumbnail = (
- thumbnail_url if thumbnail_url is None
- else self.http_scheme() + thumbnail_url)
+ r'<title>(.+?)</title>', webpage, 'title')
+ video_url = self._proto_relative_url(self._search_regex(
+ r'<source[^>]+src=(["\'])(?P<url>.+?)\1',
+ webpage, 'video URL', default=None,
+ group='url')) or self._og_search_video_url(webpage)
+ thumbnail = self._proto_relative_url(self._search_regex(
+ r'<img[^>]+id=["\']video-thumbnail["\'][^>]+src=(["\'])(?P<url>.+?)\1',
+ webpage, 'thumbnail url', default=None,
+ group='url')) or self._og_search_thumbnail(webpage)
+ uploader = self._search_regex(
+ r'<div[^>]+class=["\']info-name["\'][^>]*>([^<]+)',
+ webpage, 'uploader', fatal=False)
+ upload_date = unified_strdate(self._search_regex(
+ r'<div[^>]+class="info-date"[^>]*>([^<]+)',
+ webpage, 'upload date', fatal=False))
return {
'id': video_id,
'title': title,
'url': video_url,
'thumbnail': thumbnail,
+ 'uploader': uploader,
+ 'upload_date': upload_date,
}
diff --git a/youtube_dl/extractor/godtv.py b/youtube_dl/extractor/godtv.py
new file mode 100644
index 0000000..c5d3b4e
--- /dev/null
+++ b/youtube_dl/extractor/godtv.py
@@ -0,0 +1,66 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from .ooyala import OoyalaIE
+from ..utils import js_to_json
+
+
+class GodTVIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?god\.tv(?:/[^/]+)*/(?P<id>[^/?#&]+)'
+ _TESTS = [{
+ 'url': 'http://god.tv/jesus-image/video/jesus-conference-2016/randy-needham',
+ 'info_dict': {
+ 'id': 'lpd3g2MzE6D1g8zFAKz8AGpxWcpu6o_3',
+ 'ext': 'mp4',
+ 'title': 'Randy Needham',
+ 'duration': 3615.08,
+ },
+ 'params': {
+ 'skip_download': True,
+ }
+ }, {
+ 'url': 'http://god.tv/playlist/bible-study',
+ 'info_dict': {
+ 'id': 'bible-study',
+ },
+ 'playlist_mincount': 37,
+ }, {
+ 'url': 'http://god.tv/node/15097',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://god.tv/live/africa',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://god.tv/liveevents',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, display_id)
+
+ settings = self._parse_json(
+ self._search_regex(
+ r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);',
+ webpage, 'settings', default='{}'),
+ display_id, transform_source=js_to_json, fatal=False)
+
+ ooyala_id = None
+
+ if settings:
+ playlist = settings.get('playlist')
+ if playlist and isinstance(playlist, list):
+ entries = [
+ OoyalaIE._build_url_result(video['content_id'])
+ for video in playlist if video.get('content_id')]
+ if entries:
+ return self.playlist_result(entries, display_id)
+ ooyala_id = settings.get('ooyala', {}).get('content_id')
+
+ if not ooyala_id:
+ ooyala_id = self._search_regex(
+ r'["\']content_id["\']\s*:\s*(["\'])(?P<id>[\w-]+)\1',
+ webpage, 'ooyala id', group='id')
+
+ return OoyalaIE._build_url_result(ooyala_id)
diff --git a/youtube_dl/extractor/googledrive.py b/youtube_dl/extractor/googledrive.py
index 37be340..766fc26 100644
--- a/youtube_dl/extractor/googledrive.py
+++ b/youtube_dl/extractor/googledrive.py
@@ -10,8 +10,8 @@ from ..utils import (
class GoogleDriveIE(InfoExtractor):
- _VALID_URL = r'https?://(?:(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/)|video\.google\.com/get_player\?.*?docid=)(?P<id>[a-zA-Z0-9_-]{28})'
- _TEST = {
+ _VALID_URL = r'https?://(?:(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/)|video\.google\.com/get_player\?.*?docid=)(?P<id>[a-zA-Z0-9_-]{28,})'
+ _TESTS = [{
'url': 'https://drive.google.com/file/d/0ByeS4oOUV-49Zzh4R1J6R09zazQ/edit?pli=1',
'md5': '881f7700aec4f538571fa1e0eed4a7b6',
'info_dict': {
@@ -20,7 +20,11 @@ class GoogleDriveIE(InfoExtractor):
'title': 'Big Buck Bunny.mp4',
'duration': 46,
}
- }
+ }, {
+ # video id is longer than 28 characters
+ 'url': 'https://drive.google.com/file/d/1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ/edit',
+ 'only_matching': True,
+ }]
_FORMATS_EXT = {
'5': 'flv',
'6': 'flv',
@@ -43,7 +47,7 @@ class GoogleDriveIE(InfoExtractor):
@staticmethod
def _extract_url(webpage):
mobj = re.search(
- r'<iframe[^>]+src="https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9_-]{28})',
+ r'<iframe[^>]+src="https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9_-]{28,})',
webpage)
if mobj:
return 'https://drive.google.com/file/d/%s' % mobj.group('id')
diff --git a/youtube_dl/extractor/goshgay.py b/youtube_dl/extractor/goshgay.py
index 1d91664..0c01514 100644
--- a/youtube_dl/extractor/goshgay.py
+++ b/youtube_dl/extractor/goshgay.py
@@ -14,13 +14,13 @@ class GoshgayIE(InfoExtractor):
_VALID_URL = r'https?://www\.goshgay\.com/video(?P<id>\d+?)($|/)'
_TEST = {
'url': 'http://www.goshgay.com/video299069/diesel_sfw_xxx_video',
- 'md5': '027fcc54459dff0feb0bc06a7aeda680',
+ 'md5': '4b6db9a0a333142eb9f15913142b0ed1',
'info_dict': {
'id': '299069',
'ext': 'flv',
'title': 'DIESEL SFW XXX Video',
'thumbnail': 're:^http://.*\.jpg$',
- 'duration': 79,
+ 'duration': 80,
'age_limit': 18,
}
}
@@ -47,5 +47,5 @@ class GoshgayIE(InfoExtractor):
'title': title,
'thumbnail': thumbnail,
'duration': duration,
- 'age_limit': self._family_friendly_search(webpage),
+ 'age_limit': 18,
}
diff --git a/youtube_dl/extractor/gputechconf.py b/youtube_dl/extractor/gputechconf.py
index 145b55b..73dc62c 100644
--- a/youtube_dl/extractor/gputechconf.py
+++ b/youtube_dl/extractor/gputechconf.py
@@ -2,12 +2,6 @@
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..utils import (
- xpath_element,
- xpath_text,
- int_or_none,
- parse_duration,
-)
class GPUTechConfIE(InfoExtractor):
@@ -27,29 +21,15 @@ class GPUTechConfIE(InfoExtractor):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- root_path = self._search_regex(r'var\s+rootPath\s*=\s*"([^"]+)', webpage, 'root path', 'http://evt.dispeak.com/nvidia/events/gtc15/')
- xml_file_id = self._search_regex(r'var\s+xmlFileId\s*=\s*"([^"]+)', webpage, 'xml file id')
-
- doc = self._download_xml('%sxml/%s.xml' % (root_path, xml_file_id), video_id)
-
- metadata = xpath_element(doc, 'metadata')
- http_host = xpath_text(metadata, 'httpHost', 'http host', True)
- mbr_videos = xpath_element(metadata, 'MBRVideos')
-
- formats = []
- for mbr_video in mbr_videos.findall('MBRVideo'):
- stream_name = xpath_text(mbr_video, 'streamName')
- if stream_name:
- formats.append({
- 'url': 'http://%s/%s' % (http_host, stream_name.replace('mp4:', '')),
- 'tbr': int_or_none(xpath_text(mbr_video, 'bitrate')),
- })
- self._sort_formats(formats)
+ root_path = self._search_regex(
+ r'var\s+rootPath\s*=\s*"([^"]+)', webpage, 'root path',
+ default='http://evt.dispeak.com/nvidia/events/gtc15/')
+ xml_file_id = self._search_regex(
+ r'var\s+xmlFileId\s*=\s*"([^"]+)', webpage, 'xml file id')
return {
+ '_type': 'url_transparent',
'id': video_id,
- 'title': xpath_text(metadata, 'title'),
- 'duration': parse_duration(xpath_text(metadata, 'endTime')),
- 'creator': xpath_text(metadata, 'speaker'),
- 'formats': formats,
+ 'url': '%sxml/%s.xml' % (root_path, xml_file_id),
+ 'ie_key': 'DigitallySpeaking',
}
diff --git a/youtube_dl/extractor/groupon.py b/youtube_dl/extractor/groupon.py
index 63c05b6..a6da909 100644
--- a/youtube_dl/extractor/groupon.py
+++ b/youtube_dl/extractor/groupon.py
@@ -4,7 +4,7 @@ from .common import InfoExtractor
class GrouponIE(InfoExtractor):
- _VALID_URL = r'https?://www\.groupon\.com/deals/(?P<id>[^?#]+)'
+ _VALID_URL = r'https?://(?:www\.)?groupon\.com/deals/(?P<id>[^/?#&]+)'
_TEST = {
'url': 'https://www.groupon.com/deals/bikram-yoga-huntington-beach-2#ooid=tubGNycTo_9Uxg82uESj4i61EYX8nyuf',
@@ -14,17 +14,27 @@ class GrouponIE(InfoExtractor):
'description': 'Studio kept at 105 degrees and 40% humidity with anti-microbial and anti-slip Flotex flooring; certified instructors',
},
'playlist': [{
+ 'md5': '42428ce8a00585f9bc36e49226eae7a1',
'info_dict': {
- 'id': 'tubGNycTo_9Uxg82uESj4i61EYX8nyuf',
+ 'id': 'fk6OhWpXgIQ',
'ext': 'mp4',
- 'title': 'Bikram Yoga Huntington Beach | Orange County',
+ 'title': 'Bikram Yoga Huntington Beach | Orange County !tubGNycTo@9Uxg82uESj4i61EYX8nyuf',
'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
- 'duration': 44.961,
+ 'duration': 45,
+ 'upload_date': '20160405',
+ 'uploader_id': 'groupon',
+ 'uploader': 'Groupon',
},
+ 'add_ie': ['Youtube'],
}],
'params': {
- 'skip_download': 'HLS',
- }
+ 'skip_download': True,
+ },
+ }
+
+ _PROVIDERS = {
+ 'ooyala': ('ooyala:%s', 'Ooyala'),
+ 'youtube': ('%s', 'Youtube'),
}
def _real_extract(self, url):
@@ -32,16 +42,21 @@ class GrouponIE(InfoExtractor):
webpage = self._download_webpage(url, playlist_id)
payload = self._parse_json(self._search_regex(
- r'var\s+payload\s*=\s*(.*?);\n', webpage, 'payload'), playlist_id)
+ r'(?:var\s+|window\.)payload\s*=\s*(.*?);\n', webpage, 'payload'), playlist_id)
videos = payload['carousel'].get('dealVideos', [])
entries = []
for v in videos:
- if v.get('provider') != 'OOYALA':
+ provider = v.get('provider')
+ video_id = v.get('media') or v.get('id') or v.get('baseURL')
+ if not provider or not video_id:
+ continue
+ url_pattern, ie_key = self._PROVIDERS.get(provider.lower())
+ if not url_pattern:
self.report_warning(
'%s: Unsupported video provider %s, skipping video' %
- (playlist_id, v.get('provider')))
+ (playlist_id, provider))
continue
- entries.append(self.url_result('ooyala:%s' % v['media']))
+ entries.append(self.url_result(url_pattern % video_id, ie_key))
return {
'_type': 'playlist',
diff --git a/youtube_dl/extractor/hbo.py b/youtube_dl/extractor/hbo.py
new file mode 100644
index 0000000..dad0f39
--- /dev/null
+++ b/youtube_dl/extractor/hbo.py
@@ -0,0 +1,122 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ xpath_text,
+ xpath_element,
+ int_or_none,
+ parse_duration,
+)
+
+
+class HBOIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?hbo\.com/video/video\.html\?.*vid=(?P<id>[0-9]+)'
+ _TEST = {
+ 'url': 'http://www.hbo.com/video/video.html?autoplay=true&g=u&vid=1437839',
+ 'md5': '1c33253f0c7782142c993c0ba62a8753',
+ 'info_dict': {
+ 'id': '1437839',
+ 'ext': 'mp4',
+ 'title': 'Ep. 64 Clip: Encryption',
+ }
+ }
+ _FORMATS_INFO = {
+ '1920': {
+ 'width': 1280,
+ 'height': 720,
+ },
+ '640': {
+ 'width': 768,
+ 'height': 432,
+ },
+ 'highwifi': {
+ 'width': 640,
+ 'height': 360,
+ },
+ 'high3g': {
+ 'width': 640,
+ 'height': 360,
+ },
+ 'medwifi': {
+ 'width': 400,
+ 'height': 224,
+ },
+ 'med3g': {
+ 'width': 400,
+ 'height': 224,
+ },
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ video_data = self._download_xml(
+ 'http://render.lv3.hbo.com/data/content/global/videos/data/%s.xml' % video_id, video_id)
+ title = xpath_text(video_data, 'title', 'title', True)
+
+ formats = []
+ for source in xpath_element(video_data, 'videos', 'sources', True):
+ if source.tag == 'size':
+ path = xpath_text(source, './/path')
+ if not path:
+ continue
+ width = source.attrib.get('width')
+ format_info = self._FORMATS_INFO.get(width, {})
+ height = format_info.get('height')
+ fmt = {
+ 'url': path,
+ 'format_id': 'http%s' % ('-%dp' % height if height else ''),
+ 'width': format_info.get('width'),
+ 'height': height,
+ }
+ rtmp = re.search(r'^(?P<url>rtmpe?://[^/]+/(?P<app>.+))/(?P<playpath>mp4:.+)$', path)
+ if rtmp:
+ fmt.update({
+ 'url': rtmp.group('url'),
+ 'play_path': rtmp.group('playpath'),
+ 'app': rtmp.group('app'),
+ 'ext': 'flv',
+ 'format_id': fmt['format_id'].replace('http', 'rtmp'),
+ })
+ formats.append(fmt)
+ else:
+ video_url = source.text
+ if not video_url:
+ continue
+ if source.tag == 'tarball':
+ formats.extend(self._extract_m3u8_formats(
+ video_url.replace('.tar', '/base_index_w8.m3u8'),
+ video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
+ else:
+ format_info = self._FORMATS_INFO.get(source.tag, {})
+ formats.append({
+ 'format_id': 'http-%s' % source.tag,
+ 'url': video_url,
+ 'width': format_info.get('width'),
+ 'height': format_info.get('height'),
+ })
+ self._sort_formats(formats, ('width', 'height', 'tbr', 'format_id'))
+
+ thumbnails = []
+ card_sizes = xpath_element(video_data, 'titleCardSizes')
+ if card_sizes is not None:
+ for size in card_sizes:
+ path = xpath_text(size, 'path')
+ if not path:
+ continue
+ width = int_or_none(size.get('width'))
+ thumbnails.append({
+ 'id': width,
+ 'url': path,
+ 'width': width,
+ })
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'duration': parse_duration(xpath_element(video_data, 'duration/tv14')),
+ 'formats': formats,
+ 'thumbnails': thumbnails,
+ }
diff --git a/youtube_dl/extractor/hearthisat.py b/youtube_dl/extractor/hearthisat.py
index 7d86986..2564538 100644
--- a/youtube_dl/extractor/hearthisat.py
+++ b/youtube_dl/extractor/hearthisat.py
@@ -7,6 +7,7 @@ from .common import InfoExtractor
from ..compat import compat_urlparse
from ..utils import (
HEADRequest,
+ KNOWN_EXTENSIONS,
sanitized_Request,
str_to_int,
urlencode_postdata,
@@ -17,7 +18,7 @@ from ..utils import (
class HearThisAtIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?hearthis\.at/(?P<artist>[^/]+)/(?P<title>[A-Za-z0-9\-]+)/?$'
_PLAYLIST_URL = 'https://hearthis.at/playlist.php'
- _TEST = {
+ _TESTS = [{
'url': 'https://hearthis.at/moofi/dr-kreep',
'md5': 'ab6ec33c8fed6556029337c7885eb4e0',
'info_dict': {
@@ -26,7 +27,7 @@ class HearThisAtIE(InfoExtractor):
'title': 'Moofi - Dr. Kreep',
'thumbnail': 're:^https?://.*\.jpg$',
'timestamp': 1421564134,
- 'description': 'Creepy Patch. Mutable Instruments Braids Vowel + Formant Mode.',
+ 'description': 'Listen to Dr. Kreep by Moofi on hearthis.at - Modular, Eurorack, Mutable Intruments Braids, Valhalla-DSP',
'upload_date': '20150118',
'comment_count': int,
'view_count': int,
@@ -34,7 +35,25 @@ class HearThisAtIE(InfoExtractor):
'duration': 71,
'categories': ['Experimental'],
}
- }
+ }, {
+ # 'download' link redirects to the original webpage
+ 'url': 'https://hearthis.at/twitchsf/dj-jim-hopkins-totally-bitchin-80s-dance-mix/',
+ 'md5': '5980ceb7c461605d30f1f039df160c6e',
+ 'info_dict': {
+ 'id': '811296',
+ 'ext': 'mp3',
+ 'title': 'TwitchSF - DJ Jim Hopkins - Totally Bitchin\' 80\'s Dance Mix!',
+ 'description': 'Listen to DJ Jim Hopkins - Totally Bitchin\' 80\'s Dance Mix! by TwitchSF on hearthis.at - Dance',
+ 'upload_date': '20160328',
+ 'timestamp': 1459186146,
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'comment_count': int,
+ 'view_count': int,
+ 'like_count': int,
+ 'duration': 4360,
+ 'categories': ['Dance'],
+ },
+ }]
def _real_extract(self, url):
m = re.match(self._VALID_URL, url)
@@ -90,13 +109,14 @@ class HearThisAtIE(InfoExtractor):
ext_handle = self._request_webpage(
ext_req, display_id, note='Determining extension')
ext = urlhandle_detect_ext(ext_handle)
- formats.append({
- 'format_id': 'download',
- 'vcodec': 'none',
- 'ext': ext,
- 'url': download_url,
- 'preference': 2, # Usually better quality
- })
+ if ext in KNOWN_EXTENSIONS:
+ formats.append({
+ 'format_id': 'download',
+ 'vcodec': 'none',
+ 'ext': ext,
+ 'url': download_url,
+ 'preference': 2, # Usually better quality
+ })
self._sort_formats(formats)
return {
diff --git a/youtube_dl/extractor/hotnewhiphop.py b/youtube_dl/extractor/hotnewhiphop.py
index 31e2199..9db5652 100644
--- a/youtube_dl/extractor/hotnewhiphop.py
+++ b/youtube_dl/extractor/hotnewhiphop.py
@@ -3,16 +3,16 @@ from __future__ import unicode_literals
import base64
from .common import InfoExtractor
-from ..compat import compat_urllib_parse
from ..utils import (
ExtractorError,
HEADRequest,
sanitized_Request,
+ urlencode_postdata,
)
class HotNewHipHopIE(InfoExtractor):
- _VALID_URL = r'http://www\.hotnewhiphop\.com/.*\.(?P<id>.*)\.html'
+ _VALID_URL = r'https?://www\.hotnewhiphop\.com/.*\.(?P<id>.*)\.html'
_TEST = {
'url': 'http://www.hotnewhiphop.com/freddie-gibbs-lay-it-down-song.1435540.html',
'md5': '2c2cd2f76ef11a9b3b581e8b232f3d96',
@@ -35,7 +35,7 @@ class HotNewHipHopIE(InfoExtractor):
r'"contentUrl" content="(.*?)"', webpage, 'content URL')
return self.url_result(video_url, ie='Youtube')
- reqdata = compat_urllib_parse.urlencode([
+ reqdata = urlencode_postdata([
('mediaType', 's'),
('mediaId', video_id),
])
diff --git a/youtube_dl/extractor/howcast.py b/youtube_dl/extractor/howcast.py
index e8f51e5..7e36b85 100644
--- a/youtube_dl/extractor/howcast.py
+++ b/youtube_dl/extractor/howcast.py
@@ -8,7 +8,7 @@ class HowcastIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
_TEST = {
'url': 'http://www.howcast.com/videos/390161-How-to-Tie-a-Square-Knot-Properly',
- 'md5': '8b743df908c42f60cf6496586c7f12c3',
+ 'md5': '7d45932269a288149483144f01b99789',
'info_dict': {
'id': '390161',
'ext': 'mp4',
@@ -19,9 +19,9 @@ class HowcastIE(InfoExtractor):
'duration': 56.823,
},
'params': {
- # m3u8 download
'skip_download': True,
},
+ 'add_ie': ['Ooyala'],
}
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/howstuffworks.py b/youtube_dl/extractor/howstuffworks.py
index 663e663..65ba2a4 100644
--- a/youtube_dl/extractor/howstuffworks.py
+++ b/youtube_dl/extractor/howstuffworks.py
@@ -6,6 +6,7 @@ from ..utils import (
int_or_none,
js_to_json,
unescapeHTML,
+ determine_ext,
)
@@ -23,6 +24,7 @@ class HowStuffWorksIE(InfoExtractor):
'thumbnail': 're:^https?://.*\.jpg$',
'duration': 161,
},
+ 'skip': 'Video broken',
},
{
'url': 'http://adventure.howstuffworks.com/7199-survival-zone-food-and-water-in-the-savanna-video.htm',
@@ -39,7 +41,7 @@ class HowStuffWorksIE(InfoExtractor):
'url': 'http://entertainment.howstuffworks.com/arts/2706-sword-swallowing-1-by-dan-meyer-video.htm',
'info_dict': {
'id': '440011',
- 'ext': 'flv',
+ 'ext': 'mp4',
'title': 'Sword Swallowing #1 by Dan Meyer',
'description': 'Video footage (1 of 3) used by permission of the owner Dan Meyer through Sword Swallowers Association International <www.swordswallow.org>',
'display_id': 'sword-swallowing-1-by-dan-meyer',
@@ -63,13 +65,19 @@ class HowStuffWorksIE(InfoExtractor):
video_id = clip_info['content_id']
formats = []
m3u8_url = clip_info.get('m3u8')
- if m3u8_url:
- formats += self._extract_m3u8_formats(m3u8_url, video_id, 'mp4')
+ if m3u8_url and determine_ext(m3u8_url) == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', format_id='hls', fatal=True))
+ flv_url = clip_info.get('flv_url')
+ if flv_url:
+ formats.append({
+ 'url': flv_url,
+ 'format_id': 'flv',
+ })
for video in clip_info.get('mp4', []):
formats.append({
'url': video['src'],
- 'format_id': video['bitrate'],
- 'vbr': int(video['bitrate'].rstrip('k')),
+ 'format_id': 'mp4-%s' % video['bitrate'],
+ 'vbr': int_or_none(video['bitrate'].rstrip('k')),
})
if not formats:
@@ -102,6 +110,6 @@ class HowStuffWorksIE(InfoExtractor):
'title': unescapeHTML(clip_info['clip_title']),
'description': unescapeHTML(clip_info.get('caption')),
'thumbnail': clip_info.get('video_still_url'),
- 'duration': clip_info.get('duration'),
+ 'duration': int_or_none(clip_info.get('duration')),
'formats': formats,
}
diff --git a/youtube_dl/extractor/huffpost.py b/youtube_dl/extractor/huffpost.py
index a38eae4..0590737 100644
--- a/youtube_dl/extractor/huffpost.py
+++ b/youtube_dl/extractor/huffpost.py
@@ -4,6 +4,7 @@ import re
from .common import InfoExtractor
from ..utils import (
+ determine_ext,
parse_duration,
unified_strdate,
)
@@ -29,7 +30,12 @@ class HuffPostIE(InfoExtractor):
'description': 'This week on Legalese It, Mike talks to David Bosco about his new book on the ICC, "Rough Justice," he also discusses the Virginia AG\'s historic stance on gay marriage, the execution of Edgar Tamayo, the ICC\'s delay of Kenya\'s President and more. ',
'duration': 1549,
'upload_date': '20140124',
- }
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ 'expected_warnings': ['HTTP Error 404: Not Found'],
}
def _real_extract(self, url):
@@ -45,7 +51,7 @@ class HuffPostIE(InfoExtractor):
description = data.get('description')
thumbnails = []
- for url in data['images'].values():
+ for url in filter(None, data['images'].values()):
m = re.match('.*-([0-9]+x[0-9]+)\.', url)
if not m:
continue
@@ -54,13 +60,25 @@ class HuffPostIE(InfoExtractor):
'resolution': m.group(1),
})
- formats = [{
- 'format': key,
- 'format_id': key.replace('/', '.'),
- 'ext': 'mp4',
- 'url': url,
- 'vcodec': 'none' if key.startswith('audio/') else None,
- } for key, url in data.get('sources', {}).get('live', {}).items()]
+ formats = []
+ sources = data.get('sources', {})
+ live_sources = list(sources.get('live', {}).items()) + list(sources.get('live_again', {}).items())
+ for key, url in live_sources:
+ ext = determine_ext(url)
+ if ext == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ url, video_id, ext='mp4', m3u8_id='hls', fatal=False))
+ elif ext == 'f4m':
+ formats.extend(self._extract_f4m_formats(
+ url + '?hdcore=2.9.5', video_id, f4m_id='hds', fatal=False))
+ else:
+ formats.append({
+ 'format': key,
+ 'format_id': key.replace('/', '.'),
+ 'ext': 'mp4',
+ 'url': url,
+ 'vcodec': 'none' if key.startswith('audio/') else None,
+ })
if not formats and data.get('fivemin_id'):
return self.url_result('5min:%s' % data['fivemin_id'])
diff --git a/youtube_dl/extractor/hypem.py b/youtube_dl/extractor/hypem.py
index b3706fe..f7c9130 100644
--- a/youtube_dl/extractor/hypem.py
+++ b/youtube_dl/extractor/hypem.py
@@ -4,7 +4,7 @@ import json
import time
from .common import InfoExtractor
-from ..compat import compat_urllib_parse
+from ..compat import compat_urllib_parse_urlencode
from ..utils import (
ExtractorError,
sanitized_Request,
@@ -12,7 +12,7 @@ from ..utils import (
class HypemIE(InfoExtractor):
- _VALID_URL = r'http://(?:www\.)?hypem\.com/track/(?P<id>[^/]+)/'
+ _VALID_URL = r'https?://(?:www\.)?hypem\.com/track/(?P<id>[^/]+)/'
_TEST = {
'url': 'http://hypem.com/track/1v6ga/BODYWORK+-+TAME',
'md5': 'b9cc91b5af8995e9f0c1cee04c575828',
@@ -28,7 +28,7 @@ class HypemIE(InfoExtractor):
track_id = self._match_id(url)
data = {'ax': 1, 'ts': time.time()}
- request = sanitized_Request(url + '?' + compat_urllib_parse.urlencode(data))
+ request = sanitized_Request(url + '?' + compat_urllib_parse_urlencode(data))
response, urlh = self._download_webpage_handle(
request, track_id, 'Downloading webpage with the url')
diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py
index 02e1e42..0acce9f 100644
--- a/youtube_dl/extractor/imdb.py
+++ b/youtube_dl/extractor/imdb.py
@@ -1,10 +1,10 @@
from __future__ import unicode_literals
import re
-import json
from .common import InfoExtractor
from ..utils import (
+ mimetype2ext,
qualities,
)
@@ -12,9 +12,9 @@ from ..utils import (
class ImdbIE(InfoExtractor):
IE_NAME = 'imdb'
IE_DESC = 'Internet Movie Database trailers'
- _VALID_URL = r'http://(?:www|m)\.imdb\.com/video/imdb/vi(?P<id>\d+)'
+ _VALID_URL = r'https?://(?:www|m)\.imdb\.com/(?:video/[^/]+/|title/tt\d+.*?#lb-)vi(?P<id>\d+)'
- _TEST = {
+ _TESTS = [{
'url': 'http://www.imdb.com/video/imdb/vi2524815897',
'info_dict': {
'id': '2524815897',
@@ -22,7 +22,16 @@ class ImdbIE(InfoExtractor):
'title': 'Ice Age: Continental Drift Trailer (No. 2) - IMDb',
'description': 'md5:9061c2219254e5d14e03c25c98e96a81',
}
- }
+ }, {
+ 'url': 'http://www.imdb.com/video/_/vi2524815897',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.imdb.com/title/tt1667889/?ref_=ext_shr_eml_vi#lb-vi2524815897',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.imdb.com/title/tt1667889/#lb-vi2524815897',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
video_id = self._match_id(url)
@@ -42,19 +51,33 @@ class ImdbIE(InfoExtractor):
for f_url, f_name in extra_formats]
format_pages.append(player_page)
- quality = qualities(['SD', '480p', '720p'])
+ quality = qualities(('SD', '480p', '720p', '1080p'))
formats = []
for format_page in format_pages:
json_data = self._search_regex(
r'<script[^>]+class="imdb-player-data"[^>]*?>(.*?)</script>',
format_page, 'json data', flags=re.DOTALL)
- info = json.loads(json_data)
- format_info = info['videoPlayerObject']['video']
- f_id = format_info['ffname']
+ info = self._parse_json(json_data, video_id, fatal=False)
+ if not info:
+ continue
+ format_info = info.get('videoPlayerObject', {}).get('video', {})
+ if not format_info:
+ continue
+ video_info_list = format_info.get('videoInfoList')
+ if not video_info_list or not isinstance(video_info_list, list):
+ continue
+ video_info = video_info_list[0]
+ if not video_info or not isinstance(video_info, dict):
+ continue
+ video_url = video_info.get('videoUrl')
+ if not video_url:
+ continue
+ format_id = format_info.get('ffname')
formats.append({
- 'format_id': f_id,
- 'url': format_info['videoInfoList'][0]['videoUrl'],
- 'quality': quality(f_id),
+ 'format_id': format_id,
+ 'url': video_url,
+ 'ext': mimetype2ext(video_info.get('videoMimeType')),
+ 'quality': quality(format_id),
})
self._sort_formats(formats)
@@ -70,7 +93,7 @@ class ImdbIE(InfoExtractor):
class ImdbListIE(InfoExtractor):
IE_NAME = 'imdb:list'
IE_DESC = 'Internet Movie Database lists'
- _VALID_URL = r'http://www\.imdb\.com/list/(?P<id>[\da-zA-Z_-]{11})'
+ _VALID_URL = r'https?://www\.imdb\.com/list/(?P<id>[\da-zA-Z_-]{11})'
_TEST = {
'url': 'http://www.imdb.com/list/JFs9NWw6XI0',
'info_dict': {
diff --git a/youtube_dl/extractor/indavideo.py b/youtube_dl/extractor/indavideo.py
index 12fb5e8..c6f0804 100644
--- a/youtube_dl/extractor/indavideo.py
+++ b/youtube_dl/extractor/indavideo.py
@@ -60,7 +60,8 @@ class IndavideoEmbedIE(InfoExtractor):
formats = [{
'url': video_url,
- 'height': self._search_regex(r'\.(\d{3,4})\.mp4$', video_url, 'height', default=None),
+ 'height': int_or_none(self._search_regex(
+ r'\.(\d{3,4})\.mp4(?:\?|$)', video_url, 'height', default=None)),
} for video_url in video_urls]
self._sort_formats(formats)
@@ -73,7 +74,7 @@ class IndavideoEmbedIE(InfoExtractor):
'url': self._proto_relative_url(thumbnail)
} for thumbnail in video.get('thumbnails', [])]
- tags = [tag['title'] for tag in video.get('tags', [])]
+ tags = [tag['title'] for tag in video.get('tags') or []]
return {
'id': video.get('id') or video_id,
diff --git a/youtube_dl/extractor/infoq.py b/youtube_dl/extractor/infoq.py
index 016af20..cca0b8a 100644
--- a/youtube_dl/extractor/infoq.py
+++ b/youtube_dl/extractor/infoq.py
@@ -4,15 +4,12 @@ from __future__ import unicode_literals
import base64
-from .common import InfoExtractor
-from ..compat import (
- compat_urllib_parse_unquote,
- compat_parse_qs,
-)
+from ..compat import compat_urllib_parse_unquote
from ..utils import determine_ext
+from .bokecc import BokeCCBaseIE
-class InfoQIE(InfoExtractor):
+class InfoQIE(BokeCCBaseIE):
_VALID_URL = r'https?://(?:www\.)?infoq\.com/(?:[^/]+/)+(?P<id>[^/]+)'
_TESTS = [{
@@ -38,26 +35,6 @@ class InfoQIE(InfoExtractor):
},
}]
- def _extract_bokecc_videos(self, webpage, video_id):
- # TODO: bokecc.com is a Chinese video cloud platform
- # It should have an independent extractor but I don't have other
- # examples using bokecc
- player_params_str = self._html_search_regex(
- r'<script[^>]+src="http://p\.bokecc\.com/player\?([^"]+)',
- webpage, 'player params', default=None)
-
- player_params = compat_parse_qs(player_params_str)
-
- info_xml = self._download_xml(
- 'http://p.bokecc.com/servlet/playinfo?uid=%s&vid=%s&m=1' % (
- player_params['siteid'][0], player_params['vid'][0]), video_id)
-
- return [{
- 'format_id': 'bokecc',
- 'url': quality.find('./copy').attrib['playurl'],
- 'preference': int(quality.attrib['value']),
- } for quality in info_xml.findall('./video/quality')]
-
def _extract_rtmp_videos(self, webpage):
# The server URL is hardcoded
video_url = 'rtmpe://video.infoq.com/cfx/st/'
@@ -101,7 +78,7 @@ class InfoQIE(InfoExtractor):
if '/cn/' in url:
# for China videos, HTTP video URL exists but always fails with 403
- formats = self._extract_bokecc_videos(webpage, video_id)
+ formats = self._extract_bokecc_formats(webpage, video_id)
else:
formats = self._extract_rtmp_videos(webpage) + self._extract_http_videos(webpage)
diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py
index ed3e071..fc0197a 100644
--- a/youtube_dl/extractor/instagram.py
+++ b/youtube_dl/extractor/instagram.py
@@ -4,23 +4,32 @@ import re
from .common import InfoExtractor
from ..utils import (
+ get_element_by_attribute,
int_or_none,
limit_length,
+ lowercase_escape,
+ try_get,
)
class InstagramIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?instagram\.com/p/(?P<id>[^/?#&]+)'
+ _VALID_URL = r'(?P<url>https?://(?:www\.)?instagram\.com/p/(?P<id>[^/?#&]+))'
_TESTS = [{
'url': 'https://instagram.com/p/aye83DjauH/?foo=bar#abc',
'md5': '0d2da106a9d2631273e192b372806516',
'info_dict': {
'id': 'aye83DjauH',
'ext': 'mp4',
- 'uploader_id': 'naomipq',
'title': 'Video by naomipq',
'description': 'md5:1f17f0ab29bd6fe2bfad705f58de3cb8',
- }
+ 'thumbnail': 're:^https?://.*\.jpg',
+ 'timestamp': 1371748545,
+ 'upload_date': '20130620',
+ 'uploader_id': 'naomipq',
+ 'uploader': 'Naomi Leonor Phan-Quang',
+ 'like_count': int,
+ 'comment_count': int,
+ },
}, {
# missing description
'url': 'https://www.instagram.com/p/BA-pQFBG8HZ/?taken-by=britneyspears',
@@ -29,6 +38,13 @@ class InstagramIE(InfoExtractor):
'ext': 'mp4',
'uploader_id': 'britneyspears',
'title': 'Video by britneyspears',
+ 'thumbnail': 're:^https?://.*\.jpg',
+ 'timestamp': 1453760977,
+ 'upload_date': '20160125',
+ 'uploader_id': 'britneyspears',
+ 'uploader': 'Britney Spears',
+ 'like_count': int,
+ 'comment_count': int,
},
'params': {
'skip_download': True,
@@ -36,25 +52,86 @@ class InstagramIE(InfoExtractor):
}, {
'url': 'https://instagram.com/p/-Cmh1cukG2/',
'only_matching': True,
+ }, {
+ 'url': 'http://instagram.com/p/9o6LshA7zy/embed/',
+ 'only_matching': True,
}]
+ @staticmethod
+ def _extract_embed_url(webpage):
+ mobj = re.search(
+ r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?instagram\.com/p/[^/]+/embed.*?)\1',
+ webpage)
+ if mobj:
+ return mobj.group('url')
+
+ blockquote_el = get_element_by_attribute(
+ 'class', 'instagram-media', webpage)
+ if blockquote_el is None:
+ return
+
+ mobj = re.search(
+ r'<a[^>]+href=([\'"])(?P<link>[^\'"]+)\1', blockquote_el)
+ if mobj:
+ return mobj.group('link')
+
def _real_extract(self, url):
- video_id = self._match_id(url)
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+ url = mobj.group('url')
webpage = self._download_webpage(url, video_id)
- uploader_id = self._search_regex(r'"owner":{"username":"(.+?)"',
- webpage, 'uploader id', fatal=False)
- desc = self._search_regex(
- r'"caption":"(.+?)"', webpage, 'description', default=None)
+
+ (video_url, description, thumbnail, timestamp, uploader,
+ uploader_id, like_count, comment_count) = [None] * 8
+
+ shared_data = self._parse_json(
+ self._search_regex(
+ r'window\._sharedData\s*=\s*({.+?});',
+ webpage, 'shared data', default='{}'),
+ video_id, fatal=False)
+ if shared_data:
+ media = try_get(
+ shared_data, lambda x: x['entry_data']['PostPage'][0]['media'], dict)
+ if media:
+ video_url = media.get('video_url')
+ description = media.get('caption')
+ thumbnail = media.get('display_src')
+ timestamp = int_or_none(media.get('date'))
+ uploader = media.get('owner', {}).get('full_name')
+ uploader_id = media.get('owner', {}).get('username')
+ like_count = int_or_none(media.get('likes', {}).get('count'))
+ comment_count = int_or_none(media.get('comments', {}).get('count'))
+
+ if not video_url:
+ video_url = self._og_search_video_url(webpage, secure=False)
+
+ if not uploader_id:
+ uploader_id = self._search_regex(
+ r'"owner"\s*:\s*{\s*"username"\s*:\s*"(.+?)"',
+ webpage, 'uploader id', fatal=False)
+
+ if not description:
+ description = self._search_regex(
+ r'"caption"\s*:\s*"(.+?)"', webpage, 'description', default=None)
+ if description is not None:
+ description = lowercase_escape(description)
+
+ if not thumbnail:
+ thumbnail = self._og_search_thumbnail(webpage)
return {
'id': video_id,
- 'url': self._og_search_video_url(webpage, secure=False),
+ 'url': video_url,
'ext': 'mp4',
'title': 'Video by %s' % uploader_id,
- 'thumbnail': self._og_search_thumbnail(webpage),
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'timestamp': timestamp,
'uploader_id': uploader_id,
- 'description': desc,
+ 'uploader': uploader,
+ 'like_count': like_count,
+ 'comment_count': comment_count,
}
@@ -136,7 +213,7 @@ class InstagramUserIE(InfoExtractor):
if not page['items']:
break
- max_id = page['items'][-1]['id']
+ max_id = page['items'][-1]['id'].split('_')[0]
media_url = (
'http://instagram.com/%s/media?max_id=%s' % (
uploader_id, max_id))
diff --git a/youtube_dl/extractor/internetvideoarchive.py b/youtube_dl/extractor/internetvideoarchive.py
index 483cc6f..45add00 100644
--- a/youtube_dl/extractor/internetvideoarchive.py
+++ b/youtube_dl/extractor/internetvideoarchive.py
@@ -1,93 +1,91 @@
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
from ..compat import (
+ compat_parse_qs,
compat_urlparse,
- compat_urllib_parse,
)
from ..utils import (
- xpath_with_ns,
+ determine_ext,
+ int_or_none,
+ xpath_text,
)
class InternetVideoArchiveIE(InfoExtractor):
- _VALID_URL = r'https?://video\.internetvideoarchive\.net/flash/players/.*?\?.*?publishedid.*?'
+ _VALID_URL = r'https?://video\.internetvideoarchive\.net/(?:player|flash/players)/.*?\?.*?publishedid.*?'
_TEST = {
- 'url': 'http://video.internetvideoarchive.net/flash/players/flashconfiguration.aspx?customerid=69249&publishedid=452693&playerid=247',
+ 'url': 'http://video.internetvideoarchive.net/player/6/configuration.ashx?customerid=69249&publishedid=194487&reporttag=vdbetatitle&playerid=641&autolist=0&domain=www.videodetective.com&maxrate=high&minrate=low&socialplayer=false',
'info_dict': {
- 'id': '452693',
+ 'id': '194487',
'ext': 'mp4',
- 'title': 'SKYFALL',
- 'description': 'In SKYFALL, Bond\'s loyalty to M is tested as her past comes back to haunt her. As MI6 comes under attack, 007 must track down and destroy the threat, no matter how personal the cost.',
- 'duration': 152,
+ 'title': 'KICK-ASS 2',
+ 'description': 'md5:c189d5b7280400630a1d3dd17eaa8d8a',
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
},
}
@staticmethod
- def _build_url(query):
- return 'http://video.internetvideoarchive.net/flash/players/flashconfiguration.aspx?' + query
+ def _build_json_url(query):
+ return 'http://video.internetvideoarchive.net/player/6/configuration.ashx?' + query
@staticmethod
- def _clean_query(query):
- NEEDED_ARGS = ['publishedid', 'customerid']
- query_dic = compat_urlparse.parse_qs(query)
- cleaned_dic = dict((k, v[0]) for (k, v) in query_dic.items() if k in NEEDED_ARGS)
- # Other player ids return m3u8 urls
- cleaned_dic['playerid'] = '247'
- cleaned_dic['videokbrate'] = '100000'
- return compat_urllib_parse.urlencode(cleaned_dic)
+ def _build_xml_url(query):
+ return 'http://video.internetvideoarchive.net/flash/players/flashconfiguration.aspx?' + query
def _real_extract(self, url):
query = compat_urlparse.urlparse(url).query
- query_dic = compat_urlparse.parse_qs(query)
+ query_dic = compat_parse_qs(query)
video_id = query_dic['publishedid'][0]
- url = self._build_url(query)
- flashconfiguration = self._download_xml(url, video_id,
- 'Downloading flash configuration')
- file_url = flashconfiguration.find('file').text
- file_url = file_url.replace('/playlist.aspx', '/mrssplaylist.aspx')
- # Replace some of the parameters in the query to get the best quality
- # and http links (no m3u8 manifests)
- file_url = re.sub(r'(?<=\?)(.+)$',
- lambda m: self._clean_query(m.group()),
- file_url)
- info = self._download_xml(file_url, video_id,
- 'Downloading video info')
- item = info.find('channel/item')
+ if '/player/' in url:
+ configuration = self._download_json(url, video_id)
+
+ # There are multiple videos in the playlist whlie only the first one
+ # matches the video played in browsers
+ video_info = configuration['playlist'][0]
+
+ formats = []
+ for source in video_info['sources']:
+ file_url = source['file']
+ if determine_ext(file_url) == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ file_url, video_id, ext='mp4', m3u8_id='hls'))
+ else:
+ a_format = {
+ 'url': file_url,
+ }
+
+ if source.get('label') and source['label'][-4:] == ' kbs':
+ tbr = int_or_none(source['label'][:-4])
+ a_format.update({
+ 'tbr': tbr,
+ 'format_id': 'http-%d' % tbr,
+ })
+ formats.append(a_format)
- def _bp(p):
- return xpath_with_ns(
- p,
- {
- 'media': 'http://search.yahoo.com/mrss/',
- 'jwplayer': 'http://developer.longtailvideo.com/trac/wiki/FlashFormats',
- }
- )
- formats = []
- for content in item.findall(_bp('media:group/media:content')):
- attr = content.attrib
- f_url = attr['url']
- width = int(attr['width'])
- bitrate = int(attr['bitrate'])
- format_id = '%d-%dk' % (width, bitrate)
- formats.append({
- 'format_id': format_id,
- 'url': f_url,
- 'width': width,
- 'tbr': bitrate,
- })
+ self._sort_formats(formats)
- self._sort_formats(formats)
+ title = video_info['title']
+ description = video_info.get('description')
+ thumbnail = video_info.get('image')
+ else:
+ configuration = self._download_xml(url, video_id)
+ formats = [{
+ 'url': xpath_text(configuration, './file', 'file URL', fatal=True),
+ }]
+ thumbnail = xpath_text(configuration, './image', 'thumbnail')
+ title = 'InternetVideoArchive video %s' % video_id
+ description = None
return {
'id': video_id,
- 'title': item.find('title').text,
+ 'title': title,
'formats': formats,
- 'thumbnail': item.find(_bp('media:thumbnail')).attrib['url'],
- 'description': item.find('description').text,
- 'duration': int(attr['duration']),
+ 'thumbnail': thumbnail,
+ 'description': description,
}
diff --git a/youtube_dl/extractor/iprima.py b/youtube_dl/extractor/iprima.py
index 61a0de4..788bbe0 100644
--- a/youtube_dl/extractor/iprima.py
+++ b/youtube_dl/extractor/iprima.py
@@ -1,4 +1,4 @@
-# -*- coding: utf-8 -*-
+# coding: utf-8
from __future__ import unicode_literals
import re
@@ -6,6 +6,8 @@ import time
from .common import InfoExtractor
from ..utils import (
+ determine_ext,
+ js_to_json,
sanitized_Request,
)
@@ -30,8 +32,7 @@ class IPrimaIE(InfoExtractor):
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
@@ -43,9 +44,42 @@ class IPrimaIE(InfoExtractor):
req.add_header('Referer', url)
playerpage = self._download_webpage(req, video_id, note='Downloading player')
- m3u8_url = self._search_regex(r"'src': '([^']+\.m3u8)'", playerpage, 'm3u8 url')
+ formats = []
- formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4')
+ def extract_formats(format_url, format_key=None, lang=None):
+ ext = determine_ext(format_url)
+ new_formats = []
+ if format_key == 'hls' or ext == 'm3u8':
+ new_formats = self._extract_m3u8_formats(
+ format_url, video_id, 'mp4', entry_protocol='m3u8_native',
+ m3u8_id='hls', fatal=False)
+ elif format_key == 'dash' or ext == 'mpd':
+ return
+ new_formats = self._extract_mpd_formats(
+ format_url, video_id, mpd_id='dash', fatal=False)
+ if lang:
+ for f in new_formats:
+ if not f.get('language'):
+ f['language'] = lang
+ formats.extend(new_formats)
+
+ options = self._parse_json(
+ self._search_regex(
+ r'(?s)var\s+playerOptions\s*=\s*({.+?});',
+ playerpage, 'player options', default='{}'),
+ video_id, transform_source=js_to_json, fatal=False)
+ if options:
+ for key, tracks in options.get('tracks', {}).items():
+ if not isinstance(tracks, list):
+ continue
+ for track in tracks:
+ src = track.get('src')
+ if src:
+ extract_formats(src, key.lower(), track.get('lang'))
+
+ if not formats:
+ for _, src in re.findall(r'src["\']\s*:\s*(["\'])(.+?)\1', playerpage):
+ extract_formats(src)
self._sort_formats(formats)
diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py
index c3e3300..ddcb3c9 100644
--- a/youtube_dl/extractor/iqiyi.py
+++ b/youtube_dl/extractor/iqiyi.py
@@ -14,10 +14,11 @@ from .common import InfoExtractor
from ..compat import (
compat_parse_qs,
compat_str,
- compat_urllib_parse,
+ compat_urllib_parse_urlencode,
compat_urllib_parse_urlparse,
)
from ..utils import (
+ decode_packed_codes,
ExtractorError,
ohdave_rsa_encrypt,
remove_start,
@@ -126,43 +127,11 @@ class IqiyiSDK(object):
class IqiyiSDKInterpreter(object):
- BASE62_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
-
def __init__(self, sdk_code):
self.sdk_code = sdk_code
- @classmethod
- def base62(cls, num):
- if num == 0:
- return '0'
- ret = ''
- while num:
- ret = cls.BASE62_TABLE[num % 62] + ret
- num = num // 62
- return ret
-
- def decode_eval_codes(self):
- self.sdk_code = self.sdk_code[5:-3]
-
- mobj = re.search(
- r"'([^']+)',62,(\d+),'([^']+)'\.split\('\|'\),[^,]+,{}",
- self.sdk_code)
- obfucasted_code, count, symbols = mobj.groups()
- count = int(count)
- symbols = symbols.split('|')
- symbol_table = {}
-
- while count:
- count -= 1
- b62count = self.base62(count)
- symbol_table[b62count] = symbols[count] or b62count
-
- self.sdk_code = re.sub(
- r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
- obfucasted_code)
-
def run(self, target, ip, timestamp):
- self.decode_eval_codes()
+ self.sdk_code = decode_packed_codes(self.sdk_code)
functions = re.findall(r'input=([a-zA-Z0-9]+)\(input', self.sdk_code)
@@ -196,7 +165,7 @@ class IqiyiIE(InfoExtractor):
IE_NAME = 'iqiyi'
IE_DESC = '爱奇艺'
- _VALID_URL = r'http://(?:[^.]+\.)?iqiyi\.com/.+\.html'
+ _VALID_URL = r'https?://(?:(?:[^.]+\.)?iqiyi\.com|www\.pps\.tv)/.+\.html'
_NETRC_MACHINE = 'iqiyi'
@@ -304,6 +273,9 @@ class IqiyiIE(InfoExtractor):
'title': '灌篮高手 国语版',
},
'playlist_count': 101,
+ }, {
+ 'url': 'http://www.pps.tv/w_19rrbav0ph.html',
+ 'only_matching': True,
}]
_FORMATS_MAP = [
@@ -315,6 +287,13 @@ class IqiyiIE(InfoExtractor):
('10', 'h1'),
]
+ AUTH_API_ERRORS = {
+ # No preview available (不允许试看鉴权失败)
+ 'Q00505': 'This video requires a VIP account',
+ # End of preview time (试看结束鉴权失败)
+ 'Q00506': 'Needs a VIP account for full video',
+ }
+
def _real_initialize(self):
self._login()
@@ -353,7 +332,7 @@ class IqiyiIE(InfoExtractor):
'bird_t': timestamp,
}
validation_result = self._download_json(
- 'http://kylin.iqiyi.com/validate?' + compat_urllib_parse.urlencode(validation_params), None,
+ 'http://kylin.iqiyi.com/validate?' + compat_urllib_parse_urlencode(validation_params), None,
note='Validate credentials', errnote='Unable to validate credentials')
MSG_MAP = {
@@ -399,12 +378,19 @@ class IqiyiIE(InfoExtractor):
auth_req, video_id,
note='Downloading video authentication JSON',
errnote='Unable to download video authentication JSON')
- if auth_result['code'] == 'Q00506': # requires a VIP account
+
+ code = auth_result.get('code')
+ msg = self.AUTH_API_ERRORS.get(code) or auth_result.get('msg') or code
+ if code == 'Q00506':
if do_report_warning:
- self.report_warning('Needs a VIP account for full video')
+ self.report_warning(msg)
return False
+ if 'data' not in auth_result:
+ if msg is not None:
+ raise ExtractorError('%s said: %s' % (self.IE_NAME, msg), expected=True)
+ raise ExtractorError('Unexpected error from Iqiyi auth API')
- return auth_result
+ return auth_result['data']
def construct_video_urls(self, data, video_id, _uuid, tvid):
def do_xor(x, y):
@@ -480,14 +466,14 @@ class IqiyiIE(InfoExtractor):
need_vip_warning_report = False
break
param.update({
- 't': auth_result['data']['t'],
+ 't': auth_result['t'],
# cid is hard-coded in com/qiyi/player/core/player/RuntimeData.as
'cid': 'afbe8fd3d73448c9',
'vid': video_id,
- 'QY00001': auth_result['data']['u'],
+ 'QY00001': auth_result['u'],
})
api_video_url += '?' if '?' not in api_video_url else '&'
- api_video_url += compat_urllib_parse.urlencode(param)
+ api_video_url += compat_urllib_parse_urlencode(param)
js = self._download_json(
api_video_url, video_id,
note='Download video info of segment %d for format %s' % (segment_index + 1, format_id))
@@ -519,20 +505,23 @@ class IqiyiIE(InfoExtractor):
'enc': md5_text(enc_key + tail),
'qyid': _uuid,
'tn': random.random(),
- 'um': 0,
+ # In iQiyi's flash player, um is set to 1 if there's a logged user
+ # Some 1080P formats are only available with a logged user.
+ # Here force um=1 to trick the iQiyi server
+ 'um': 1,
'authkey': md5_text(md5_text('') + tail),
'k_tag': 1,
}
api_url = 'http://cache.video.qiyi.com/vms' + '?' + \
- compat_urllib_parse.urlencode(param)
+ compat_urllib_parse_urlencode(param)
raw_data = self._download_json(api_url, video_id)
return raw_data
- def get_enc_key(self, swf_url, video_id):
+ def get_enc_key(self, video_id):
# TODO: automatic key extraction
# last update at 2016-01-22 for Zombie::bite
- enc_key = '6ab6d0280511493ba85594779759d4ed'
+ enc_key = '4a1caba4b4465345366f28da7c117d20'
return enc_key
def _extract_playlist(self, webpage):
@@ -582,11 +571,9 @@ class IqiyiIE(InfoExtractor):
r'data-player-tvid\s*=\s*[\'"](\d+)', webpage, 'tvid')
video_id = self._search_regex(
r'data-player-videoid\s*=\s*[\'"]([a-f\d]+)', webpage, 'video_id')
- swf_url = self._search_regex(
- r'(http://[^\'"]+MainPlayer[^.]+\.swf)', webpage, 'swf player URL')
_uuid = uuid.uuid4().hex
- enc_key = self.get_enc_key(swf_url, video_id)
+ enc_key = self.get_enc_key(video_id)
raw_data = self.get_raw_data(tvid, video_id, enc_key, _uuid)
diff --git a/youtube_dl/extractor/ivideon.py b/youtube_dl/extractor/ivideon.py
index 617dc8c..3ca824f 100644
--- a/youtube_dl/extractor/ivideon.py
+++ b/youtube_dl/extractor/ivideon.py
@@ -5,7 +5,7 @@ import re
from .common import InfoExtractor
from ..compat import (
- compat_urllib_parse,
+ compat_urllib_parse_urlencode,
compat_urlparse,
)
from ..utils import qualities
@@ -62,7 +62,7 @@ class IvideonIE(InfoExtractor):
quality = qualities(self._QUALITIES)
formats = [{
- 'url': 'https://streaming.ivideon.com/flv/live?%s' % compat_urllib_parse.urlencode({
+ 'url': 'https://streaming.ivideon.com/flv/live?%s' % compat_urllib_parse_urlencode({
'server': server_id,
'camera': camera_id,
'sessionId': 'demo',
diff --git a/youtube_dl/extractor/izlesene.py b/youtube_dl/extractor/izlesene.py
index bc226fa..aa0728a 100644
--- a/youtube_dl/extractor/izlesene.py
+++ b/youtube_dl/extractor/izlesene.py
@@ -29,7 +29,7 @@ class IzleseneIE(InfoExtractor):
'ext': 'mp4',
'title': 'Sevinçten Çıldırtan Doğum Günü Hediyesi',
'description': 'md5:253753e2655dde93f59f74b572454f6d',
- 'thumbnail': 're:^http://.*\.jpg',
+ 'thumbnail': 're:^https?://.*\.jpg',
'uploader_id': 'pelikzzle',
'timestamp': int,
'upload_date': '20140702',
@@ -44,8 +44,7 @@ class IzleseneIE(InfoExtractor):
'id': '17997',
'ext': 'mp4',
'title': 'Tarkan Dortmund 2006 Konseri',
- 'description': 'Tarkan Dortmund 2006 Konseri',
- 'thumbnail': 're:^http://.*\.jpg',
+ 'thumbnail': 're:^https://.*\.jpg',
'uploader_id': 'parlayankiz',
'timestamp': int,
'upload_date': '20061112',
@@ -62,7 +61,7 @@ class IzleseneIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
title = self._og_search_title(webpage)
- description = self._og_search_description(webpage)
+ description = self._og_search_description(webpage, default=None)
thumbnail = self._proto_relative_url(
self._og_search_thumbnail(webpage), scheme='http:')
diff --git a/youtube_dl/extractor/jadorecettepub.py b/youtube_dl/extractor/jadorecettepub.py
deleted file mode 100644
index 063e86d..0000000
--- a/youtube_dl/extractor/jadorecettepub.py
+++ /dev/null
@@ -1,47 +0,0 @@
-# coding: utf-8
-
-from __future__ import unicode_literals
-
-import re
-
-from .common import InfoExtractor
-from .youtube import YoutubeIE
-
-
-class JadoreCettePubIE(InfoExtractor):
- _VALID_URL = r'http://(?:www\.)?jadorecettepub\.com/[0-9]{4}/[0-9]{2}/(?P<id>.*?)\.html'
-
- _TEST = {
- 'url': 'http://www.jadorecettepub.com/2010/12/star-wars-massacre-par-les-japonais.html',
- 'md5': '401286a06067c70b44076044b66515de',
- 'info_dict': {
- 'id': 'jLMja3tr7a4',
- 'ext': 'mp4',
- 'title': 'La pire utilisation de Star Wars',
- 'description': "Jadorecettepub.com vous a gratifié de plusieurs pubs géniales utilisant Star Wars et Dark Vador plus particulièrement... Mais l'heure est venue de vous proposer une version totalement massacrée, venue du Japon. Quand les Japonais détruisent l'image de Star Wars pour vendre du thon en boite, ça promet...",
- },
- }
-
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- display_id = mobj.group('id')
-
- webpage = self._download_webpage(url, display_id)
-
- title = self._html_search_regex(
- r'<span style="font-size: x-large;"><b>(.*?)</b></span>',
- webpage, 'title')
- description = self._html_search_regex(
- r'(?s)<div id="fb-root">(.*?)<script>', webpage, 'description',
- fatal=False)
- real_url = self._search_regex(
- r'\[/postlink\](.*)endofvid', webpage, 'video URL')
- video_id = YoutubeIE.extract_id(real_url)
-
- return {
- '_type': 'url_transparent',
- 'url': real_url,
- 'id': video_id,
- 'title': title,
- 'description': description,
- }
diff --git a/youtube_dl/extractor/jeuxvideo.py b/youtube_dl/extractor/jeuxvideo.py
index eef7daa..1a4227f 100644
--- a/youtube_dl/extractor/jeuxvideo.py
+++ b/youtube_dl/extractor/jeuxvideo.py
@@ -8,7 +8,7 @@ from .common import InfoExtractor
class JeuxVideoIE(InfoExtractor):
- _VALID_URL = r'http://.*?\.jeuxvideo\.com/.*/(.*?)\.htm'
+ _VALID_URL = r'https?://.*?\.jeuxvideo\.com/.*/(.*?)\.htm'
_TESTS = [{
'url': 'http://www.jeuxvideo.com/reportages-videos-jeux/0004/00046170/tearaway-playstation-vita-gc-2013-tearaway-nous-presente-ses-papiers-d-identite-00115182.htm',
@@ -30,7 +30,7 @@ class JeuxVideoIE(InfoExtractor):
webpage = self._download_webpage(url, title)
title = self._html_search_meta('name', webpage) or self._og_search_title(webpage)
config_url = self._html_search_regex(
- r'data-src="(/contenu/medias/video.php.*?)"',
+ r'data-src(?:set-video)?="(/contenu/medias/video.php.*?)"',
webpage, 'config URL')
config_url = 'http://www.jeuxvideo.com' + config_url
diff --git a/youtube_dl/extractor/jwplatform.py b/youtube_dl/extractor/jwplatform.py
index 8e90d59..e44e311 100644
--- a/youtube_dl/extractor/jwplatform.py
+++ b/youtube_dl/extractor/jwplatform.py
@@ -4,67 +4,124 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import int_or_none
+from ..utils import (
+ determine_ext,
+ float_or_none,
+ int_or_none,
+)
-class JWPlatformIE(InfoExtractor):
- _VALID_URL = r'(?:https?://content\.jwplatform\.com/(?:feeds|players|jw6)/|jwplatform:)(?P<id>[a-zA-Z0-9]{8})'
- _TEST = {
- 'url': 'http://content.jwplatform.com/players/nPripu9l-ALJ3XQCI.js',
- 'md5': 'fa8899fa601eb7c83a64e9d568bdf325',
- 'info_dict': {
- 'id': 'nPripu9l',
- 'ext': 'mov',
- 'title': 'Big Buck Bunny Trailer',
- 'description': 'Big Buck Bunny is a short animated film by the Blender Institute. It is made using free and open source software.',
- 'upload_date': '20081127',
- 'timestamp': 1227796140,
- }
- }
-
+class JWPlatformBaseIE(InfoExtractor):
@staticmethod
- def _extract_url(webpage):
+ def _find_jwplayer_data(webpage):
+ # TODO: Merge this with JWPlayer-related codes in generic.py
+
mobj = re.search(
- r'<script[^>]+?src=["\'](?P<url>(?:https?:)?//content.jwplatform.com/players/[a-zA-Z0-9]{8})',
+ 'jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)\.setup\((?P<options>[^)]+)\)',
webpage)
if mobj:
- return mobj.group('url')
+ return mobj.group('options')
- def _real_extract(self, url):
- video_id = self._match_id(url)
- json_data = self._download_json('http://content.jwplatform.com/feeds/%s.json' % video_id, video_id)
- video_data = json_data['playlist'][0]
- subtitles = {}
- for track in video_data['tracks']:
- if track['kind'] == 'captions':
- subtitles[track['label']] = [{'url': self._proto_relative_url(track['file'])}]
+ def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
+ jwplayer_data = self._parse_json(
+ self._find_jwplayer_data(webpage), video_id)
+ return self._parse_jwplayer_data(
+ jwplayer_data, video_id, *args, **kwargs)
+
+ def _parse_jwplayer_data(self, jwplayer_data, video_id, require_title=True, m3u8_id=None, rtmp_params=None):
+ # JWPlayer backward compatibility: flattened playlists
+ # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
+ if 'playlist' not in jwplayer_data:
+ jwplayer_data = {'playlist': [jwplayer_data]}
+
+ video_data = jwplayer_data['playlist'][0]
+
+ # JWPlayer backward compatibility: flattened sources
+ # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
+ if 'sources' not in video_data:
+ video_data['sources'] = [video_data]
formats = []
for source in video_data['sources']:
source_url = self._proto_relative_url(source['file'])
source_type = source.get('type') or ''
- if source_type == 'application/vnd.apple.mpegurl':
+ if source_type in ('application/vnd.apple.mpegurl', 'hls') or determine_ext(source_url) == 'm3u8':
formats.extend(self._extract_m3u8_formats(
- source_url, video_id, 'mp4', 'm3u8_native', fatal=False))
+ source_url, video_id, 'mp4', 'm3u8_native', m3u8_id=m3u8_id, fatal=False))
elif source_type.startswith('audio'):
formats.append({
'url': source_url,
'vcodec': 'none',
})
else:
- formats.append({
+ a_format = {
'url': source_url,
'width': int_or_none(source.get('width')),
'height': int_or_none(source.get('height')),
- })
+ }
+ if source_url.startswith('rtmp'):
+ a_format['ext'] = 'flv',
+
+ # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
+ # of jwplayer.flash.swf
+ rtmp_url_parts = re.split(
+ r'((?:mp4|mp3|flv):)', source_url, 1)
+ if len(rtmp_url_parts) == 3:
+ rtmp_url, prefix, play_path = rtmp_url_parts
+ a_format.update({
+ 'url': rtmp_url,
+ 'play_path': prefix + play_path,
+ })
+ if rtmp_params:
+ a_format.update(rtmp_params)
+ formats.append(a_format)
self._sort_formats(formats)
+ subtitles = {}
+ tracks = video_data.get('tracks')
+ if tracks and isinstance(tracks, list):
+ for track in tracks:
+ if track.get('file') and track.get('kind') == 'captions':
+ subtitles.setdefault(track.get('label') or 'en', []).append({
+ 'url': self._proto_relative_url(track['file'])
+ })
+
return {
'id': video_id,
- 'title': video_data['title'],
+ 'title': video_data['title'] if require_title else video_data.get('title'),
'description': video_data.get('description'),
'thumbnail': self._proto_relative_url(video_data.get('image')),
'timestamp': int_or_none(video_data.get('pubdate')),
+ 'duration': float_or_none(jwplayer_data.get('duration')),
'subtitles': subtitles,
'formats': formats,
}
+
+
+class JWPlatformIE(JWPlatformBaseIE):
+ _VALID_URL = r'(?:https?://content\.jwplatform\.com/(?:feeds|players|jw6)/|jwplatform:)(?P<id>[a-zA-Z0-9]{8})'
+ _TEST = {
+ 'url': 'http://content.jwplatform.com/players/nPripu9l-ALJ3XQCI.js',
+ 'md5': 'fa8899fa601eb7c83a64e9d568bdf325',
+ 'info_dict': {
+ 'id': 'nPripu9l',
+ 'ext': 'mov',
+ 'title': 'Big Buck Bunny Trailer',
+ 'description': 'Big Buck Bunny is a short animated film by the Blender Institute. It is made using free and open source software.',
+ 'upload_date': '20081127',
+ 'timestamp': 1227796140,
+ }
+ }
+
+ @staticmethod
+ def _extract_url(webpage):
+ mobj = re.search(
+ r'<script[^>]+?src=["\'](?P<url>(?:https?:)?//content.jwplatform.com/players/[a-zA-Z0-9]{8})',
+ webpage)
+ if mobj:
+ return mobj.group('url')
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ json_data = self._download_json('http://content.jwplatform.com/feeds/%s.json' % video_id, video_id)
+ return self._parse_jwplayer_data(json_data, video_id)
diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py
index ccbc39c..a65697f 100644
--- a/youtube_dl/extractor/kaltura.py
+++ b/youtube_dl/extractor/kaltura.py
@@ -6,8 +6,9 @@ import base64
from .common import InfoExtractor
from ..compat import (
- compat_urllib_parse,
+ compat_urllib_parse_urlencode,
compat_urlparse,
+ compat_parse_qs,
)
from ..utils import (
clean_html,
@@ -20,21 +21,17 @@ from ..utils import (
class KalturaIE(InfoExtractor):
_VALID_URL = r'''(?x)
(?:
- kaltura:(?P<partner_id_s>\d+):(?P<id_s>[0-9a-z_]+)|
+ kaltura:(?P<partner_id>\d+):(?P<id>[0-9a-z_]+)|
https?://
(:?(?:www|cdnapi(?:sec)?)\.)?kaltura\.com/
(?:
(?:
# flash player
- index\.php/kwidget/
- (?:[^/]+/)*?wid/_(?P<partner_id>\d+)/
- (?:[^/]+/)*?entry_id/(?P<id>[0-9a-z_]+)|
+ index\.php/kwidget|
# html5 player
- html5/html5lib/
- (?:[^/]+/)*?entry_id/(?P<id_html5>[0-9a-z_]+)
- .*\?.*\bwid=_(?P<partner_id_html5>\d+)
+ html5/html5lib/[^/]+/mwEmbedFrame\.php
)
- )
+ )(?:/(?P<path>[^?]+))?(?:\?(?P<query>.*))?
)
'''
_API_BASE = 'http://cdnapi.kaltura.com/api_v3/index.php?'
@@ -74,7 +71,7 @@ class KalturaIE(InfoExtractor):
for k, v in a.items():
params['%d:%s' % (i, k)] = v
- query = compat_urllib_parse.urlencode(params)
+ query = compat_urllib_parse_urlencode(params)
url = self._API_BASE + query
data = self._download_json(url, video_id, *args, **kwargs)
@@ -127,10 +124,41 @@ class KalturaIE(InfoExtractor):
url, smuggled_data = unsmuggle_url(url, {})
mobj = re.match(self._VALID_URL, url)
- partner_id = mobj.group('partner_id_s') or mobj.group('partner_id') or mobj.group('partner_id_html5')
- entry_id = mobj.group('id_s') or mobj.group('id') or mobj.group('id_html5')
-
- info, flavor_assets = self._get_video_info(entry_id, partner_id)
+ partner_id, entry_id = mobj.group('partner_id', 'id')
+ ks = None
+ if partner_id and entry_id:
+ info, flavor_assets = self._get_video_info(entry_id, partner_id)
+ else:
+ path, query = mobj.group('path', 'query')
+ if not path and not query:
+ raise ExtractorError('Invalid URL', expected=True)
+ params = {}
+ if query:
+ params = compat_parse_qs(query)
+ if path:
+ splitted_path = path.split('/')
+ params.update(dict((zip(splitted_path[::2], [[v] for v in splitted_path[1::2]]))))
+ if 'wid' in params:
+ partner_id = params['wid'][0][1:]
+ elif 'p' in params:
+ partner_id = params['p'][0]
+ else:
+ raise ExtractorError('Invalid URL', expected=True)
+ if 'entry_id' in params:
+ entry_id = params['entry_id'][0]
+ info, flavor_assets = self._get_video_info(entry_id, partner_id)
+ elif 'uiconf_id' in params and 'flashvars[referenceId]' in params:
+ reference_id = params['flashvars[referenceId]'][0]
+ webpage = self._download_webpage(url, reference_id)
+ entry_data = self._parse_json(self._search_regex(
+ r'window\.kalturaIframePackageData\s*=\s*({.*});',
+ webpage, 'kalturaIframePackageData'),
+ reference_id)['entryResult']
+ info, flavor_assets = entry_data['meta'], entry_data['contextData']['flavorAssets']
+ entry_id = info['id']
+ else:
+ raise ExtractorError('Invalid URL', expected=True)
+ ks = params.get('flashvars[ks]', [None])[0]
source_url = smuggled_data.get('source_url')
if source_url:
@@ -140,14 +168,19 @@ class KalturaIE(InfoExtractor):
else:
referrer = None
+ def sign_url(unsigned_url):
+ if ks:
+ unsigned_url += '/ks/%s' % ks
+ if referrer:
+ unsigned_url += '?referrer=%s' % referrer
+ return unsigned_url
+
formats = []
for f in flavor_assets:
# Continue if asset is not ready
if f['status'] != 2:
continue
- video_url = '%s/flavorId/%s' % (info['dataUrl'], f['id'])
- if referrer:
- video_url += '?referrer=%s' % referrer
+ video_url = sign_url('%s/flavorId/%s' % (info['dataUrl'], f['id']))
formats.append({
'format_id': '%(fileExt)s-%(bitrate)s' % f,
'ext': f.get('fileExt'),
@@ -160,9 +193,7 @@ class KalturaIE(InfoExtractor):
'width': int_or_none(f.get('width')),
'url': video_url,
})
- m3u8_url = info['dataUrl'].replace('format/url', 'format/applehttp')
- if referrer:
- m3u8_url += '?referrer=%s' % referrer
+ m3u8_url = sign_url(info['dataUrl'].replace('format/url', 'format/applehttp'))
formats.extend(self._extract_m3u8_formats(
m3u8_url, entry_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
diff --git a/youtube_dl/extractor/karaoketv.py b/youtube_dl/extractor/karaoketv.py
index 06daf5a..a6050c4 100644
--- a/youtube_dl/extractor/karaoketv.py
+++ b/youtube_dl/extractor/karaoketv.py
@@ -2,39 +2,63 @@
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..compat import compat_urllib_parse_unquote_plus
-from ..utils import (
- js_to_json,
-)
class KaraoketvIE(InfoExtractor):
- _VALID_URL = r'http://karaoketv\.co\.il/\?container=songs&id=(?P<id>[0-9]+)'
+ _VALID_URL = r'http://www.karaoketv.co.il/[^/]+/(?P<id>\d+)'
_TEST = {
- 'url': 'http://karaoketv.co.il/?container=songs&id=171568',
+ 'url': 'http://www.karaoketv.co.il/%D7%A9%D7%99%D7%A8%D7%99_%D7%A7%D7%A8%D7%99%D7%95%D7%A7%D7%99/58356/%D7%90%D7%99%D7%96%D7%95%D7%9F',
'info_dict': {
- 'id': '171568',
- 'ext': 'mp4',
- 'title': 'אל העולם שלך - רותם כהן - שרים קריוקי',
+ 'id': '58356',
+ 'ext': 'flv',
+ 'title': 'קריוקי של איזון',
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
}
}
def _real_extract(self, url):
video_id = self._match_id(url)
+
webpage = self._download_webpage(url, video_id)
+ api_page_url = self._search_regex(
+ r'<iframe[^>]+src=(["\'])(?P<url>https?://www\.karaoke\.co\.il/api_play\.php\?.+?)\1',
+ webpage, 'API play URL', group='url')
+
+ api_page = self._download_webpage(api_page_url, video_id)
+ video_cdn_url = self._search_regex(
+ r'<iframe[^>]+src=(["\'])(?P<url>https?://www\.video-cdn\.com/embed/iframe/.+?)\1',
+ api_page, 'video cdn URL', group='url')
+
+ video_cdn = self._download_webpage(video_cdn_url, video_id)
+ play_path = self._parse_json(
+ self._search_regex(
+ r'var\s+options\s*=\s*({.+?});', video_cdn, 'options'),
+ video_id)['clip']['url']
- page_video_url = self._og_search_video_url(webpage, video_id)
- config_json = compat_urllib_parse_unquote_plus(self._search_regex(
- r'config=(.*)', page_video_url, 'configuration'))
+ settings = self._parse_json(
+ self._search_regex(
+ r'var\s+settings\s*=\s*({.+?});', video_cdn, 'servers', default='{}'),
+ video_id, fatal=False) or {}
- urls_info_json = self._download_json(
- config_json, video_id, 'Downloading configuration',
- transform_source=js_to_json)
+ servers = settings.get('servers')
+ if not servers or not isinstance(servers, list):
+ servers = ('wowzail.video-cdn.com:80/vodcdn', )
- url = urls_info_json['playlist'][0]['url']
+ formats = [{
+ 'url': 'rtmp://%s' % server if not server.startswith('rtmp') else server,
+ 'play_path': play_path,
+ 'app': 'vodcdn',
+ 'page_url': video_cdn_url,
+ 'player_url': 'http://www.video-cdn.com/assets/flowplayer/flowplayer.commercial-3.2.18.swf',
+ 'rtmp_real_time': True,
+ 'ext': 'flv',
+ } for server in servers]
return {
'id': video_id,
'title': self._og_search_title(webpage),
- 'url': url,
+ 'formats': formats,
}
diff --git a/youtube_dl/extractor/karrierevideos.py b/youtube_dl/extractor/karrierevideos.py
index bed94bc..c05263e 100644
--- a/youtube_dl/extractor/karrierevideos.py
+++ b/youtube_dl/extractor/karrierevideos.py
@@ -12,7 +12,7 @@ from ..utils import (
class KarriereVideosIE(InfoExtractor):
- _VALID_URL = r'http://(?:www\.)?karrierevideos\.at(?:/[^/]+)+/(?P<id>[^/]+)'
+ _VALID_URL = r'https?://(?:www\.)?karrierevideos\.at(?:/[^/]+)+/(?P<id>[^/]+)'
_TESTS = [{
'url': 'http://www.karrierevideos.at/berufsvideos/mittlere-hoehere-schulen/altenpflegerin',
'info_dict': {
@@ -52,9 +52,12 @@ class KarriereVideosIE(InfoExtractor):
video_id = self._search_regex(
r'/config/video/(.+?)\.xml', webpage, 'video id')
+ # Server returns malformed headers
+ # Force Accept-Encoding: * to prevent gzipped results
playlist = self._download_xml(
'http://www.karrierevideos.at/player-playlist.xml.php?p=%s' % video_id,
- video_id, transform_source=fix_xml_ampersands)
+ video_id, transform_source=fix_xml_ampersands,
+ headers={'Accept-Encoding': '*'})
NS_MAP = {
'jwplayer': 'http://developer.longtailvideo.com/trac/wiki/FlashFormats'
diff --git a/youtube_dl/extractor/khanacademy.py b/youtube_dl/extractor/khanacademy.py
index 08a671f..61739ef 100644
--- a/youtube_dl/extractor/khanacademy.py
+++ b/youtube_dl/extractor/khanacademy.py
@@ -14,10 +14,10 @@ class KhanAcademyIE(InfoExtractor):
_TESTS = [{
'url': 'http://www.khanacademy.org/video/one-time-pad',
- 'md5': '7021db7f2d47d4fff89b13177cb1e8f4',
+ 'md5': '7b391cce85e758fb94f763ddc1bbb979',
'info_dict': {
'id': 'one-time-pad',
- 'ext': 'mp4',
+ 'ext': 'webm',
'title': 'The one-time pad',
'description': 'The perfect cipher',
'duration': 176,
diff --git a/youtube_dl/extractor/kontrtube.py b/youtube_dl/extractor/kontrtube.py
index a59c529..704bd7b 100644
--- a/youtube_dl/extractor/kontrtube.py
+++ b/youtube_dl/extractor/kontrtube.py
@@ -13,7 +13,7 @@ from ..utils import (
class KontrTubeIE(InfoExtractor):
IE_NAME = 'kontrtube'
IE_DESC = 'KontrTube.ru - Труба зовёт'
- _VALID_URL = r'http://(?:www\.)?kontrtube\.ru/videos/(?P<id>\d+)/(?P<display_id>[^/]+)/'
+ _VALID_URL = r'https?://(?:www\.)?kontrtube\.ru/videos/(?P<id>\d+)/(?P<display_id>[^/]+)/'
_TEST = {
'url': 'http://www.kontrtube.ru/videos/2678/nad-olimpiyskoy-derevney-v-sochi-podnyat-rossiyskiy-flag/',
diff --git a/youtube_dl/extractor/ku6.py b/youtube_dl/extractor/ku6.py
index a602980..a574408 100644
--- a/youtube_dl/extractor/ku6.py
+++ b/youtube_dl/extractor/ku6.py
@@ -4,7 +4,7 @@ from .common import InfoExtractor
class Ku6IE(InfoExtractor):
- _VALID_URL = r'http://v\.ku6\.com/show/(?P<id>[a-zA-Z0-9\-\_]+)(?:\.)*html'
+ _VALID_URL = r'https?://v\.ku6\.com/show/(?P<id>[a-zA-Z0-9\-\_]+)(?:\.)*html'
_TEST = {
'url': 'http://v.ku6.com/show/JG-8yS14xzBr4bCn1pu0xw...html',
'md5': '01203549b9efbb45f4b87d55bdea1ed1',
diff --git a/youtube_dl/extractor/kusi.py b/youtube_dl/extractor/kusi.py
new file mode 100644
index 0000000..12cc56e
--- /dev/null
+++ b/youtube_dl/extractor/kusi.py
@@ -0,0 +1,99 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import random
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_urllib_parse_unquote_plus
+from ..utils import (
+ int_or_none,
+ float_or_none,
+ timeconvert,
+ update_url_query,
+ xpath_text,
+)
+
+
+class KUSIIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?kusi\.com/(?P<path>story/.+|video\?clipId=(?P<clipId>\d+))'
+ _TESTS = [{
+ 'url': 'http://www.kusi.com/story/31183873/turko-files-case-closed-put-on-hold',
+ 'md5': 'f926e7684294cf8cb7bdf8858e1b3988',
+ 'info_dict': {
+ 'id': '12203019',
+ 'ext': 'mp4',
+ 'title': 'Turko Files: Case Closed! & Put On Hold!',
+ 'duration': 231.0,
+ 'upload_date': '20160210',
+ 'timestamp': 1455087571,
+ 'thumbnail': 're:^https?://.*\.jpg$'
+ },
+ }, {
+ 'url': 'http://kusi.com/video?clipId=12203019',
+ 'info_dict': {
+ 'id': '12203019',
+ 'ext': 'mp4',
+ 'title': 'Turko Files: Case Closed! & Put On Hold!',
+ 'duration': 231.0,
+ 'upload_date': '20160210',
+ 'timestamp': 1455087571,
+ 'thumbnail': 're:^https?://.*\.jpg$'
+ },
+ 'params': {
+ 'skip_download': True, # Same as previous one
+ },
+ }]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ clip_id = mobj.group('clipId')
+ video_id = clip_id or mobj.group('path')
+
+ webpage = self._download_webpage(url, video_id)
+
+ if clip_id is None:
+ video_id = clip_id = self._html_search_regex(
+ r'"clipId"\s*,\s*"(\d+)"', webpage, 'clip id')
+
+ affiliate_id = self._search_regex(
+ r'affiliateId\s*:\s*\'([^\']+)\'', webpage, 'affiliate id')
+
+ # See __Packages/worldnow/model/GalleryModel.as of WNGallery.swf
+ xml_url = update_url_query('http://www.kusi.com/build.asp', {
+ 'buildtype': 'buildfeaturexmlrequest',
+ 'featureType': 'Clip',
+ 'featureid': clip_id,
+ 'affiliateno': affiliate_id,
+ 'clientgroupid': '1',
+ 'rnd': int(round(random.random() * 1000000)),
+ })
+
+ doc = self._download_xml(xml_url, video_id)
+
+ video_title = xpath_text(doc, 'HEADLINE', fatal=True)
+ duration = float_or_none(xpath_text(doc, 'DURATION'), scale=1000)
+ description = xpath_text(doc, 'ABSTRACT')
+ thumbnail = xpath_text(doc, './THUMBNAILIMAGE/FILENAME')
+ createtion_time = timeconvert(xpath_text(doc, 'rfc822creationdate'))
+
+ quality_options = doc.find('{http://search.yahoo.com/mrss/}group').findall('{http://search.yahoo.com/mrss/}content')
+ formats = []
+ for quality in quality_options:
+ formats.append({
+ 'url': compat_urllib_parse_unquote_plus(quality.attrib['url']),
+ 'height': int_or_none(quality.attrib.get('height')),
+ 'width': int_or_none(quality.attrib.get('width')),
+ 'vbr': float_or_none(quality.attrib.get('bitratebits'), scale=1000),
+ })
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': video_title,
+ 'description': description,
+ 'duration': duration,
+ 'formats': formats,
+ 'thumbnail': thumbnail,
+ 'timestamp': createtion_time,
+ }
diff --git a/youtube_dl/extractor/kuwo.py b/youtube_dl/extractor/kuwo.py
index f641ede..0221fb9 100644
--- a/youtube_dl/extractor/kuwo.py
+++ b/youtube_dl/extractor/kuwo.py
@@ -2,13 +2,13 @@
from __future__ import unicode_literals
import re
-import itertools
from .common import InfoExtractor
from ..utils import (
get_element_by_id,
clean_html,
ExtractorError,
+ InAdvancePagedList,
remove_start,
)
@@ -23,16 +23,29 @@ class KuwoBaseIE(InfoExtractor):
{'format': 'aac', 'ext': 'aac', 'abr': 48, 'preference': 10}
]
- def _get_formats(self, song_id):
+ def _get_formats(self, song_id, tolerate_ip_deny=False):
formats = []
for file_format in self._FORMATS:
+ headers = {}
+ cn_verification_proxy = self._downloader.params.get('cn_verification_proxy')
+ if cn_verification_proxy:
+ headers['Ytdl-request-proxy'] = cn_verification_proxy
+
+ query = {
+ 'format': file_format['ext'],
+ 'br': file_format.get('br', ''),
+ 'rid': 'MUSIC_%s' % song_id,
+ 'type': 'convert_url',
+ 'response': 'url'
+ }
+
song_url = self._download_webpage(
- 'http://antiserver.kuwo.cn/anti.s?format=%s&br=%s&rid=MUSIC_%s&type=convert_url&response=url' %
- (file_format['ext'], file_format.get('br', ''), song_id),
+ 'http://antiserver.kuwo.cn/anti.s',
song_id, note='Download %s url info' % file_format['format'],
+ query=query, headers=headers,
)
- if song_url == 'IPDeny':
+ if song_url == 'IPDeny' and not tolerate_ip_deny:
raise ExtractorError('This song is blocked in this region', expected=True)
if song_url.startswith('http://') or song_url.startswith('https://'):
@@ -43,14 +56,14 @@ class KuwoBaseIE(InfoExtractor):
'preference': file_format['preference'],
'abr': file_format.get('abr'),
})
- self._sort_formats(formats)
+
return formats
class KuwoIE(KuwoBaseIE):
IE_NAME = 'kuwo:song'
IE_DESC = '酷我音乐'
- _VALID_URL = r'http://www\.kuwo\.cn/yinyue/(?P<id>\d+?)/'
+ _VALID_URL = r'https?://www\.kuwo\.cn/yinyue/(?P<id>\d+)'
_TESTS = [{
'url': 'http://www.kuwo.cn/yinyue/635632/',
'info_dict': {
@@ -68,12 +81,16 @@ class KuwoIE(KuwoBaseIE):
'id': '6446136',
'ext': 'mp3',
'title': '心',
+ 'description': 'md5:5d0e947b242c35dc0eb1d2fce9fbf02c',
'creator': 'IU',
'upload_date': '20150518',
},
'params': {
'format': 'mp3-320'
},
+ }, {
+ 'url': 'http://www.kuwo.cn/yinyue/3197154?catalog=yueku2016',
+ 'only_matching': True,
}]
def _real_extract(self, url):
@@ -85,18 +102,19 @@ class KuwoIE(KuwoBaseIE):
raise ExtractorError('this song has been offline because of copyright issues', expected=True)
song_name = self._html_search_regex(
- r'(?s)class="(?:[^"\s]+\s+)*title(?:\s+[^"\s]+)*".*?<h1[^>]+title="([^"]+)"', webpage, 'song name')
- singer_name = self._html_search_regex(
- r'<div[^>]+class="s_img">\s*<a[^>]+title="([^>]+)"',
- webpage, 'singer name', fatal=False)
+ r'<p[^>]+id="lrcName">([^<]+)</p>', webpage, 'song name')
+ singer_name = remove_start(self._html_search_regex(
+ r'<a[^>]+href="http://www\.kuwo\.cn/artist/content\?name=([^"]+)">',
+ webpage, 'singer name', fatal=False), '歌手')
lrc_content = clean_html(get_element_by_id('lrcContent', webpage))
if lrc_content == '暂无': # indicates no lyrics
lrc_content = None
formats = self._get_formats(song_id)
+ self._sort_formats(formats)
album_id = self._html_search_regex(
- r'<p[^>]+class="album"[^<]+<a[^>]+href="http://www\.kuwo\.cn/album/(\d+)/"',
+ r'<a[^>]+href="http://www\.kuwo\.cn/album/(\d+)/"',
webpage, 'album id', fatal=False)
publish_time = None
@@ -125,13 +143,13 @@ class KuwoIE(KuwoBaseIE):
class KuwoAlbumIE(InfoExtractor):
IE_NAME = 'kuwo:album'
IE_DESC = '酷我音乐 - 专辑'
- _VALID_URL = r'http://www\.kuwo\.cn/album/(?P<id>\d+?)/'
+ _VALID_URL = r'https?://www\.kuwo\.cn/album/(?P<id>\d+?)/'
_TEST = {
'url': 'http://www.kuwo.cn/album/502294/',
'info_dict': {
'id': '502294',
- 'title': 'M',
- 'description': 'md5:6a7235a84cc6400ec3b38a7bdaf1d60c',
+ 'title': 'Made\xa0Series\xa0《M》',
+ 'description': 'md5:d463f0d8a0ff3c3ea3d6ed7452a9483f',
},
'playlist_count': 2,
}
@@ -161,13 +179,11 @@ class KuwoAlbumIE(InfoExtractor):
class KuwoChartIE(InfoExtractor):
IE_NAME = 'kuwo:chart'
IE_DESC = '酷我音乐 - 排行榜'
- _VALID_URL = r'http://yinyue\.kuwo\.cn/billboard_(?P<id>[^.]+).htm'
+ _VALID_URL = r'https?://yinyue\.kuwo\.cn/billboard_(?P<id>[^.]+).htm'
_TEST = {
'url': 'http://yinyue.kuwo.cn/billboard_香港中文龙虎榜.htm',
'info_dict': {
'id': '香港中文龙虎榜',
- 'title': '香港中文龙虎榜',
- 'description': 're:\d{4}第\d{2}期',
},
'playlist_mincount': 10,
}
@@ -178,30 +194,24 @@ class KuwoChartIE(InfoExtractor):
url, chart_id, note='Download chart info',
errnote='Unable to get chart info')
- chart_name = self._html_search_regex(
- r'<h1[^>]+class="unDis">([^<]+)</h1>', webpage, 'chart name')
-
- chart_desc = self._html_search_regex(
- r'<p[^>]+class="tabDef">(\d{4}第\d{2}期)</p>', webpage, 'chart desc')
-
entries = [
self.url_result(song_url, 'Kuwo') for song_url in re.findall(
- r'<a[^>]+href="(http://www\.kuwo\.cn/yinyue/\d+)/"', webpage)
+ r'<a[^>]+href="(http://www\.kuwo\.cn/yinyue/\d+)', webpage)
]
- return self.playlist_result(entries, chart_id, chart_name, chart_desc)
+ return self.playlist_result(entries, chart_id)
class KuwoSingerIE(InfoExtractor):
IE_NAME = 'kuwo:singer'
IE_DESC = '酷我音乐 - 歌手'
- _VALID_URL = r'http://www\.kuwo\.cn/mingxing/(?P<id>[^/]+)'
+ _VALID_URL = r'https?://www\.kuwo\.cn/mingxing/(?P<id>[^/]+)'
_TESTS = [{
'url': 'http://www.kuwo.cn/mingxing/bruno+mars/',
'info_dict': {
'id': 'bruno+mars',
- 'title': 'Bruno Mars',
+ 'title': 'Bruno\xa0Mars',
},
- 'playlist_count': 10,
+ 'playlist_mincount': 329,
}, {
'url': 'http://www.kuwo.cn/mingxing/Ali/music.htm',
'info_dict': {
@@ -212,6 +222,8 @@ class KuwoSingerIE(InfoExtractor):
'skip': 'Regularly stalls travis build', # See https://travis-ci.org/rg3/youtube-dl/jobs/78878540
}]
+ PAGE_SIZE = 15
+
def _real_extract(self, url):
singer_id = self._match_id(url)
webpage = self._download_webpage(
@@ -219,25 +231,28 @@ class KuwoSingerIE(InfoExtractor):
errnote='Unable to get singer info')
singer_name = self._html_search_regex(
- r'<div class="title clearfix">\s*<h1>([^<]+)<span', webpage, 'singer name'
- )
+ r'<h1>([^<]+)</h1>', webpage, 'singer name')
- entries = []
- first_page_only = False if re.search(r'/music(?:_\d+)?\.htm', url) else True
- for page_num in itertools.count(1):
+ artist_id = self._html_search_regex(
+ r'data-artistid="(\d+)"', webpage, 'artist id')
+
+ page_count = int(self._html_search_regex(
+ r'data-page="(\d+)"', webpage, 'page count'))
+
+ def page_func(page_num):
webpage = self._download_webpage(
- 'http://www.kuwo.cn/mingxing/%s/music_%d.htm' % (singer_id, page_num),
- singer_id, note='Download song list page #%d' % page_num,
- errnote='Unable to get song list page #%d' % page_num)
+ 'http://www.kuwo.cn/artist/contentMusicsAjax',
+ singer_id, note='Download song list page #%d' % (page_num + 1),
+ errnote='Unable to get song list page #%d' % (page_num + 1),
+ query={'artistId': artist_id, 'pn': page_num, 'rn': self.PAGE_SIZE})
- entries.extend([
+ return [
self.url_result(song_url, 'Kuwo') for song_url in re.findall(
- r'<p[^>]+class="m_name"><a[^>]+href="(http://www\.kuwo\.cn/yinyue/\d+)/',
+ r'<div[^>]+class="name"><a[^>]+href="(http://www\.kuwo\.cn/yinyue/\d+)',
webpage)
- ][:10 if first_page_only else None])
+ ]
- if first_page_only or not re.search(r'<a[^>]+href="[^"]+">下一页</a>', webpage):
- break
+ entries = InAdvancePagedList(page_func, page_count, self.PAGE_SIZE)
return self.playlist_result(entries, singer_id, singer_name)
@@ -245,7 +260,7 @@ class KuwoSingerIE(InfoExtractor):
class KuwoCategoryIE(InfoExtractor):
IE_NAME = 'kuwo:category'
IE_DESC = '酷我音乐 - 分类'
- _VALID_URL = r'http://yinyue\.kuwo\.cn/yy/cinfo_(?P<id>\d+?).htm'
+ _VALID_URL = r'https?://yinyue\.kuwo\.cn/yy/cinfo_(?P<id>\d+?).htm'
_TEST = {
'url': 'http://yinyue.kuwo.cn/yy/cinfo_86375.htm',
'info_dict': {
@@ -253,7 +268,7 @@ class KuwoCategoryIE(InfoExtractor):
'title': '八十年代精选',
'description': '这些都是属于八十年代的回忆!',
},
- 'playlist_count': 30,
+ 'playlist_mincount': 24,
}
def _real_extract(self, url):
@@ -268,6 +283,8 @@ class KuwoCategoryIE(InfoExtractor):
category_desc = remove_start(
get_element_by_id('intro', webpage).strip(),
'%s简介:' % category_name)
+ if category_desc == '暂无':
+ category_desc = None
jsonm = self._parse_json(self._html_search_regex(
r'var\s+jsonm\s*=\s*([^;]+);', webpage, 'category songs'), category_id)
@@ -282,14 +299,20 @@ class KuwoCategoryIE(InfoExtractor):
class KuwoMvIE(KuwoBaseIE):
IE_NAME = 'kuwo:mv'
IE_DESC = '酷我音乐 - MV'
- _VALID_URL = r'http://www\.kuwo\.cn/mv/(?P<id>\d+?)/'
+ _VALID_URL = r'https?://www\.kuwo\.cn/mv/(?P<id>\d+?)/'
_TEST = {
'url': 'http://www.kuwo.cn/mv/6480076/',
'info_dict': {
'id': '6480076',
- 'ext': 'mkv',
- 'title': '我们家MV',
- 'creator': '2PM',
+ 'ext': 'mp4',
+ 'title': 'My HouseMV',
+ 'creator': 'PM02:00',
+ },
+ # In this video, music URLs (anti.s) are blocked outside China and
+ # USA, while the MV URL (mvurl) is available globally, so force the MV
+ # URL for consistent results in different countries
+ 'params': {
+ 'format': 'mv',
},
}
_FORMATS = KuwoBaseIE._FORMATS + [
@@ -312,7 +335,17 @@ class KuwoMvIE(KuwoBaseIE):
else:
raise ExtractorError('Unable to find song or singer names')
- formats = self._get_formats(song_id)
+ formats = self._get_formats(song_id, tolerate_ip_deny=True)
+
+ mv_url = self._download_webpage(
+ 'http://www.kuwo.cn/yy/st/mvurl?rid=MUSIC_%s' % song_id,
+ song_id, note='Download %s MV URL' % song_id)
+ formats.append({
+ 'url': mv_url,
+ 'format_id': 'mv',
+ })
+
+ self._sort_formats(formats)
return {
'id': song_id,
diff --git a/youtube_dl/extractor/laola1tv.py b/youtube_dl/extractor/laola1tv.py
index 5d8ebbe..2fab380 100644
--- a/youtube_dl/extractor/laola1tv.py
+++ b/youtube_dl/extractor/laola1tv.py
@@ -5,7 +5,7 @@ import re
from .common import InfoExtractor
from ..compat import (
- compat_urllib_parse,
+ compat_urllib_parse_urlencode,
compat_urlparse,
)
from ..utils import (
@@ -19,7 +19,7 @@ from ..utils import (
class Laola1TvIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?laola1\.tv/(?P<lang>[a-z]+)-(?P<portal>[a-z]+)/[^/]+/(?P<slug>[^/?#&]+)'
+ _VALID_URL = r'https?://(?:www\.)?laola1\.tv/(?P<lang>[a-z]+)-(?P<portal>[a-z]+)/(?P<kind>[^/]+)/(?P<slug>[^/?#&]+)'
_TESTS = [{
'url': 'http://www.laola1.tv/de-de/video/straubing-tigers-koelner-haie/227883.html',
'info_dict': {
@@ -33,7 +33,7 @@ class Laola1TvIE(InfoExtractor):
},
'params': {
'skip_download': True,
- }
+ },
}, {
'url': 'http://www.laola1.tv/de-de/video/straubing-tigers-koelner-haie',
'info_dict': {
@@ -47,17 +47,37 @@ class Laola1TvIE(InfoExtractor):
},
'params': {
'skip_download': True,
- }
+ },
+ }, {
+ 'url': 'http://www.laola1.tv/de-de/livestream/2016-03-22-belogorie-belgorod-trentino-diatec-lde',
+ 'info_dict': {
+ 'id': '487850',
+ 'display_id': '2016-03-22-belogorie-belgorod-trentino-diatec-lde',
+ 'ext': 'flv',
+ 'title': 'Belogorie BELGOROD - TRENTINO Diatec',
+ 'upload_date': '20160322',
+ 'uploader': 'CEV - Europäischer Volleyball Verband',
+ 'is_live': True,
+ 'categories': ['Volleyball'],
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ 'skip': 'This live stream has already finished.',
}]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
display_id = mobj.group('slug')
+ kind = mobj.group('kind')
lang = mobj.group('lang')
portal = mobj.group('portal')
webpage = self._download_webpage(url, display_id)
+ if 'Dieser Livestream ist bereits beendet.' in webpage:
+ raise ExtractorError('This live stream has already finished.', expected=True)
+
iframe_url = self._search_regex(
r'<iframe[^>]*?id="videoplayer"[^>]*?src="([^"]+)"',
webpage, 'iframe url')
@@ -74,7 +94,7 @@ class Laola1TvIE(InfoExtractor):
hd_doc = self._download_xml(
'http://www.laola1.tv/server/hd_video.php?%s'
- % compat_urllib_parse.urlencode({
+ % compat_urllib_parse_urlencode({
'play': video_id,
'partner': partner_id,
'portal': portal,
@@ -85,12 +105,17 @@ class Laola1TvIE(InfoExtractor):
_v = lambda x, **k: xpath_text(hd_doc, './/video/' + x, **k)
title = _v('title', fatal=True)
+ VS_TARGETS = {
+ 'video': '2',
+ 'livestream': '17',
+ }
+
req = sanitized_Request(
'https://club.laola1.tv/sp/laola1/api/v3/user/session/premium/player/stream-access?%s' %
- compat_urllib_parse.urlencode({
+ compat_urllib_parse_urlencode({
'videoId': video_id,
- 'target': '2',
- 'label': 'laola1tv',
+ 'target': VS_TARGETS.get(kind, '2'),
+ 'label': _v('label'),
'area': _v('area'),
}),
urlencode_postdata(
@@ -109,6 +134,7 @@ class Laola1TvIE(InfoExtractor):
formats = self._extract_f4m_formats(
'%s?hdnea=%s&hdcore=3.2.0' % (token_attrib['url'], token_auth),
video_id, f4m_id='hds')
+ self._sort_formats(formats)
categories_str = _v('meta_sports')
categories = categories_str.split(',') if categories_str else []
diff --git a/youtube_dl/extractor/learnr.py b/youtube_dl/extractor/learnr.py
new file mode 100644
index 0000000..1435e09
--- /dev/null
+++ b/youtube_dl/extractor/learnr.py
@@ -0,0 +1,33 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class LearnrIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?learnr\.pro/view/video/(?P<id>[0-9]+)'
+ _TEST = {
+ 'url': 'http://www.learnr.pro/view/video/51624-web-development-tutorial-for-beginners-1-how-to-build-webpages-with-html-css-javascript',
+ 'md5': '3719fdf0a68397f49899e82c308a89de',
+ 'info_dict': {
+ 'id': '51624',
+ 'ext': 'mp4',
+ 'title': 'Web Development Tutorial for Beginners (#1) - How to build webpages with HTML, CSS, Javascript',
+ 'description': 'md5:b36dbfa92350176cdf12b4d388485503',
+ 'uploader': 'LearnCode.academy',
+ 'uploader_id': 'learncodeacademy',
+ 'upload_date': '20131021',
+ },
+ 'add_ie': ['Youtube'],
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ return {
+ '_type': 'url_transparent',
+ 'url': self._search_regex(
+ r"videoId\s*:\s*'([^']+)'", webpage, 'youtube id'),
+ 'id': video_id,
+ }
diff --git a/youtube_dl/extractor/lecture2go.py b/youtube_dl/extractor/lecture2go.py
index 40a3d23..81b5d41 100644
--- a/youtube_dl/extractor/lecture2go.py
+++ b/youtube_dl/extractor/lecture2go.py
@@ -6,6 +6,7 @@ import re
from .common import InfoExtractor
from ..utils import (
determine_ext,
+ determine_protocol,
parse_duration,
int_or_none,
)
@@ -18,10 +19,14 @@ class Lecture2GoIE(InfoExtractor):
'md5': 'ac02b570883020d208d405d5a3fd2f7f',
'info_dict': {
'id': '17473',
- 'ext': 'flv',
+ 'ext': 'mp4',
'title': '2 - Endliche Automaten und reguläre Sprachen',
'creator': 'Frank Heitmann',
'duration': 5220,
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
}
}
@@ -32,14 +37,18 @@ class Lecture2GoIE(InfoExtractor):
title = self._html_search_regex(r'<em[^>]+class="title">(.+)</em>', webpage, 'title')
formats = []
- for url in set(re.findall(r'"src","([^"]+)"', webpage)):
+ for url in set(re.findall(r'var\s+playerUri\d+\s*=\s*"([^"]+)"', webpage)):
ext = determine_ext(url)
+ protocol = determine_protocol({'url': url})
if ext == 'f4m':
- formats.extend(self._extract_f4m_formats(url, video_id))
+ formats.extend(self._extract_f4m_formats(url, video_id, f4m_id='hds'))
elif ext == 'm3u8':
- formats.extend(self._extract_m3u8_formats(url, video_id))
+ formats.extend(self._extract_m3u8_formats(url, video_id, ext='mp4', m3u8_id='hls'))
else:
+ if protocol == 'rtmp':
+ continue # XXX: currently broken
formats.append({
+ 'format_id': protocol,
'url': url,
})
diff --git a/youtube_dl/extractor/letv.py b/youtube_dl/extractor/leeco.py
index 9665ece..63f581c 100644
--- a/youtube_dl/extractor/letv.py
+++ b/youtube_dl/extractor/leeco.py
@@ -1,36 +1,39 @@
# coding: utf-8
from __future__ import unicode_literals
+import base64
import datetime
+import hashlib
import re
import time
-import base64
-import hashlib
from .common import InfoExtractor
from ..compat import (
- compat_urllib_parse,
compat_ord,
compat_str,
+ compat_urllib_parse_urlencode,
)
from ..utils import (
determine_ext,
+ encode_data_uri,
ExtractorError,
+ int_or_none,
+ orderedSet,
parse_iso8601,
sanitized_Request,
- int_or_none,
str_or_none,
- encode_data_uri,
url_basename,
)
-class LetvIE(InfoExtractor):
+class LeIE(InfoExtractor):
IE_DESC = '乐视网'
- _VALID_URL = r'http://www\.letv\.com/ptv/vplay/(?P<id>\d+).html'
+ _VALID_URL = r'https?://(?:www\.le\.com/ptv/vplay|sports\.le\.com/video)/(?P<id>\d+)\.html'
+
+ _URL_TEMPLATE = 'http://www.le.com/ptv/vplay/%s.html'
_TESTS = [{
- 'url': 'http://www.letv.com/ptv/vplay/22005890.html',
+ 'url': 'http://www.le.com/ptv/vplay/22005890.html',
'md5': 'edadcfe5406976f42f9f266057ee5e40',
'info_dict': {
'id': '22005890',
@@ -42,7 +45,7 @@ class LetvIE(InfoExtractor):
'hls_prefer_native': True,
},
}, {
- 'url': 'http://www.letv.com/ptv/vplay/1415246.html',
+ 'url': 'http://www.le.com/ptv/vplay/1415246.html',
'info_dict': {
'id': '1415246',
'ext': 'mp4',
@@ -54,7 +57,7 @@ class LetvIE(InfoExtractor):
},
}, {
'note': 'This video is available only in Mainland China, thus a proxy is needed',
- 'url': 'http://www.letv.com/ptv/vplay/1118082.html',
+ 'url': 'http://www.le.com/ptv/vplay/1118082.html',
'md5': '2424c74948a62e5f31988438979c5ad1',
'info_dict': {
'id': '1118082',
@@ -66,6 +69,9 @@ class LetvIE(InfoExtractor):
'hls_prefer_native': True,
},
'skip': 'Only available in China',
+ }, {
+ 'url': 'http://sports.le.com/video/25737697.html',
+ 'only_matching': True,
}]
@staticmethod
@@ -94,17 +100,16 @@ class LetvIE(InfoExtractor):
return encrypted_data
encrypted_data = encrypted_data[5:]
- _loc4_ = bytearray()
- while encrypted_data:
- b = compat_ord(encrypted_data[0])
- _loc4_.extend([b // 16, b & 0x0f])
- encrypted_data = encrypted_data[1:]
+ _loc4_ = bytearray(2 * len(encrypted_data))
+ for idx, val in enumerate(encrypted_data):
+ b = compat_ord(val)
+ _loc4_[2 * idx] = b // 16
+ _loc4_[2 * idx + 1] = b % 16
idx = len(_loc4_) - 11
_loc4_ = _loc4_[idx:] + _loc4_[:idx]
- _loc7_ = bytearray()
- while _loc4_:
- _loc7_.append(_loc4_[0] * 16 + _loc4_[1])
- _loc4_ = _loc4_[2:]
+ _loc7_ = bytearray(len(encrypted_data))
+ for i in range(len(encrypted_data)):
+ _loc7_[i] = _loc4_[2 * i] * 16 + _loc4_[2 * i + 1]
return bytes(_loc7_)
@@ -117,10 +122,10 @@ class LetvIE(InfoExtractor):
'splatid': 101,
'format': 1,
'tkey': self.calc_time_key(int(time.time())),
- 'domain': 'www.letv.com'
+ 'domain': 'www.le.com'
}
play_json_req = sanitized_Request(
- 'http://api.letv.com/mms/out/video/playJson?' + compat_urllib_parse.urlencode(params)
+ 'http://api.le.com/mms/out/video/playJson?' + compat_urllib_parse_urlencode(params)
)
cn_verification_proxy = self._downloader.params.get('cn_verification_proxy')
if cn_verification_proxy:
@@ -149,7 +154,7 @@ class LetvIE(InfoExtractor):
for format_id in formats:
if format_id in dispatch:
media_url = playurl['domain'][0] + dispatch[format_id][0]
- media_url += '&' + compat_urllib_parse.urlencode({
+ media_url += '&' + compat_urllib_parse_urlencode({
'm3v': 1,
'format': 1,
'expect': 3,
@@ -193,26 +198,51 @@ class LetvIE(InfoExtractor):
}
-class LetvTvIE(InfoExtractor):
- _VALID_URL = r'http://www.letv.com/tv/(?P<id>\d+).html'
+class LePlaylistIE(InfoExtractor):
+ _VALID_URL = r'https?://[a-z]+\.le\.com/(?!video)[a-z]+/(?P<id>[a-z0-9_]+)'
+
_TESTS = [{
- 'url': 'http://www.letv.com/tv/46177.html',
+ 'url': 'http://www.le.com/tv/46177.html',
'info_dict': {
'id': '46177',
'title': '美人天下',
'description': 'md5:395666ff41b44080396e59570dbac01c'
},
'playlist_count': 35
+ }, {
+ 'url': 'http://tv.le.com/izt/wuzetian/index.html',
+ 'info_dict': {
+ 'id': 'wuzetian',
+ 'title': '武媚娘传奇',
+ 'description': 'md5:e12499475ab3d50219e5bba00b3cb248'
+ },
+ # This playlist contains some extra videos other than the drama itself
+ 'playlist_mincount': 96
+ }, {
+ 'url': 'http://tv.le.com/pzt/lswjzzjc/index.shtml',
+ # This series is moved to http://www.le.com/tv/10005297.html
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.le.com/comic/92063.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://list.le.com/listn/c1009_sc532002_d2_p1_o1.html',
+ 'only_matching': True,
}]
+ @classmethod
+ def suitable(cls, url):
+ return False if LeIE.suitable(url) else super(LePlaylistIE, cls).suitable(url)
+
def _real_extract(self, url):
playlist_id = self._match_id(url)
page = self._download_webpage(url, playlist_id)
- media_urls = list(set(re.findall(
- r'http://www.letv.com/ptv/vplay/\d+.html', page)))
- entries = [self.url_result(media_url, ie='Letv')
- for media_url in media_urls]
+ # Currently old domain names are still used in playlists
+ media_ids = orderedSet(re.findall(
+ r'<a[^>]+href="http://www\.letv\.com/ptv/vplay/(\d+)\.html', page))
+ entries = [self.url_result(LeIE._URL_TEMPLATE % media_id, ie='Le')
+ for media_id in media_ids]
title = self._html_search_meta('keywords', page,
fatal=False).split(',')[0]
@@ -222,31 +252,9 @@ class LetvTvIE(InfoExtractor):
playlist_description=description)
-class LetvPlaylistIE(LetvTvIE):
- _VALID_URL = r'http://tv.letv.com/[a-z]+/(?P<id>[a-z]+)/index.s?html'
- _TESTS = [{
- 'url': 'http://tv.letv.com/izt/wuzetian/index.html',
- 'info_dict': {
- 'id': 'wuzetian',
- 'title': '武媚娘传奇',
- 'description': 'md5:e12499475ab3d50219e5bba00b3cb248'
- },
- # This playlist contains some extra videos other than the drama itself
- 'playlist_mincount': 96
- }, {
- 'url': 'http://tv.letv.com/pzt/lswjzzjc/index.shtml',
- 'info_dict': {
- 'id': 'lswjzzjc',
- # The title should be "劲舞青春", but I can't find a simple way to
- # determine the playlist title
- 'title': '乐视午间自制剧场',
- 'description': 'md5:b1eef244f45589a7b5b1af9ff25a4489'
- },
- 'playlist_mincount': 7
- }]
-
-
class LetvCloudIE(InfoExtractor):
+ # Most of *.letv.com is changed to *.le.com on 2016/01/02
+ # but yuntv.letv.com is kept, so also keep the extractor name
IE_DESC = '乐视云'
_VALID_URL = r'https?://yuntv\.letv\.com/bcloud.html\?.+'
@@ -300,7 +308,7 @@ class LetvCloudIE(InfoExtractor):
}
self.sign_data(data)
return self._download_json(
- 'http://api.letvcloud.com/gpc.php?' + compat_urllib_parse.urlencode(data),
+ 'http://api.letvcloud.com/gpc.php?' + compat_urllib_parse_urlencode(data),
media_id, 'Downloading playJson data for type %s' % cf)
play_json = get_play_json(cf, time.time())
@@ -327,7 +335,7 @@ class LetvCloudIE(InfoExtractor):
formats.append({
'url': url,
'ext': determine_ext(decoded_url),
- 'format_id': int_or_none(play_url.get('vtype')),
+ 'format_id': str_or_none(play_url.get('vtype')),
'format_note': str_or_none(play_url.get('definition')),
'width': int_or_none(play_url.get('vwidth')),
'height': int_or_none(play_url.get('vheight')),
diff --git a/youtube_dl/extractor/libraryofcongress.py b/youtube_dl/extractor/libraryofcongress.py
new file mode 100644
index 0000000..0a94366
--- /dev/null
+++ b/youtube_dl/extractor/libraryofcongress.py
@@ -0,0 +1,143 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+from ..utils import (
+ determine_ext,
+ float_or_none,
+ int_or_none,
+ parse_filesize,
+)
+
+
+class LibraryOfCongressIE(InfoExtractor):
+ IE_NAME = 'loc'
+ IE_DESC = 'Library of Congress'
+ _VALID_URL = r'https?://(?:www\.)?loc\.gov/(?:item/|today/cyberlc/feature_wdesc\.php\?.*\brec=)(?P<id>[0-9]+)'
+ _TESTS = [{
+ # embedded via <div class="media-player"
+ 'url': 'http://loc.gov/item/90716351/',
+ 'md5': '353917ff7f0255aa6d4b80a034833de8',
+ 'info_dict': {
+ 'id': '90716351',
+ 'ext': 'mp4',
+ 'title': "Pa's trip to Mars",
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'duration': 0,
+ 'view_count': int,
+ },
+ }, {
+ # webcast embedded via mediaObjectId
+ 'url': 'https://www.loc.gov/today/cyberlc/feature_wdesc.php?rec=5578',
+ 'info_dict': {
+ 'id': '5578',
+ 'ext': 'mp4',
+ 'title': 'Help! Preservation Training Needs Here, There & Everywhere',
+ 'duration': 3765,
+ 'view_count': int,
+ 'subtitles': 'mincount:1',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ # with direct download links
+ 'url': 'https://www.loc.gov/item/78710669/',
+ 'info_dict': {
+ 'id': '78710669',
+ 'ext': 'mp4',
+ 'title': 'La vie et la passion de Jesus-Christ',
+ 'duration': 0,
+ 'view_count': int,
+ 'formats': 'mincount:4',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ media_id = self._search_regex(
+ (r'id=(["\'])media-player-(?P<id>.+?)\1',
+ r'<video[^>]+id=(["\'])uuid-(?P<id>.+?)\1',
+ r'<video[^>]+data-uuid=(["\'])(?P<id>.+?)\1',
+ r'mediaObjectId\s*:\s*(["\'])(?P<id>.+?)\1'),
+ webpage, 'media id', group='id')
+
+ data = self._download_json(
+ 'https://media.loc.gov/services/v1/media?id=%s&context=json' % media_id,
+ video_id)['mediaObject']
+
+ derivative = data['derivatives'][0]
+ media_url = derivative['derivativeUrl']
+
+ title = derivative.get('shortName') or data.get('shortName') or self._og_search_title(
+ webpage)
+
+ # Following algorithm was extracted from setAVSource js function
+ # found in webpage
+ media_url = media_url.replace('rtmp', 'https')
+
+ is_video = data.get('mediaType', 'v').lower() == 'v'
+ ext = determine_ext(media_url)
+ if ext not in ('mp4', 'mp3'):
+ media_url += '.mp4' if is_video else '.mp3'
+
+ if 'vod/mp4:' in media_url:
+ formats = [{
+ 'url': media_url.replace('vod/mp4:', 'hls-vod/media/') + '.m3u8',
+ 'format_id': 'hls',
+ 'ext': 'mp4',
+ 'protocol': 'm3u8_native',
+ 'quality': 1,
+ }]
+ elif 'vod/mp3:' in media_url:
+ formats = [{
+ 'url': media_url.replace('vod/mp3:', ''),
+ 'vcodec': 'none',
+ }]
+
+ download_urls = set()
+ for m in re.finditer(
+ r'<option[^>]+value=(["\'])(?P<url>.+?)\1[^>]+data-file-download=[^>]+>\s*(?P<id>.+?)(?:(?:&nbsp;|\s+)\((?P<size>.+?)\))?\s*<', webpage):
+ format_id = m.group('id').lower()
+ if format_id == 'gif':
+ continue
+ download_url = m.group('url')
+ if download_url in download_urls:
+ continue
+ download_urls.add(download_url)
+ formats.append({
+ 'url': download_url,
+ 'format_id': format_id,
+ 'filesize_approx': parse_filesize(m.group('size')),
+ })
+
+ self._sort_formats(formats)
+
+ duration = float_or_none(data.get('duration'))
+ view_count = int_or_none(data.get('viewCount'))
+
+ subtitles = {}
+ cc_url = data.get('ccUrl')
+ if cc_url:
+ subtitles.setdefault('en', []).append({
+ 'url': cc_url,
+ 'ext': 'ttml',
+ })
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'thumbnail': self._og_search_thumbnail(webpage, default=None),
+ 'duration': duration,
+ 'view_count': view_count,
+ 'formats': formats,
+ 'subtitles': subtitles,
+ }
diff --git a/youtube_dl/extractor/lifenews.py b/youtube_dl/extractor/lifenews.py
index f8cbca7..c2b4490 100644
--- a/youtube_dl/extractor/lifenews.py
+++ b/youtube_dl/extractor/lifenews.py
@@ -7,124 +7,164 @@ from .common import InfoExtractor
from ..compat import compat_urlparse
from ..utils import (
determine_ext,
+ ExtractorError,
int_or_none,
+ parse_iso8601,
remove_end,
- unified_strdate,
- ExtractorError,
)
class LifeNewsIE(InfoExtractor):
- IE_NAME = 'lifenews'
- IE_DESC = 'LIFE | NEWS'
- _VALID_URL = r'http://lifenews\.ru/(?:mobile/)?(?P<section>news|video)/(?P<id>\d+)'
+ IE_NAME = 'life'
+ IE_DESC = 'Life.ru'
+ _VALID_URL = r'https?://life\.ru/t/[^/]+/(?P<id>\d+)'
_TESTS = [{
- 'url': 'http://lifenews.ru/news/126342',
- 'md5': 'e1b50a5c5fb98a6a544250f2e0db570a',
+ # single video embedded via video/source
+ 'url': 'https://life.ru/t/новости/98736',
+ 'md5': '77c95eaefaca216e32a76a343ad89d23',
'info_dict': {
- 'id': '126342',
+ 'id': '98736',
'ext': 'mp4',
- 'title': 'МВД разыскивает мужчин, оставивших в IKEA сумку с автоматом',
- 'description': 'Камеры наблюдения гипермаркета зафиксировали троих мужчин, спрятавших оружейный арсенал в камере хранения.',
- 'thumbnail': 're:http://.*\.jpg',
- 'upload_date': '20140130',
+ 'title': 'Мужчина нашел дома архив оборонного завода',
+ 'description': 'md5:3b06b1b39b5e2bea548e403d99b8bf26',
+ 'timestamp': 1344154740,
+ 'upload_date': '20120805',
+ 'view_count': int,
}
}, {
- # video in <iframe>
- 'url': 'http://lifenews.ru/news/152125',
+ # single video embedded via iframe
+ 'url': 'https://life.ru/t/новости/152125',
'md5': '77d19a6f0886cd76bdbf44b4d971a273',
'info_dict': {
'id': '152125',
'ext': 'mp4',
'title': 'В Сети появилось видео захвата «Правым сектором» колхозных полей ',
'description': 'Жители двух поселков Днепропетровской области не простили радикалам угрозу лишения плодородных земель и пошли в лобовую. ',
+ 'timestamp': 1427961840,
'upload_date': '20150402',
+ 'view_count': int,
}
}, {
- 'url': 'http://lifenews.ru/news/153461',
- 'md5': '9b6ef8bc0ffa25aebc8bdb40d89ab795',
+ # two videos embedded via iframe
+ 'url': 'https://life.ru/t/новости/153461',
'info_dict': {
'id': '153461',
- 'ext': 'mp4',
'title': 'В Москве спасли потерявшегося медвежонка, который спрятался на дереве',
'description': 'Маленький хищник не смог найти дорогу домой и обрел временное убежище на тополе недалеко от жилого массива, пока его не нашла соседская собака.',
- 'upload_date': '20150505',
- }
+ 'timestamp': 1430825520,
+ 'view_count': int,
+ },
+ 'playlist': [{
+ 'md5': '9b6ef8bc0ffa25aebc8bdb40d89ab795',
+ 'info_dict': {
+ 'id': '153461-video1',
+ 'ext': 'mp4',
+ 'title': 'В Москве спасли потерявшегося медвежонка, который спрятался на дереве (Видео 1)',
+ 'description': 'Маленький хищник не смог найти дорогу домой и обрел временное убежище на тополе недалеко от жилого массива, пока его не нашла соседская собака.',
+ 'timestamp': 1430825520,
+ 'upload_date': '20150505',
+ },
+ }, {
+ 'md5': 'ebb3bf3b1ce40e878d0d628e93eb0322',
+ 'info_dict': {
+ 'id': '153461-video2',
+ 'ext': 'mp4',
+ 'title': 'В Москве спасли потерявшегося медвежонка, который спрятался на дереве (Видео 2)',
+ 'description': 'Маленький хищник не смог найти дорогу домой и обрел временное убежище на тополе недалеко от жилого массива, пока его не нашла соседская собака.',
+ 'timestamp': 1430825520,
+ 'upload_date': '20150505',
+ },
+ }],
+ }, {
+ 'url': 'https://life.ru/t/новости/213035',
+ 'only_matching': True,
}, {
- 'url': 'http://lifenews.ru/video/13035',
+ 'url': 'https://life.ru/t/%D0%BD%D0%BE%D0%B2%D0%BE%D1%81%D1%82%D0%B8/153461',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://life.ru/t/новости/411489/manuel_vals_nazval_frantsiiu_tsieliu_nomier_odin_dlia_ighil',
'only_matching': True,
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
- section = mobj.group('section')
-
- webpage = self._download_webpage(
- 'http://lifenews.ru/%s/%s' % (section, video_id),
- video_id, 'Downloading page')
-
- videos = re.findall(r'<video.*?poster="(?P<poster>[^"]+)".*?src="(?P<video>[^"]+)".*?></video>', webpage)
- iframe_link = self._html_search_regex(
- '<iframe[^>]+src=["\']([^"\']+)["\']', webpage, 'iframe link', default=None)
- if not videos and not iframe_link:
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ video_urls = re.findall(
+ r'<video[^>]+><source[^>]+src=["\'](.+?)["\']', webpage)
+
+ iframe_links = re.findall(
+ r'<iframe[^>]+src=["\']((?:https?:)?//embed\.life\.ru/embed/.+?)["\']',
+ webpage)
+
+ if not video_urls and not iframe_links:
raise ExtractorError('No media links available for %s' % video_id)
title = remove_end(
self._og_search_title(webpage),
- ' - Первый по срочным новостям — LIFE | NEWS')
+ ' - Life.ru')
description = self._og_search_description(webpage)
view_count = self._html_search_regex(
- r'<div class=\'views\'>\s*(\d+)\s*</div>', webpage, 'view count', fatal=False)
- comment_count = self._html_search_regex(
- r'=\'commentCount\'[^>]*>\s*(\d+)\s*<',
- webpage, 'comment count', fatal=False)
+ r'<div[^>]+class=(["\']).*?\bhits-count\b.*?\1[^>]*>\s*(?P<value>\d+)\s*</div>',
+ webpage, 'view count', fatal=False, group='value')
- upload_date = self._html_search_regex(
- r'<time[^>]*datetime=\'([^\']+)\'', webpage, 'upload date', fatal=False)
- if upload_date is not None:
- upload_date = unified_strdate(upload_date)
+ timestamp = parse_iso8601(self._search_regex(
+ r'<time[^>]+datetime=(["\'])(?P<value>.+?)\1',
+ webpage, 'upload date', fatal=False, group='value'))
common_info = {
'description': description,
'view_count': int_or_none(view_count),
- 'comment_count': int_or_none(comment_count),
- 'upload_date': upload_date,
+ 'timestamp': timestamp,
}
- def make_entry(video_id, media, video_number=None):
+ def make_entry(video_id, video_url, index=None):
cur_info = dict(common_info)
cur_info.update({
- 'id': video_id,
- 'url': media[1],
- 'thumbnail': media[0],
- 'title': title if video_number is None else '%s-video%s' % (title, video_number),
+ 'id': video_id if not index else '%s-video%s' % (video_id, index),
+ 'url': video_url,
+ 'title': title if not index else '%s (Видео %s)' % (title, index),
})
return cur_info
- if iframe_link:
- iframe_link = self._proto_relative_url(iframe_link, 'http:')
- cur_info = dict(common_info)
- cur_info.update({
- '_type': 'url_transparent',
- 'id': video_id,
- 'title': title,
- 'url': iframe_link,
- })
+ def make_video_entry(video_id, video_url, index=None):
+ video_url = compat_urlparse.urljoin(url, video_url)
+ return make_entry(video_id, video_url, index)
+
+ def make_iframe_entry(video_id, video_url, index=None):
+ video_url = self._proto_relative_url(video_url, 'http:')
+ cur_info = make_entry(video_id, video_url, index)
+ cur_info['_type'] = 'url_transparent'
return cur_info
- if len(videos) == 1:
- return make_entry(video_id, videos[0])
- else:
- return [make_entry(video_id, media, video_number + 1) for video_number, media in enumerate(videos)]
+ if len(video_urls) == 1 and not iframe_links:
+ return make_video_entry(video_id, video_urls[0])
+
+ if len(iframe_links) == 1 and not video_urls:
+ return make_iframe_entry(video_id, iframe_links[0])
+
+ entries = []
+
+ if video_urls:
+ for num, video_url in enumerate(video_urls, 1):
+ entries.append(make_video_entry(video_id, video_url, num))
+
+ if iframe_links:
+ for num, iframe_link in enumerate(iframe_links, len(video_urls) + 1):
+ entries.append(make_iframe_entry(video_id, iframe_link, num))
+
+ playlist = common_info.copy()
+ playlist.update(self.playlist_result(entries, video_id, title, description))
+ return playlist
class LifeEmbedIE(InfoExtractor):
IE_NAME = 'life:embed'
- _VALID_URL = r'http://embed\.life\.ru/embed/(?P<id>[\da-f]{32})'
+ _VALID_URL = r'https?://embed\.life\.ru/embed/(?P<id>[\da-f]{32})'
_TEST = {
'url': 'http://embed.life.ru/embed/e50c2dec2867350528e2574c899b8291',
@@ -148,7 +188,8 @@ class LifeEmbedIE(InfoExtractor):
ext = determine_ext(video_url)
if ext == 'm3u8':
formats.extend(self._extract_m3u8_formats(
- video_url, video_id, 'mp4', m3u8_id='m3u8'))
+ video_url, video_id, 'mp4',
+ entry_protocol='m3u8_native', m3u8_id='m3u8'))
else:
formats.append({
'url': video_url,
diff --git a/youtube_dl/extractor/limelight.py b/youtube_dl/extractor/limelight.py
index 1a0625a..5d2c3e2 100644
--- a/youtube_dl/extractor/limelight.py
+++ b/youtube_dl/extractor/limelight.py
@@ -98,13 +98,19 @@ class LimelightBaseIE(InfoExtractor):
} for thumbnail in properties.get('thumbnails', []) if thumbnail.get('url')]
subtitles = {}
- for caption in properties.get('captions', {}):
+ for caption in properties.get('captions', []):
lang = caption.get('language_code')
subtitles_url = caption.get('url')
if lang and subtitles_url:
- subtitles[lang] = [{
+ subtitles.setdefault(lang, []).append({
'url': subtitles_url,
- }]
+ })
+ closed_captions_url = properties.get('closed_captions_url')
+ if closed_captions_url:
+ subtitles.setdefault('en', []).append({
+ 'url': closed_captions_url,
+ 'ext': 'ttml',
+ })
return {
'id': video_id,
@@ -123,7 +129,18 @@ class LimelightBaseIE(InfoExtractor):
class LimelightMediaIE(LimelightBaseIE):
IE_NAME = 'limelight'
- _VALID_URL = r'(?:limelight:media:|http://link\.videoplatform\.limelight\.com/media/\??\bmediaId=)(?P<id>[a-z0-9]{32})'
+ _VALID_URL = r'''(?x)
+ (?:
+ limelight:media:|
+ https?://
+ (?:
+ link\.videoplatform\.limelight\.com/media/|
+ assets\.delvenetworks\.com/player/loader\.swf
+ )
+ \?.*?\bmediaId=
+ )
+ (?P<id>[a-z0-9]{32})
+ '''
_TESTS = [{
'url': 'http://link.videoplatform.limelight.com/media/?mediaId=3ffd040b522b4485b6d84effc750cd86',
'info_dict': {
@@ -158,6 +175,9 @@ class LimelightMediaIE(LimelightBaseIE):
# rtmp download
'skip_download': True,
},
+ }, {
+ 'url': 'https://assets.delvenetworks.com/player/loader.swf?mediaId=8018a574f08d416e95ceaccae4ba0452',
+ 'only_matching': True,
}]
_PLAYLIST_SERVICE_PATH = 'media'
_API_PATH = 'media'
@@ -176,15 +196,29 @@ class LimelightMediaIE(LimelightBaseIE):
class LimelightChannelIE(LimelightBaseIE):
IE_NAME = 'limelight:channel'
- _VALID_URL = r'(?:limelight:channel:|http://link\.videoplatform\.limelight\.com/media/\??\bchannelId=)(?P<id>[a-z0-9]{32})'
- _TEST = {
+ _VALID_URL = r'''(?x)
+ (?:
+ limelight:channel:|
+ https?://
+ (?:
+ link\.videoplatform\.limelight\.com/media/|
+ assets\.delvenetworks\.com/player/loader\.swf
+ )
+ \?.*?\bchannelId=
+ )
+ (?P<id>[a-z0-9]{32})
+ '''
+ _TESTS = [{
'url': 'http://link.videoplatform.limelight.com/media/?channelId=ab6a524c379342f9b23642917020c082',
'info_dict': {
'id': 'ab6a524c379342f9b23642917020c082',
'title': 'Javascript Sample Code',
},
'playlist_mincount': 3,
- }
+ }, {
+ 'url': 'http://assets.delvenetworks.com/player/loader.swf?channelId=ab6a524c379342f9b23642917020c082',
+ 'only_matching': True,
+ }]
_PLAYLIST_SERVICE_PATH = 'channel'
_API_PATH = 'channels'
@@ -207,15 +241,29 @@ class LimelightChannelIE(LimelightBaseIE):
class LimelightChannelListIE(LimelightBaseIE):
IE_NAME = 'limelight:channel_list'
- _VALID_URL = r'(?:limelight:channel_list:|http://link\.videoplatform\.limelight\.com/media/\?.*?\bchannelListId=)(?P<id>[a-z0-9]{32})'
- _TEST = {
+ _VALID_URL = r'''(?x)
+ (?:
+ limelight:channel_list:|
+ https?://
+ (?:
+ link\.videoplatform\.limelight\.com/media/|
+ assets\.delvenetworks\.com/player/loader\.swf
+ )
+ \?.*?\bchannelListId=
+ )
+ (?P<id>[a-z0-9]{32})
+ '''
+ _TESTS = [{
'url': 'http://link.videoplatform.limelight.com/media/?channelListId=301b117890c4465c8179ede21fd92e2b',
'info_dict': {
'id': '301b117890c4465c8179ede21fd92e2b',
'title': 'Website - Hero Player',
},
'playlist_mincount': 2,
- }
+ }, {
+ 'url': 'https://assets.delvenetworks.com/player/loader.swf?channelListId=301b117890c4465c8179ede21fd92e2b',
+ 'only_matching': True,
+ }]
_PLAYLIST_SERVICE_PATH = 'channel_list'
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/litv.py b/youtube_dl/extractor/litv.py
new file mode 100644
index 0000000..3356d01
--- /dev/null
+++ b/youtube_dl/extractor/litv.py
@@ -0,0 +1,137 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ int_or_none,
+ smuggle_url,
+ unsmuggle_url,
+)
+
+
+class LiTVIE(InfoExtractor):
+ _VALID_URL = r'https?://www\.litv\.tv/vod/[^/]+/content\.do\?.*?\bid=(?P<id>[^&]+)'
+
+ _URL_TEMPLATE = 'https://www.litv.tv/vod/%s/content.do?id=%s'
+
+ _TESTS = [{
+ 'url': 'https://www.litv.tv/vod/drama/content.do?brc_id=root&id=VOD00041610&isUHEnabled=true&autoPlay=1',
+ 'info_dict': {
+ 'id': 'VOD00041606',
+ 'title': '花千骨',
+ },
+ 'playlist_count': 50,
+ }, {
+ 'url': 'https://www.litv.tv/vod/drama/content.do?brc_id=root&id=VOD00041610&isUHEnabled=true&autoPlay=1',
+ 'info_dict': {
+ 'id': 'VOD00041610',
+ 'ext': 'mp4',
+ 'title': '花千骨第1集',
+ 'thumbnail': 're:https?://.*\.jpg$',
+ 'description': 'md5:c7017aa144c87467c4fb2909c4b05d6f',
+ 'episode_number': 1,
+ },
+ 'params': {
+ 'noplaylist': True,
+ 'skip_download': True, # m3u8 download
+ },
+ 'skip': 'Georestricted to Taiwan',
+ }]
+
+ def _extract_playlist(self, season_list, video_id, vod_data, view_data, prompt=True):
+ episode_title = view_data['title']
+ content_id = season_list['contentId']
+
+ if prompt:
+ self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (content_id, video_id))
+
+ all_episodes = [
+ self.url_result(smuggle_url(
+ self._URL_TEMPLATE % (view_data['contentType'], episode['contentId']),
+ {'force_noplaylist': True})) # To prevent infinite recursion
+ for episode in season_list['episode']]
+
+ return self.playlist_result(all_episodes, content_id, episode_title)
+
+ def _real_extract(self, url):
+ url, data = unsmuggle_url(url, {})
+
+ video_id = self._match_id(url)
+
+ noplaylist = self._downloader.params.get('noplaylist')
+ noplaylist_prompt = True
+ if 'force_noplaylist' in data:
+ noplaylist = data['force_noplaylist']
+ noplaylist_prompt = False
+
+ webpage = self._download_webpage(url, video_id)
+
+ view_data = dict(map(lambda t: (t[0], t[2]), re.findall(
+ r'viewData\.([a-zA-Z]+)\s*=\s*(["\'])([^"\']+)\2',
+ webpage)))
+
+ vod_data = self._parse_json(self._search_regex(
+ 'var\s+vod\s*=\s*([^;]+)', webpage, 'VOD data', default='{}'),
+ video_id)
+
+ season_list = list(vod_data.get('seasonList', {}).values())
+ if season_list:
+ if not noplaylist:
+ return self._extract_playlist(
+ season_list[0], video_id, vod_data, view_data,
+ prompt=noplaylist_prompt)
+
+ if noplaylist_prompt:
+ self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
+
+ # In browsers `getMainUrl` request is always issued. Usually this
+ # endpoint gives the same result as the data embedded in the webpage.
+ # If georestricted, there are no embedded data, so an extra request is
+ # necessary to get the error code
+ video_data = self._parse_json(self._search_regex(
+ r'uiHlsUrl\s*=\s*testBackendData\(([^;]+)\);',
+ webpage, 'video data', default='{}'), video_id)
+ if not video_data:
+ payload = {
+ 'assetId': view_data['assetId'],
+ 'watchDevices': vod_data['watchDevices'],
+ 'contentType': view_data['contentType'],
+ }
+ video_data = self._download_json(
+ 'https://www.litv.tv/vod/getMainUrl', video_id,
+ data=json.dumps(payload).encode('utf-8'),
+ headers={'Content-Type': 'application/json'})
+
+ if not video_data.get('fullpath'):
+ error_msg = video_data.get('errorMessage')
+ if error_msg == 'vod.error.outsideregionerror':
+ self.raise_geo_restricted('This video is available in Taiwan only')
+ if error_msg:
+ raise ExtractorError('%s said: %s' % (self.IE_NAME, error_msg), expected=True)
+ raise ExtractorError('Unexpected result from %s' % self.IE_NAME)
+
+ formats = self._extract_m3u8_formats(
+ video_data['fullpath'], video_id, ext='mp4', m3u8_id='hls')
+ for a_format in formats:
+ # LiTV HLS segments doesn't like compressions
+ a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = True
+
+ title = view_data['title'] + view_data.get('secondaryMark', '')
+ description = view_data.get('description')
+ thumbnail = view_data.get('imageFile')
+ categories = [item['name'] for item in vod_data.get('category', [])]
+ episode = int_or_none(view_data.get('episode'))
+
+ return {
+ 'id': video_id,
+ 'formats': formats,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'categories': categories,
+ 'episode_number': episode,
+ }
diff --git a/youtube_dl/extractor/liveleak.py b/youtube_dl/extractor/liveleak.py
index 4684994..ea0565a 100644
--- a/youtube_dl/extractor/liveleak.py
+++ b/youtube_dl/extractor/liveleak.py
@@ -17,7 +17,8 @@ class LiveLeakIE(InfoExtractor):
'ext': 'flv',
'description': 'extremely bad day for this guy..!',
'uploader': 'ljfriel2',
- 'title': 'Most unlucky car accident'
+ 'title': 'Most unlucky car accident',
+ 'thumbnail': 're:^https?://.*\.jpg$'
}
}, {
'url': 'http://www.liveleak.com/view?i=f93_1390833151',
@@ -28,6 +29,7 @@ class LiveLeakIE(InfoExtractor):
'description': 'German Television Channel NDR does an exclusive interview with Edward Snowden.\r\nUploaded on LiveLeak cause German Television thinks the rest of the world isn\'t intereseted in Edward Snowden.',
'uploader': 'ARD_Stinkt',
'title': 'German Television does first Edward Snowden Interview (ENGLISH)',
+ 'thumbnail': 're:^https?://.*\.jpg$'
}
}, {
'url': 'http://www.liveleak.com/view?i=4f7_1392687779',
@@ -49,10 +51,19 @@ class LiveLeakIE(InfoExtractor):
'ext': 'mp4',
'description': 'Happened on 27.7.2014. \r\nAt 0:53 you can see people still swimming at near beach.',
'uploader': 'bony333',
- 'title': 'Crazy Hungarian tourist films close call waterspout in Croatia'
+ 'title': 'Crazy Hungarian tourist films close call waterspout in Croatia',
+ 'thumbnail': 're:^https?://.*\.jpg$'
}
}]
+ @staticmethod
+ def _extract_url(webpage):
+ mobj = re.search(
+ r'<iframe[^>]+src="https?://(?:\w+\.)?liveleak\.com/ll_embed\?(?:.*?)i=(?P<id>[\w_]+)(?:.*)',
+ webpage)
+ if mobj:
+ return 'http://www.liveleak.com/view?i=%s' % mobj.group('id')
+
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
@@ -64,6 +75,7 @@ class LiveLeakIE(InfoExtractor):
age_limit = int_or_none(self._search_regex(
r'you confirm that you are ([0-9]+) years and over.',
webpage, 'age limit', default=None))
+ video_thumbnail = self._og_search_thumbnail(webpage)
sources_raw = self._search_regex(
r'(?s)sources:\s*(\[.*?\]),', webpage, 'video URLs', default=None)
@@ -116,4 +128,5 @@ class LiveLeakIE(InfoExtractor):
'uploader': video_uploader,
'formats': formats,
'age_limit': age_limit,
+ 'thumbnail': video_thumbnail,
}
diff --git a/youtube_dl/extractor/livestream.py b/youtube_dl/extractor/livestream.py
index 38fb3d9..bc7894b 100644
--- a/youtube_dl/extractor/livestream.py
+++ b/youtube_dl/extractor/livestream.py
@@ -14,6 +14,7 @@ from ..utils import (
xpath_with_ns,
xpath_text,
orderedSet,
+ update_url_query,
int_or_none,
float_or_none,
parse_iso8601,
@@ -64,7 +65,7 @@ class LivestreamIE(InfoExtractor):
def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
base_ele = find_xpath_attr(
smil, self._xpath_ns('.//meta', namespace), 'name', 'httpBase')
- base = base_ele.get('content') if base_ele else 'http://livestreamvod-f.akamaihd.net/'
+ base = base_ele.get('content') if base_ele is not None else 'http://livestreamvod-f.akamaihd.net/'
formats = []
video_nodes = smil.findall(self._xpath_ns('.//video', namespace))
@@ -72,7 +73,10 @@ class LivestreamIE(InfoExtractor):
for vn in video_nodes:
tbr = int_or_none(vn.attrib.get('system-bitrate'), 1000)
furl = (
- '%s%s?v=3.0.3&fp=WIN%%2014,0,0,145' % (base, vn.attrib['src']))
+ update_url_query(compat_urlparse.urljoin(base, vn.attrib['src']), {
+ 'v': '3.0.3',
+ 'fp': 'WIN% 14,0,0,145',
+ }))
if 'clipBegin' in vn.attrib:
furl += '&ssek=' + vn.attrib['clipBegin']
formats.append({
@@ -146,7 +150,7 @@ class LivestreamIE(InfoExtractor):
}
def _extract_stream_info(self, stream_info):
- broadcast_id = stream_info['broadcast_id']
+ broadcast_id = compat_str(stream_info['broadcast_id'])
is_live = stream_info.get('is_live')
formats = []
@@ -199,9 +203,10 @@ class LivestreamIE(InfoExtractor):
if not videos_info:
break
for v in videos_info:
+ v_id = compat_str(v['id'])
entries.append(self.url_result(
- 'http://livestream.com/accounts/%s/events/%s/videos/%s' % (account_id, event_id, v['id']),
- 'Livestream', v['id'], v['caption']))
+ 'http://livestream.com/accounts/%s/events/%s/videos/%s' % (account_id, event_id, v_id),
+ 'Livestream', v_id, v.get('caption')))
last_video = videos_info[-1]['id']
return self.playlist_result(entries, event_id, event_data['full_name'])
diff --git a/youtube_dl/extractor/localnews8.py b/youtube_dl/extractor/localnews8.py
new file mode 100644
index 0000000..aad3961
--- /dev/null
+++ b/youtube_dl/extractor/localnews8.py
@@ -0,0 +1,47 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class LocalNews8IE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?localnews8\.com/(?:[^/]+/)*(?P<display_id>[^/]+)/(?P<id>[0-9]+)'
+ _TEST = {
+ 'url': 'http://www.localnews8.com/news/rexburg-business-turns-carbon-fiber-scraps-into-wedding-rings/35183304',
+ 'md5': 'be4d48aea61aa2bde7be2ee47691ad20',
+ 'info_dict': {
+ 'id': '35183304',
+ 'display_id': 'rexburg-business-turns-carbon-fiber-scraps-into-wedding-rings',
+ 'ext': 'mp4',
+ 'title': 'Rexburg business turns carbon fiber scraps into wedding ring',
+ 'description': 'The process was first invented by Lamborghini and less than a dozen companies around the world use it.',
+ 'duration': 153,
+ 'timestamp': 1441844822,
+ 'upload_date': '20150910',
+ 'uploader_id': 'api',
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+ display_id = mobj.group('display_id')
+
+ webpage = self._download_webpage(url, display_id)
+
+ partner_id = self._search_regex(
+ r'partnerId\s*[:=]\s*(["\'])(?P<id>\d+)\1',
+ webpage, 'partner id', group='id')
+ kaltura_id = self._search_regex(
+ r'videoIdString\s*[:=]\s*(["\'])kaltura:(?P<id>[0-9a-z_]+)\1',
+ webpage, 'videl id', group='id')
+
+ return {
+ '_type': 'url_transparent',
+ 'url': 'kaltura:%s:%s' % (partner_id, kaltura_id),
+ 'ie_key': 'Kaltura',
+ 'id': video_id,
+ 'display_id': display_id,
+ }
diff --git a/youtube_dl/extractor/lrt.py b/youtube_dl/extractor/lrt.py
index 863efd8..1072405 100644
--- a/youtube_dl/extractor/lrt.py
+++ b/youtube_dl/extractor/lrt.py
@@ -37,6 +37,7 @@ class LRTIE(InfoExtractor):
r'file\s*:\s*(["\'])(?P<url>.+?)\1\s*\+\s*location\.hash\.substring\(1\)',
webpage, 'm3u8 url', group='url')
formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4')
+ self._sort_formats(formats)
thumbnail = self._og_search_thumbnail(webpage)
description = self._og_search_description(webpage)
diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py
index d4e1ae9..2d50400 100644
--- a/youtube_dl/extractor/lynda.py
+++ b/youtube_dl/extractor/lynda.py
@@ -1,102 +1,100 @@
from __future__ import unicode_literals
import re
-import json
from .common import InfoExtractor
from ..compat import (
+ compat_HTTPError,
compat_str,
- compat_urllib_parse,
+ compat_urlparse,
)
from ..utils import (
ExtractorError,
- clean_html,
int_or_none,
- sanitized_Request,
+ urlencode_postdata,
)
class LyndaBaseIE(InfoExtractor):
- _LOGIN_URL = 'https://www.lynda.com/login/login.aspx'
+ _SIGNIN_URL = 'https://www.lynda.com/signin'
+ _PASSWORD_URL = 'https://www.lynda.com/signin/password'
+ _USER_URL = 'https://www.lynda.com/signin/user'
_ACCOUNT_CREDENTIALS_HINT = 'Use --username and --password options to provide lynda.com account credentials.'
_NETRC_MACHINE = 'lynda'
def _real_initialize(self):
self._login()
+ @staticmethod
+ def _check_error(json_string, key_or_keys):
+ keys = [key_or_keys] if isinstance(key_or_keys, compat_str) else key_or_keys
+ for key in keys:
+ error = json_string.get(key)
+ if error:
+ raise ExtractorError('Unable to login: %s' % error, expected=True)
+
+ def _login_step(self, form_html, fallback_action_url, extra_form_data, note, referrer_url):
+ action_url = self._search_regex(
+ r'<form[^>]+action=(["\'])(?P<url>.+?)\1', form_html,
+ 'post url', default=fallback_action_url, group='url')
+
+ if not action_url.startswith('http'):
+ action_url = compat_urlparse.urljoin(self._SIGNIN_URL, action_url)
+
+ form_data = self._hidden_inputs(form_html)
+ form_data.update(extra_form_data)
+
+ try:
+ response = self._download_json(
+ action_url, None, note,
+ data=urlencode_postdata(form_data),
+ headers={
+ 'Referer': referrer_url,
+ 'X-Requested-With': 'XMLHttpRequest',
+ })
+ except ExtractorError as e:
+ if isinstance(e.cause, compat_HTTPError) and e.cause.code == 500:
+ response = self._parse_json(e.cause.read().decode('utf-8'), None)
+ self._check_error(response, ('email', 'password'))
+ raise
+
+ self._check_error(response, 'ErrorMessage')
+
+ return response, action_url
+
def _login(self):
username, password = self._get_login_info()
if username is None:
return
- login_form = {
- 'username': username.encode('utf-8'),
- 'password': password.encode('utf-8'),
- 'remember': 'false',
- 'stayPut': 'false'
- }
- request = sanitized_Request(
- self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8'))
- login_page = self._download_webpage(
- request, None, 'Logging in as %s' % username)
-
- # Not (yet) logged in
- m = re.search(r'loginResultJson\s*=\s*\'(?P<json>[^\']+)\';', login_page)
- if m is not None:
- response = m.group('json')
- response_json = json.loads(response)
- state = response_json['state']
-
- if state == 'notlogged':
- raise ExtractorError(
- 'Unable to login, incorrect username and/or password',
- expected=True)
-
- # This is when we get popup:
- # > You're already logged in to lynda.com on two devices.
- # > If you log in here, we'll log you out of another device.
- # So, we need to confirm this.
- if state == 'conflicted':
- confirm_form = {
- 'username': '',
- 'password': '',
- 'resolve': 'true',
- 'remember': 'false',
- 'stayPut': 'false',
- }
- request = sanitized_Request(
- self._LOGIN_URL, compat_urllib_parse.urlencode(confirm_form).encode('utf-8'))
- login_page = self._download_webpage(
- request, None,
- 'Confirming log in and log out from another device')
-
- if all(not re.search(p, login_page) for p in ('isLoggedIn\s*:\s*true', r'logout\.aspx', r'>Log out<')):
- if 'login error' in login_page:
- mobj = re.search(
- r'(?s)<h1[^>]+class="topmost">(?P<title>[^<]+)</h1>\s*<div>(?P<description>.+?)</div>',
- login_page)
- if mobj:
- raise ExtractorError(
- 'lynda returned error: %s - %s'
- % (mobj.group('title'), clean_html(mobj.group('description'))),
- expected=True)
- raise ExtractorError('Unable to log in')
-
- def _logout(self):
- username, _ = self._get_login_info()
- if username is None:
+ # Step 1: download signin page
+ signin_page = self._download_webpage(
+ self._SIGNIN_URL, None, 'Downloading signin page')
+
+ # Already logged in
+ if any(re.search(p, signin_page) for p in (
+ 'isLoggedIn\s*:\s*true', r'logout\.aspx', r'>Log out<')):
return
- self._download_webpage(
- 'http://www.lynda.com/ajax/logout.aspx', None,
- 'Logging out', 'Unable to log out', fatal=False)
+ # Step 2: submit email
+ signin_form = self._search_regex(
+ r'(?s)(<form[^>]+data-form-name=["\']signin["\'][^>]*>.+?</form>)',
+ signin_page, 'signin form')
+ signin_page, signin_url = self._login_step(
+ signin_form, self._PASSWORD_URL, {'email': username},
+ 'Submitting email', self._SIGNIN_URL)
+
+ # Step 3: submit password
+ password_form = signin_page['body']
+ self._login_step(
+ password_form, self._USER_URL, {'email': username, 'password': password},
+ 'Submitting password', signin_url)
class LyndaIE(LyndaBaseIE):
IE_NAME = 'lynda'
IE_DESC = 'lynda.com videos'
_VALID_URL = r'https?://www\.lynda\.com/(?:[^/]+/[^/]+/\d+|player/embed)/(?P<id>\d+)'
- _NETRC_MACHINE = 'lynda'
_TIMECODE_REGEX = r'\[(?P<timecode>\d+:\d+:\d+[\.,]\d+)\]'
@@ -214,14 +212,12 @@ class LyndaCourseIE(LyndaBaseIE):
'http://www.lynda.com/ajax/player?courseId=%s&type=course' % course_id,
course_id, 'Downloading course JSON')
- self._logout()
-
if course.get('Status') == 'NotFound':
raise ExtractorError(
'Course %s does not exist' % course_id, expected=True)
unaccessible_videos = 0
- videos = []
+ entries = []
# Might want to extract videos right here from video['Formats'] as it seems 'Formats' is not provided
# by single video API anymore
@@ -231,20 +227,23 @@ class LyndaCourseIE(LyndaBaseIE):
if video.get('HasAccess') is False:
unaccessible_videos += 1
continue
- if video.get('ID'):
- videos.append(video['ID'])
+ video_id = video.get('ID')
+ if video_id:
+ entries.append({
+ '_type': 'url_transparent',
+ 'url': 'http://www.lynda.com/%s/%s-4.html' % (course_path, video_id),
+ 'ie_key': LyndaIE.ie_key(),
+ 'chapter': chapter.get('Title'),
+ 'chapter_number': int_or_none(chapter.get('ChapterIndex')),
+ 'chapter_id': compat_str(chapter.get('ID')),
+ })
if unaccessible_videos > 0:
self._downloader.report_warning(
'%s videos are only available for members (or paid members) and will not be downloaded. '
% unaccessible_videos + self._ACCOUNT_CREDENTIALS_HINT)
- entries = [
- self.url_result(
- 'http://www.lynda.com/%s/%s-4.html' % (course_path, video_id),
- 'Lynda')
- for video_id in videos]
-
course_title = course.get('Title')
+ course_description = course.get('Description')
- return self.playlist_result(entries, course_id, course_title)
+ return self.playlist_result(entries, course_id, course_title, course_description)
diff --git a/youtube_dl/extractor/m6.py b/youtube_dl/extractor/m6.py
index 7e02583..d5945ad 100644
--- a/youtube_dl/extractor/m6.py
+++ b/youtube_dl/extractor/m6.py
@@ -8,7 +8,7 @@ from .common import InfoExtractor
class M6IE(InfoExtractor):
IE_NAME = 'm6'
- _VALID_URL = r'http://(?:www\.)?m6\.fr/[^/]+/videos/(?P<id>\d+)-[^\.]+\.html'
+ _VALID_URL = r'https?://(?:www\.)?m6\.fr/[^/]+/videos/(?P<id>\d+)-[^\.]+\.html'
_TEST = {
'url': 'http://www.m6.fr/emission-les_reines_du_shopping/videos/11323908-emeline_est_la_reine_du_shopping_sur_le_theme_ma_fete_d_8217_anniversaire.html',
diff --git a/youtube_dl/extractor/mailru.py b/youtube_dl/extractor/mailru.py
index 71085f2..9a7098c 100644
--- a/youtube_dl/extractor/mailru.py
+++ b/youtube_dl/extractor/mailru.py
@@ -13,7 +13,7 @@ from ..utils import (
class MailRuIE(InfoExtractor):
IE_NAME = 'mailru'
IE_DESC = 'Видео@Mail.Ru'
- _VALID_URL = r'http://(?:www\.)?my\.mail\.ru/(?:video/.*#video=/?(?P<idv1>(?:[^/]+/){3}\d+)|(?:(?P<idv2prefix>(?:[^/]+/){2})video/(?P<idv2suffix>[^/]+/\d+))\.html)'
+ _VALID_URL = r'https?://(?:(?:www|m)\.)?my\.mail\.ru/(?:video/.*#video=/?(?P<idv1>(?:[^/]+/){3}\d+)|(?:(?P<idv2prefix>(?:[^/]+/){2})video/(?P<idv2suffix>[^/]+/\d+))\.html)'
_TESTS = [
{
@@ -61,6 +61,10 @@ class MailRuIE(InfoExtractor):
'duration': 6001,
},
'skip': 'Not accessible from Travis CI server',
+ },
+ {
+ 'url': 'http://m.my.mail.ru/mail/3sktvtr/video/_myvideo/138.html',
+ 'only_matching': True,
}
]
diff --git a/youtube_dl/extractor/makerschannel.py b/youtube_dl/extractor/makerschannel.py
new file mode 100644
index 0000000..f5d00e6
--- /dev/null
+++ b/youtube_dl/extractor/makerschannel.py
@@ -0,0 +1,40 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class MakersChannelIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?makerschannel\.com/.*(?P<id_type>video|production)_id=(?P<id>[0-9]+)'
+ _TEST = {
+ 'url': 'http://makerschannel.com/en/zoomin/community-highlights?video_id=849',
+ 'md5': '624a512c6969236b5967bf9286345ad1',
+ 'info_dict': {
+ 'id': '849',
+ 'ext': 'mp4',
+ 'title': 'Landing a bus on a plane is an epic win',
+ 'uploader': 'ZoomIn',
+ 'description': 'md5:cd9cca2ea7b69b78be81d07020c97139',
+ }
+ }
+
+ def _real_extract(self, url):
+ id_type, url_id = re.match(self._VALID_URL, url).groups()
+ webpage = self._download_webpage(url, url_id)
+ video_data = self._html_search_regex(r'<div([^>]+data-%s-id="%s"[^>]+)>' % (id_type, url_id), webpage, 'video data')
+
+ def extract_data_val(attr, fatal=False):
+ return self._html_search_regex(r'data-%s\s*=\s*"([^"]+)"' % attr, video_data, attr, fatal=fatal)
+ minoto_id = self._search_regex(r'/id/([a-zA-Z0-9]+)', extract_data_val('video-src', True), 'minoto id')
+
+ return {
+ '_type': 'url_transparent',
+ 'url': 'minoto:%s' % minoto_id,
+ 'id': extract_data_val('video-id', True),
+ 'title': extract_data_val('title', True),
+ 'description': extract_data_val('description'),
+ 'thumbnail': extract_data_val('image'),
+ 'uploader': extract_data_val('channel'),
+ }
diff --git a/youtube_dl/extractor/malemotion.py b/youtube_dl/extractor/malemotion.py
deleted file mode 100644
index 92511a6..0000000
--- a/youtube_dl/extractor/malemotion.py
+++ /dev/null
@@ -1,46 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-from .common import InfoExtractor
-from ..compat import compat_urllib_parse_unquote
-
-
-class MalemotionIE(InfoExtractor):
- _VALID_URL = r'https?://malemotion\.com/video/(.+?)\.(?P<id>.+?)(#|$)'
- _TEST = {
- 'url': 'http://malemotion.com/video/bete-de-concours.ltc',
- 'md5': '3013e53a0afbde2878bc39998c33e8a5',
- 'info_dict': {
- 'id': 'ltc',
- 'ext': 'mp4',
- 'title': 'Bête de Concours',
- 'age_limit': 18,
- },
- }
-
- def _real_extract(self, url):
- video_id = self._match_id(url)
- webpage = self._download_webpage(url, video_id)
-
- video_url = compat_urllib_parse_unquote(self._search_regex(
- r'<source type="video/mp4" src="(.+?)"', webpage, 'video URL'))
- video_title = self._html_search_regex(
- r'<title>(.*?)</title', webpage, 'title')
- video_thumbnail = self._search_regex(
- r'<video .+?poster="(.+?)"', webpage, 'thumbnail', fatal=False)
-
- formats = [{
- 'url': video_url,
- 'ext': 'mp4',
- 'format_id': 'mp4',
- 'preference': 1,
- }]
- self._sort_formats(formats)
-
- return {
- 'id': video_id,
- 'formats': formats,
- 'title': video_title,
- 'thumbnail': video_thumbnail,
- 'age_limit': 18,
- }
diff --git a/youtube_dl/extractor/matchtv.py b/youtube_dl/extractor/matchtv.py
index 28e0dfe..33b0b53 100644
--- a/youtube_dl/extractor/matchtv.py
+++ b/youtube_dl/extractor/matchtv.py
@@ -4,16 +4,12 @@ from __future__ import unicode_literals
import random
from .common import InfoExtractor
-from ..compat import compat_urllib_parse
-from ..utils import (
- sanitized_Request,
- xpath_text,
-)
+from ..utils import xpath_text
class MatchTVIE(InfoExtractor):
- _VALID_URL = r'https?://matchtv\.ru/?#live-player'
- _TEST = {
+ _VALID_URL = r'https?://matchtv\.ru(?:/on-air|/?#live-player)'
+ _TESTS = [{
'url': 'http://matchtv.ru/#live-player',
'info_dict': {
'id': 'matchtv-live',
@@ -24,12 +20,16 @@ class MatchTVIE(InfoExtractor):
'params': {
'skip_download': True,
},
- }
+ }, {
+ 'url': 'http://matchtv.ru/on-air/',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
video_id = 'matchtv-live'
- request = sanitized_Request(
- 'http://player.matchtv.ntvplus.tv/player/smil?%s' % compat_urllib_parse.urlencode({
+ video_url = self._download_json(
+ 'http://player.matchtv.ntvplus.tv/player/smil', video_id,
+ query={
'ts': '',
'quality': 'SD',
'contentId': '561d2c0df7159b37178b4567',
@@ -40,13 +40,13 @@ class MatchTVIE(InfoExtractor):
'contentType': 'channel',
'timeShift': '0',
'platform': 'portal',
- }),
+ },
headers={
'Referer': 'http://player.matchtv.ntvplus.tv/embed-player/NTVEmbedPlayer.swf',
- })
- video_url = self._download_json(request, video_id)['data']['videoUrl']
+ })['data']['videoUrl']
f4m_url = xpath_text(self._download_xml(video_url, video_id), './to')
formats = self._extract_f4m_formats(f4m_url, video_id)
+ self._sort_formats(formats)
return {
'id': video_id,
'title': self._live_title('Матч ТВ - Прямой эфир'),
diff --git a/youtube_dl/extractor/mdr.py b/youtube_dl/extractor/mdr.py
index 425fc9e..2100583 100644
--- a/youtube_dl/extractor/mdr.py
+++ b/youtube_dl/extractor/mdr.py
@@ -14,7 +14,7 @@ from ..utils import (
class MDRIE(InfoExtractor):
IE_DESC = 'MDR.DE and KiKA'
- _VALID_URL = r'https?://(?:www\.)?(?:mdr|kika)\.de/(?:.*)/[a-z]+(?P<id>\d+)(?:_.+?)?\.html'
+ _VALID_URL = r'https?://(?:www\.)?(?:mdr|kika)\.de/(?:.*)/[a-z]+-?(?P<id>\d+)(?:_.+?)?\.html'
_TESTS = [{
# MDR regularly deletes its videos
@@ -49,8 +49,8 @@ class MDRIE(InfoExtractor):
'ext': 'mp4',
'title': 'Beutolomäus und der geheime Weihnachtswunsch',
'description': 'md5:b69d32d7b2c55cbe86945ab309d39bbd',
- 'timestamp': 1419047100,
- 'upload_date': '20141220',
+ 'timestamp': 1450950000,
+ 'upload_date': '20151224',
'duration': 4628,
'uploader': 'KIKA',
},
@@ -60,6 +60,9 @@ class MDRIE(InfoExtractor):
}, {
'url': 'http://www.kika.de/sendungen/einzelsendungen/weihnachtsprogramm/einzelsendung2534.html',
'only_matching': True,
+ }, {
+ 'url': 'http://www.mdr.de/mediathek/mdr-videos/a/video-1334.html',
+ 'only_matching': True,
}]
def _real_extract(self, url):
@@ -68,8 +71,8 @@ class MDRIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
data_url = self._search_regex(
- r'dataURL\s*:\s*(["\'])(?P<url>/.+/(?:video|audio)[0-9]+-avCustom\.xml)\1',
- webpage, 'data url', group='url')
+ r'(?:dataURL|playerXml(?:["\'])?)\s*:\s*(["\'])(?P<url>.+/(?:video|audio)-?[0-9]+-avCustom\.xml)\1',
+ webpage, 'data url', group='url').replace('\/', '/')
doc = self._download_xml(
compat_urlparse.urljoin(url, data_url), video_id)
diff --git a/youtube_dl/extractor/metacafe.py b/youtube_dl/extractor/metacafe.py
index 67d6271..b6f00cc 100644
--- a/youtube_dl/extractor/metacafe.py
+++ b/youtube_dl/extractor/metacafe.py
@@ -5,7 +5,6 @@ import re
from .common import InfoExtractor
from ..compat import (
compat_parse_qs,
- compat_urllib_parse,
compat_urllib_parse_unquote,
)
from ..utils import (
@@ -13,11 +12,12 @@ from ..utils import (
ExtractorError,
int_or_none,
sanitized_Request,
+ urlencode_postdata,
)
class MetacafeIE(InfoExtractor):
- _VALID_URL = r'http://(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
+ _VALID_URL = r'https?://(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
_DISCLAIMER = 'http://www.metacafe.com/family_filter/'
_FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
IE_NAME = 'metacafe'
@@ -81,6 +81,9 @@ class MetacafeIE(InfoExtractor):
'title': 'Open: This is Face the Nation, February 9',
'description': 'md5:8a9ceec26d1f7ed6eab610834cc1a476',
'duration': 96,
+ 'uploader': 'CBSI-NEW',
+ 'upload_date': '20140209',
+ 'timestamp': 1391959800,
},
'params': {
# rtmp download
@@ -117,7 +120,7 @@ class MetacafeIE(InfoExtractor):
'filters': '0',
'submit': "Continue - I'm over 18",
}
- request = sanitized_Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
+ request = sanitized_Request(self._FILTER_POST, urlencode_postdata(disclaimer_form))
request.add_header('Content-Type', 'application/x-www-form-urlencoded')
self.report_age_confirmation()
self._download_webpage(request, None, False, 'Unable to confirm age')
diff --git a/youtube_dl/extractor/metacritic.py b/youtube_dl/extractor/metacritic.py
index e303205..444ec03 100644
--- a/youtube_dl/extractor/metacritic.py
+++ b/youtube_dl/extractor/metacritic.py
@@ -11,7 +11,7 @@ from ..utils import (
class MetacriticIE(InfoExtractor):
_VALID_URL = r'https?://www\.metacritic\.com/.+?/trailers/(?P<id>\d+)'
- _TEST = {
+ _TESTS = [{
'url': 'http://www.metacritic.com/game/playstation-4/infamous-second-son/trailers/3698222',
'info_dict': {
'id': '3698222',
@@ -20,7 +20,17 @@ class MetacriticIE(InfoExtractor):
'description': 'Take a peak behind-the-scenes to see how Sucker Punch brings smoke into the universe of inFAMOUS Second Son on the PS4.',
'duration': 221,
},
- }
+ 'skip': 'Not providing trailers anymore',
+ }, {
+ 'url': 'http://www.metacritic.com/game/playstation-4/tales-from-the-borderlands-a-telltale-game-series/trailers/5740315',
+ 'info_dict': {
+ 'id': '5740315',
+ 'ext': 'mp4',
+ 'title': 'Tales from the Borderlands - Finale: The Vault of the Traveler',
+ 'description': 'In the final episode of the season, all hell breaks loose. Jack is now in control of Helios\' systems, and he\'s ready to reclaim his rightful place as king of Hyperion (with or without you).',
+ 'duration': 114,
+ },
+ }]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
diff --git a/youtube_dl/extractor/mgtv.py b/youtube_dl/extractor/mgtv.py
new file mode 100644
index 0000000..9fbc74f
--- /dev/null
+++ b/youtube_dl/extractor/mgtv.py
@@ -0,0 +1,64 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import int_or_none
+
+
+class MGTVIE(InfoExtractor):
+ _VALID_URL = r'https?://www\.mgtv\.com/v/(?:[^/]+/)*(?P<id>\d+)\.html'
+ IE_DESC = '芒果TV'
+
+ _TEST = {
+ 'url': 'http://www.mgtv.com/v/1/290525/f/3116640.html',
+ 'md5': '1bdadcf760a0b90946ca68ee9a2db41a',
+ 'info_dict': {
+ 'id': '3116640',
+ 'ext': 'mp4',
+ 'title': '我是歌手第四季双年巅峰会:韩红李玟“双王”领军对抗',
+ 'description': '我是歌手第四季双年巅峰会',
+ 'duration': 7461,
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ },
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ api_data = self._download_json(
+ 'http://v.api.mgtv.com/player/video', video_id,
+ query={'video_id': video_id})['data']
+ info = api_data['info']
+
+ formats = []
+ for idx, stream in enumerate(api_data['stream']):
+ stream_url = stream.get('url')
+ if not stream_url:
+ continue
+ tbr = int_or_none(self._search_regex(
+ r'(\d+)\.mp4', stream_url, 'tbr', default=None))
+
+ def extract_format(stream_url, format_id, idx, query={}):
+ format_info = self._download_json(
+ stream_url, video_id,
+ note='Download video info for format %s' % format_id or '#%d' % idx, query=query)
+ return {
+ 'format_id': format_id,
+ 'url': format_info['info'],
+ 'ext': 'mp4',
+ 'tbr': tbr,
+ }
+
+ formats.append(extract_format(
+ stream_url, 'hls-%d' % tbr if tbr else None, idx * 2))
+ formats.append(extract_format(stream_url.replace(
+ '/playlist.m3u8', ''), 'http-%d' % tbr if tbr else None, idx * 2 + 1, {'pno': 1031}))
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': info['title'].strip(),
+ 'formats': formats,
+ 'description': info.get('desc'),
+ 'duration': int_or_none(info.get('duration')),
+ 'thumbnail': info.get('thumb'),
+ }
diff --git a/youtube_dl/extractor/microsoftvirtualacademy.py b/youtube_dl/extractor/microsoftvirtualacademy.py
new file mode 100644
index 0000000..afd3e98
--- /dev/null
+++ b/youtube_dl/extractor/microsoftvirtualacademy.py
@@ -0,0 +1,192 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+ compat_xpath,
+)
+from ..utils import (
+ int_or_none,
+ parse_duration,
+ smuggle_url,
+ unsmuggle_url,
+ xpath_text,
+)
+
+
+class MicrosoftVirtualAcademyBaseIE(InfoExtractor):
+ def _extract_base_url(self, course_id, display_id):
+ return self._download_json(
+ 'https://api-mlxprod.microsoft.com/services/products/anonymous/%s' % course_id,
+ display_id, 'Downloading course base URL')
+
+ def _extract_chapter_and_title(self, title):
+ if not title:
+ return None, None
+ m = re.search(r'(?P<chapter>\d+)\s*\|\s*(?P<title>.+)', title)
+ return (int(m.group('chapter')), m.group('title')) if m else (None, title)
+
+
+class MicrosoftVirtualAcademyIE(MicrosoftVirtualAcademyBaseIE):
+ IE_NAME = 'mva'
+ IE_DESC = 'Microsoft Virtual Academy videos'
+ _VALID_URL = r'(?:%s:|https?://(?:mva\.microsoft|(?:www\.)?microsoftvirtualacademy)\.com/[^/]+/training-courses/[^/?#&]+-)(?P<course_id>\d+)(?::|\?l=)(?P<id>[\da-zA-Z]+_\d+)' % IE_NAME
+
+ _TESTS = [{
+ 'url': 'https://mva.microsoft.com/en-US/training-courses/microsoft-azure-fundamentals-virtual-machines-11788?l=gfVXISmEB_6804984382',
+ 'md5': '7826c44fc31678b12ad8db11f6b5abb9',
+ 'info_dict': {
+ 'id': 'gfVXISmEB_6804984382',
+ 'ext': 'mp4',
+ 'title': 'Course Introduction',
+ 'formats': 'mincount:3',
+ 'subtitles': {
+ 'en': [{
+ 'ext': 'ttml',
+ }],
+ },
+ }
+ }, {
+ 'url': 'mva:11788:gfVXISmEB_6804984382',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ url, smuggled_data = unsmuggle_url(url, {})
+
+ mobj = re.match(self._VALID_URL, url)
+ course_id = mobj.group('course_id')
+ video_id = mobj.group('id')
+
+ base_url = smuggled_data.get('base_url') or self._extract_base_url(course_id, video_id)
+
+ settings = self._download_xml(
+ '%s/content/content_%s/videosettings.xml?v=1' % (base_url, video_id),
+ video_id, 'Downloading video settings XML')
+
+ _, title = self._extract_chapter_and_title(xpath_text(
+ settings, './/Title', 'title', fatal=True))
+
+ formats = []
+
+ for sources in settings.findall(compat_xpath('.//MediaSources')):
+ if sources.get('videoType') == 'smoothstreaming':
+ continue
+ for source in sources.findall(compat_xpath('./MediaSource')):
+ video_url = source.text
+ if not video_url or not video_url.startswith('http'):
+ continue
+ video_mode = source.get('videoMode')
+ height = int_or_none(self._search_regex(
+ r'^(\d+)[pP]$', video_mode or '', 'height', default=None))
+ codec = source.get('codec')
+ acodec, vcodec = [None] * 2
+ if codec:
+ codecs = codec.split(',')
+ if len(codecs) == 2:
+ acodec, vcodec = codecs
+ elif len(codecs) == 1:
+ vcodec = codecs[0]
+ formats.append({
+ 'url': video_url,
+ 'format_id': video_mode,
+ 'height': height,
+ 'acodec': acodec,
+ 'vcodec': vcodec,
+ })
+ self._sort_formats(formats)
+
+ subtitles = {}
+ for source in settings.findall(compat_xpath('.//MarkerResourceSource')):
+ subtitle_url = source.text
+ if not subtitle_url:
+ continue
+ subtitles.setdefault('en', []).append({
+ 'url': '%s/%s' % (base_url, subtitle_url),
+ 'ext': source.get('type'),
+ })
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'subtitles': subtitles,
+ 'formats': formats
+ }
+
+
+class MicrosoftVirtualAcademyCourseIE(MicrosoftVirtualAcademyBaseIE):
+ IE_NAME = 'mva:course'
+ IE_DESC = 'Microsoft Virtual Academy courses'
+ _VALID_URL = r'(?:%s:|https?://(?:mva\.microsoft|(?:www\.)?microsoftvirtualacademy)\.com/[^/]+/training-courses/(?P<display_id>[^/?#&]+)-)(?P<id>\d+)' % IE_NAME
+
+ _TESTS = [{
+ 'url': 'https://mva.microsoft.com/en-US/training-courses/microsoft-azure-fundamentals-virtual-machines-11788',
+ 'info_dict': {
+ 'id': '11788',
+ 'title': 'Microsoft Azure Fundamentals: Virtual Machines',
+ },
+ 'playlist_count': 36,
+ }, {
+ # with emphasized chapters
+ 'url': 'https://mva.microsoft.com/en-US/training-courses/developing-windows-10-games-with-construct-2-16335',
+ 'info_dict': {
+ 'id': '16335',
+ 'title': 'Developing Windows 10 Games with Construct 2',
+ },
+ 'playlist_count': 10,
+ }, {
+ 'url': 'https://www.microsoftvirtualacademy.com/en-US/training-courses/microsoft-azure-fundamentals-virtual-machines-11788',
+ 'only_matching': True,
+ }, {
+ 'url': 'mva:course:11788',
+ 'only_matching': True,
+ }]
+
+ @classmethod
+ def suitable(cls, url):
+ return False if MicrosoftVirtualAcademyIE.suitable(url) else super(
+ MicrosoftVirtualAcademyCourseIE, cls).suitable(url)
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ course_id = mobj.group('id')
+ display_id = mobj.group('display_id')
+
+ base_url = self._extract_base_url(course_id, display_id)
+
+ manifest = self._download_json(
+ '%s/imsmanifestlite.json' % base_url,
+ display_id, 'Downloading course manifest JSON')['manifest']
+
+ organization = manifest['organizations']['organization'][0]
+
+ entries = []
+ for chapter in organization['item']:
+ chapter_number, chapter_title = self._extract_chapter_and_title(chapter.get('title'))
+ chapter_id = chapter.get('@identifier')
+ for item in chapter.get('item', []):
+ item_id = item.get('@identifier')
+ if not item_id:
+ continue
+ metadata = item.get('resource', {}).get('metadata') or {}
+ if metadata.get('learningresourcetype') != 'Video':
+ continue
+ _, title = self._extract_chapter_and_title(item.get('title'))
+ duration = parse_duration(metadata.get('duration'))
+ description = metadata.get('description')
+ entries.append({
+ '_type': 'url_transparent',
+ 'url': smuggle_url(
+ 'mva:%s:%s' % (course_id, item_id), {'base_url': base_url}),
+ 'title': title,
+ 'description': description,
+ 'duration': duration,
+ 'chapter': chapter_title,
+ 'chapter_number': chapter_number,
+ 'chapter_id': chapter_id,
+ })
+
+ title = organization.get('title') or manifest.get('metadata', {}).get('title')
+
+ return self.playlist_result(entries, course_id, title)
diff --git a/youtube_dl/extractor/minhateca.py b/youtube_dl/extractor/minhateca.py
index e46b23a..e6730b7 100644
--- a/youtube_dl/extractor/minhateca.py
+++ b/youtube_dl/extractor/minhateca.py
@@ -2,12 +2,12 @@
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..compat import compat_urllib_parse
from ..utils import (
int_or_none,
parse_duration,
parse_filesize,
sanitized_Request,
+ urlencode_postdata,
)
@@ -39,7 +39,7 @@ class MinhatecaIE(InfoExtractor):
]
req = sanitized_Request(
'http://minhateca.com.br/action/License/Download',
- data=compat_urllib_parse.urlencode(token_data))
+ data=urlencode_postdata(token_data))
req.add_header('Content-Type', 'application/x-www-form-urlencoded')
data = self._download_json(
req, video_id, note='Downloading metadata')
diff --git a/youtube_dl/extractor/ministrygrid.py b/youtube_dl/extractor/ministrygrid.py
index 949ad11..e48eba3 100644
--- a/youtube_dl/extractor/ministrygrid.py
+++ b/youtube_dl/extractor/ministrygrid.py
@@ -1,8 +1,5 @@
from __future__ import unicode_literals
-import json
-import re
-
from .common import InfoExtractor
from ..utils import (
ExtractorError,
@@ -20,21 +17,28 @@ class MinistryGridIE(InfoExtractor):
'id': '3453494717001',
'ext': 'mp4',
'title': 'The Gospel by Numbers',
+ 'thumbnail': 're:^https?://.*\.jpg',
+ 'upload_date': '20140410',
'description': 'Coming soon from T4G 2014!',
- 'uploader': 'LifeWay Christian Resources (MG)',
+ 'uploader_id': '2034960640001',
+ 'timestamp': 1397145591,
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
},
+ 'add_ie': ['TDSLifeway'],
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- portlets_json = self._search_regex(
- r'Liferay\.Portlet\.list=(\[.+?\])', webpage, 'portlet list')
- portlets = json.loads(portlets_json)
+ portlets = self._parse_json(self._search_regex(
+ r'Liferay\.Portlet\.list=(\[.+?\])', webpage, 'portlet list'),
+ video_id)
pl_id = self._search_regex(
- r'<!--\s*p_l_id - ([0-9]+)<br>', webpage, 'p_l_id')
+ r'getPlid:function\(\){return"(\d+)"}', webpage, 'p_l_id')
for i, portlet in enumerate(portlets):
portlet_url = 'http://www.ministrygrid.com/c/portal/render_portlet?p_l_id=%s&p_p_id=%s' % (pl_id, portlet)
@@ -46,12 +50,8 @@ class MinistryGridIE(InfoExtractor):
r'<iframe.*?src="([^"]+)"', portlet_code, 'video iframe',
default=None)
if video_iframe_url:
- surl = smuggle_url(
- video_iframe_url, {'force_videoid': video_id})
- return {
- '_type': 'url',
- 'id': video_id,
- 'url': surl,
- }
+ return self.url_result(
+ smuggle_url(video_iframe_url, {'force_videoid': video_id}),
+ video_id=video_id)
raise ExtractorError('Could not find video iframe in any portlets')
diff --git a/youtube_dl/extractor/minoto.py b/youtube_dl/extractor/minoto.py
new file mode 100644
index 0000000..959a105
--- /dev/null
+++ b/youtube_dl/extractor/minoto.py
@@ -0,0 +1,56 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import int_or_none
+
+
+class MinotoIE(InfoExtractor):
+ _VALID_URL = r'(?:minoto:|https?://(?:play|iframe|embed)\.minoto-video\.com/(?P<player_id>[0-9]+)/)(?P<id>[a-zA-Z0-9]+)'
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ player_id = mobj.group('player_id') or '1'
+ video_id = mobj.group('id')
+ video_data = self._download_json('http://play.minoto-video.com/%s/%s.js' % (player_id, video_id), video_id)
+ video_metadata = video_data['video-metadata']
+ formats = []
+ for fmt in video_data['video-files']:
+ fmt_url = fmt.get('url')
+ if not fmt_url:
+ continue
+ container = fmt.get('container')
+ if container == 'hls':
+ formats.extend(fmt_url, video_id, 'mp4', m3u8_id='hls', fatal=False)
+ else:
+ fmt_profile = fmt.get('profile') or {}
+ f = {
+ 'format_id': fmt_profile.get('name-short'),
+ 'format_note': fmt_profile.get('name'),
+ 'url': fmt_url,
+ 'container': container,
+ 'tbr': int_or_none(fmt.get('bitrate')),
+ 'filesize': int_or_none(fmt.get('filesize')),
+ 'width': int_or_none(fmt.get('width')),
+ 'height': int_or_none(fmt.get('height')),
+ }
+ codecs = fmt.get('codecs')
+ if codecs:
+ codecs = codecs.split(',')
+ if len(codecs) == 2:
+ f.update({
+ 'vcodec': codecs[0],
+ 'acodec': codecs[1],
+ })
+ formats.append(f)
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': video_metadata['title'],
+ 'description': video_metadata.get('description'),
+ 'thumbnail': video_metadata.get('video-poster', {}).get('url'),
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/mit.py b/youtube_dl/extractor/mit.py
index 29ca457..1aea78d 100644
--- a/youtube_dl/extractor/mit.py
+++ b/youtube_dl/extractor/mit.py
@@ -91,7 +91,7 @@ class MITIE(TechTVMITIE):
class OCWMITIE(InfoExtractor):
IE_NAME = 'ocw.mit.edu'
- _VALID_URL = r'^http://ocw\.mit\.edu/courses/(?P<topic>[a-z0-9\-]+)'
+ _VALID_URL = r'^https?://ocw\.mit\.edu/courses/(?P<topic>[a-z0-9\-]+)'
_BASE_URL = 'http://ocw.mit.edu/'
_TESTS = [
@@ -99,7 +99,7 @@ class OCWMITIE(InfoExtractor):
'url': 'http://ocw.mit.edu/courses/electrical-engineering-and-computer-science/6-041-probabilistic-systems-analysis-and-applied-probability-fall-2010/video-lectures/lecture-7-multiple-variables-expectations-independence/',
'info_dict': {
'id': 'EObHWIEKGjA',
- 'ext': 'mp4',
+ 'ext': 'webm',
'title': 'Lecture 7: Multiple Discrete Random Variables: Expectations, Conditioning, Independence',
'description': 'In this lecture, the professor discussed multiple random variables, expectations, and binomial distribution.',
'upload_date': '20121109',
diff --git a/youtube_dl/extractor/mitele.py b/youtube_dl/extractor/mitele.py
index c595f20..5a00cd3 100644
--- a/youtube_dl/extractor/mitele.py
+++ b/youtube_dl/extractor/mitele.py
@@ -1,33 +1,57 @@
+# coding: utf-8
from __future__ import unicode_literals
+import re
+
from .common import InfoExtractor
from ..compat import (
- compat_urllib_parse,
+ compat_urllib_parse_urlencode,
compat_urlparse,
)
from ..utils import (
- encode_dict,
get_element_by_attribute,
int_or_none,
+ remove_start,
)
class MiTeleIE(InfoExtractor):
IE_DESC = 'mitele.es'
- _VALID_URL = r'http://www\.mitele\.es/[^/]+/[^/]+/[^/]+/(?P<id>[^/]+)/'
+ _VALID_URL = r'https?://www\.mitele\.es/[^/]+/[^/]+/[^/]+/(?P<id>[^/]+)/'
_TESTS = [{
'url': 'http://www.mitele.es/programas-tv/diario-de/la-redaccion/programa-144/',
- 'md5': '0ff1a13aebb35d9bc14081ff633dd324',
+ # MD5 is unstable
'info_dict': {
'id': '0NF1jJnxS1Wu3pHrmvFyw2',
'display_id': 'programa-144',
'ext': 'flv',
'title': 'Tor, la web invisible',
'description': 'md5:3b6fce7eaa41b2d97358726378d9369f',
+ 'series': 'Diario de',
+ 'season': 'La redacción',
+ 'episode': 'Programa 144',
'thumbnail': 're:(?i)^https?://.*\.jpg$',
'duration': 2913,
},
+ }, {
+ # no explicit title
+ 'url': 'http://www.mitele.es/programas-tv/cuarto-milenio/temporada-6/programa-226/',
+ 'info_dict': {
+ 'id': 'eLZSwoEd1S3pVyUm8lc6F',
+ 'display_id': 'programa-226',
+ 'ext': 'flv',
+ 'title': 'Cuarto Milenio - Temporada 6 - Programa 226',
+ 'description': 'md5:50daf9fadefa4e62d9fc866d0c015701',
+ 'series': 'Cuarto Milenio',
+ 'season': 'Temporada 6',
+ 'episode': 'Programa 226',
+ 'thumbnail': 're:(?i)^https?://.*\.jpg$',
+ 'duration': 7312,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
}]
def _real_extract(self, url):
@@ -60,7 +84,7 @@ class MiTeleIE(InfoExtractor):
'sta': '0',
}
media = self._download_json(
- '%s/?%s' % (gat, compat_urllib_parse.urlencode(encode_dict(token_data))),
+ '%s/?%s' % (gat, compat_urllib_parse_urlencode(token_data)),
display_id, 'Downloading %s JSON' % location['loc'])
file_ = media.get('file')
if not file_:
@@ -68,9 +92,25 @@ class MiTeleIE(InfoExtractor):
formats.extend(self._extract_f4m_formats(
file_ + '&hdcore=3.2.0&plugin=aasp-3.2.0.77.18',
display_id, f4m_id=loc))
+ self._sort_formats(formats)
title = self._search_regex(
- r'class="Destacado-text"[^>]*>\s*<strong>([^<]+)</strong>', webpage, 'title')
+ r'class="Destacado-text"[^>]*>\s*<strong>([^<]+)</strong>',
+ webpage, 'title', default=None)
+
+ mobj = re.search(r'''(?sx)
+ class="Destacado-text"[^>]*>.*?<h1>\s*
+ <span>(?P<series>[^<]+)</span>\s*
+ <span>(?P<season>[^<]+)</span>\s*
+ <span>(?P<episode>[^<]+)</span>''', webpage)
+ series, season, episode = mobj.groups() if mobj else [None] * 3
+
+ if not title:
+ if mobj:
+ title = '%s - %s - %s' % (series, season, episode)
+ else:
+ title = remove_start(self._search_regex(
+ r'<title>([^<]+)</title>', webpage, 'title'), 'Ver online ')
video_id = self._search_regex(
r'data-media-id\s*=\s*"([^"]+)"', webpage,
@@ -83,6 +123,9 @@ class MiTeleIE(InfoExtractor):
'display_id': display_id,
'title': title,
'description': get_element_by_attribute('class', 'text', webpage),
+ 'series': series,
+ 'season': season,
+ 'episode': episode,
'thumbnail': thumbnail,
'duration': duration,
'formats': formats,
diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py
index c2b7ed9..560fe18 100644
--- a/youtube_dl/extractor/mixcloud.py
+++ b/youtube_dl/extractor/mixcloud.py
@@ -1,25 +1,35 @@
from __future__ import unicode_literals
+import base64
+import functools
+import itertools
import re
from .common import InfoExtractor
-from ..compat import compat_urllib_parse_unquote
+from ..compat import (
+ compat_chr,
+ compat_ord,
+ compat_urllib_parse_unquote,
+ compat_urlparse,
+)
from ..utils import (
+ clean_html,
ExtractorError,
- HEADRequest,
+ OnDemandPagedList,
+ parse_count,
str_to_int,
)
class MixcloudIE(InfoExtractor):
- _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([^/]+)/([^/]+)'
+ _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([^/]+)/(?!stream|uploads|favorites|listens|playlists)([^/]+)'
IE_NAME = 'mixcloud'
_TESTS = [{
'url': 'http://www.mixcloud.com/dholbach/cryptkeeper/',
'info_dict': {
'id': 'dholbach-cryptkeeper',
- 'ext': 'mp3',
+ 'ext': 'm4a',
'title': 'Cryptkeeper',
'description': 'After quite a long silence from myself, finally another Drum\'n\'Bass mix with my favourite current dance floor bangers.',
'uploader': 'Daniel Holbach',
@@ -37,22 +47,22 @@ class MixcloudIE(InfoExtractor):
'description': 'md5:2b8aec6adce69f9d41724647c65875e8',
'uploader': 'Gilles Peterson Worldwide',
'uploader_id': 'gillespeterson',
- 'thumbnail': 're:https?://.*/images/',
+ 'thumbnail': 're:https?://.*',
'view_count': int,
'like_count': int,
},
}]
- def _check_url(self, url, track_id, ext):
- try:
- # We only want to know if the request succeed
- # don't download the whole file
- self._request_webpage(
- HEADRequest(url), track_id,
- 'Trying %s URL' % ext)
- return True
- except ExtractorError:
- return False
+ # See https://www.mixcloud.com/media/js2/www_js_2.9e23256562c080482435196ca3975ab5.js
+ @staticmethod
+ def _decrypt_play_info(play_info):
+ KEY = 'pleasedontdownloadourmusictheartistswontgetpaid'
+
+ play_info = base64.b64decode(play_info.encode('ascii'))
+
+ return ''.join([
+ compat_chr(compat_ord(ch) ^ compat_ord(KEY[idx % len(KEY)]))
+ for idx, ch in enumerate(play_info)])
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
@@ -62,14 +72,19 @@ class MixcloudIE(InfoExtractor):
webpage = self._download_webpage(url, track_id)
- preview_url = self._search_regex(
- r'\s(?:data-preview-url|m-preview)="([^"]+)"', webpage, 'preview url')
- song_url = re.sub(r'audiocdn(\d+)', r'stream\1', preview_url)
- song_url = song_url.replace('/previews/', '/c/originals/')
- if not self._check_url(song_url, track_id, 'mp3'):
- song_url = song_url.replace('.mp3', '.m4a').replace('originals/', 'm4a/64/')
- if not self._check_url(song_url, track_id, 'm4a'):
- raise ExtractorError('Unable to extract track url')
+ message = self._html_search_regex(
+ r'(?s)<div[^>]+class="global-message cloudcast-disabled-notice-light"[^>]*>(.+?)<(?:a|/div)',
+ webpage, 'error message', default=None)
+
+ encrypted_play_info = self._search_regex(
+ r'm-play-info="([^"]+)"', webpage, 'play info')
+ play_info = self._parse_json(
+ self._decrypt_play_info(encrypted_play_info), track_id)
+
+ if message and 'stream_url' not in play_info:
+ raise ExtractorError('%s said: %s' % (self.IE_NAME, message), expected=True)
+
+ song_url = play_info['stream_url']
PREFIX = (
r'm-play-on-spacebar[^>]+'
@@ -85,13 +100,13 @@ class MixcloudIE(InfoExtractor):
uploader_id = self._search_regex(
r'\s+"profile": "([^"]+)",', webpage, 'uploader id', fatal=False)
description = self._og_search_description(webpage)
- like_count = str_to_int(self._search_regex(
- r'\bbutton-favorite\b[^>]+m-ajax-toggle-count="([^"]+)"',
- webpage, 'like count', fatal=False))
+ like_count = parse_count(self._search_regex(
+ r'\bbutton-favorite[^>]+>.*?<span[^>]+class=["\']toggle-number[^>]+>\s*([^<]+)',
+ webpage, 'like count', default=None))
view_count = str_to_int(self._search_regex(
[r'<meta itemprop="interactionCount" content="UserPlays:([0-9]+)"',
r'/listeners/?">([0-9,.]+)</a>'],
- webpage, 'play count', fatal=False))
+ webpage, 'play count', default=None))
return {
'id': track_id,
@@ -104,3 +119,201 @@ class MixcloudIE(InfoExtractor):
'view_count': view_count,
'like_count': like_count,
}
+
+
+class MixcloudPlaylistBaseIE(InfoExtractor):
+ _PAGE_SIZE = 24
+
+ def _find_urls_in_page(self, page):
+ for url in re.findall(r'm-play-button m-url="(?P<url>[^"]+)"', page):
+ yield self.url_result(
+ compat_urlparse.urljoin('https://www.mixcloud.com', clean_html(url)),
+ MixcloudIE.ie_key())
+
+ def _fetch_tracks_page(self, path, video_id, page_name, current_page, real_page_number=None):
+ real_page_number = real_page_number or current_page + 1
+ return self._download_webpage(
+ 'https://www.mixcloud.com/%s/' % path, video_id,
+ note='Download %s (page %d)' % (page_name, current_page + 1),
+ errnote='Unable to download %s' % page_name,
+ query={'page': real_page_number, 'list': 'main', '_ajax': '1'},
+ headers={'X-Requested-With': 'XMLHttpRequest'})
+
+ def _tracks_page_func(self, page, video_id, page_name, current_page):
+ resp = self._fetch_tracks_page(page, video_id, page_name, current_page)
+
+ for item in self._find_urls_in_page(resp):
+ yield item
+
+ def _get_user_description(self, page_content):
+ return self._html_search_regex(
+ r'<div[^>]+class="description-text"[^>]*>(.+?)</div>',
+ page_content, 'user description', fatal=False)
+
+
+class MixcloudUserIE(MixcloudPlaylistBaseIE):
+ _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/(?P<user>[^/]+)/(?P<type>uploads|favorites|listens)?/?$'
+ IE_NAME = 'mixcloud:user'
+
+ _TESTS = [{
+ 'url': 'http://www.mixcloud.com/dholbach/',
+ 'info_dict': {
+ 'id': 'dholbach_uploads',
+ 'title': 'Daniel Holbach (uploads)',
+ 'description': 'md5:327af72d1efeb404a8216c27240d1370',
+ },
+ 'playlist_mincount': 11,
+ }, {
+ 'url': 'http://www.mixcloud.com/dholbach/uploads/',
+ 'info_dict': {
+ 'id': 'dholbach_uploads',
+ 'title': 'Daniel Holbach (uploads)',
+ 'description': 'md5:327af72d1efeb404a8216c27240d1370',
+ },
+ 'playlist_mincount': 11,
+ }, {
+ 'url': 'http://www.mixcloud.com/dholbach/favorites/',
+ 'info_dict': {
+ 'id': 'dholbach_favorites',
+ 'title': 'Daniel Holbach (favorites)',
+ 'description': 'md5:327af72d1efeb404a8216c27240d1370',
+ },
+ 'params': {
+ 'playlist_items': '1-100',
+ },
+ 'playlist_mincount': 100,
+ }, {
+ 'url': 'http://www.mixcloud.com/dholbach/listens/',
+ 'info_dict': {
+ 'id': 'dholbach_listens',
+ 'title': 'Daniel Holbach (listens)',
+ 'description': 'md5:327af72d1efeb404a8216c27240d1370',
+ },
+ 'params': {
+ 'playlist_items': '1-100',
+ },
+ 'playlist_mincount': 100,
+ }]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ user_id = mobj.group('user')
+ list_type = mobj.group('type')
+
+ # if only a profile URL was supplied, default to download all uploads
+ if list_type is None:
+ list_type = 'uploads'
+
+ video_id = '%s_%s' % (user_id, list_type)
+
+ profile = self._download_webpage(
+ 'https://www.mixcloud.com/%s/' % user_id, video_id,
+ note='Downloading user profile',
+ errnote='Unable to download user profile')
+
+ username = self._og_search_title(profile)
+ description = self._get_user_description(profile)
+
+ entries = OnDemandPagedList(
+ functools.partial(
+ self._tracks_page_func,
+ '%s/%s' % (user_id, list_type), video_id, 'list of %s' % list_type),
+ self._PAGE_SIZE, use_cache=True)
+
+ return self.playlist_result(
+ entries, video_id, '%s (%s)' % (username, list_type), description)
+
+
+class MixcloudPlaylistIE(MixcloudPlaylistBaseIE):
+ _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/(?P<user>[^/]+)/playlists/(?P<playlist>[^/]+)/?$'
+ IE_NAME = 'mixcloud:playlist'
+
+ _TESTS = [{
+ 'url': 'https://www.mixcloud.com/RedBullThre3style/playlists/tokyo-finalists-2015/',
+ 'info_dict': {
+ 'id': 'RedBullThre3style_tokyo-finalists-2015',
+ 'title': 'National Champions 2015',
+ 'description': 'md5:6ff5fb01ac76a31abc9b3939c16243a3',
+ },
+ 'playlist_mincount': 16,
+ }, {
+ 'url': 'https://www.mixcloud.com/maxvibes/playlists/jazzcat-on-ness-radio/',
+ 'info_dict': {
+ 'id': 'maxvibes_jazzcat-on-ness-radio',
+ 'title': 'Jazzcat on Ness Radio',
+ 'description': 'md5:7bbbf0d6359a0b8cda85224be0f8f263',
+ },
+ 'playlist_mincount': 23
+ }]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ user_id = mobj.group('user')
+ playlist_id = mobj.group('playlist')
+ video_id = '%s_%s' % (user_id, playlist_id)
+
+ profile = self._download_webpage(
+ url, user_id,
+ note='Downloading playlist page',
+ errnote='Unable to download playlist page')
+
+ description = self._get_user_description(profile)
+ playlist_title = self._html_search_regex(
+ r'<span[^>]+class="[^"]*list-playlist-title[^"]*"[^>]*>(.*?)</span>',
+ profile, 'playlist title')
+
+ entries = OnDemandPagedList(
+ functools.partial(
+ self._tracks_page_func,
+ '%s/playlists/%s' % (user_id, playlist_id), video_id, 'tracklist'),
+ self._PAGE_SIZE)
+
+ return self.playlist_result(entries, video_id, playlist_title, description)
+
+
+class MixcloudStreamIE(MixcloudPlaylistBaseIE):
+ _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/(?P<id>[^/]+)/stream/?$'
+ IE_NAME = 'mixcloud:stream'
+
+ _TEST = {
+ 'url': 'https://www.mixcloud.com/FirstEar/stream/',
+ 'info_dict': {
+ 'id': 'FirstEar',
+ 'title': 'First Ear',
+ 'description': 'Curators of good music\nfirstearmusic.com',
+ },
+ 'playlist_mincount': 192,
+ }
+
+ def _real_extract(self, url):
+ user_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, user_id)
+
+ entries = []
+ prev_page_url = None
+
+ def _handle_page(page):
+ entries.extend(self._find_urls_in_page(page))
+ return self._search_regex(
+ r'm-next-page-url="([^"]+)"', page,
+ 'next page URL', default=None)
+
+ next_page_url = _handle_page(webpage)
+
+ for idx in itertools.count(0):
+ if not next_page_url or prev_page_url == next_page_url:
+ break
+
+ prev_page_url = next_page_url
+ current_page = int(self._search_regex(
+ r'\?page=(\d+)', next_page_url, 'next page number'))
+
+ next_page_url = _handle_page(self._fetch_tracks_page(
+ '%s/stream' % user_id, user_id, 'stream', idx,
+ real_page_number=current_page))
+
+ username = self._og_search_title(webpage)
+ description = self._get_user_description(webpage)
+
+ return self.playlist_result(entries, user_id, username, description)
diff --git a/youtube_dl/extractor/mnet.py b/youtube_dl/extractor/mnet.py
new file mode 100644
index 0000000..e3f42e7
--- /dev/null
+++ b/youtube_dl/extractor/mnet.py
@@ -0,0 +1,81 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ int_or_none,
+ parse_duration,
+ parse_iso8601,
+)
+
+
+class MnetIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?mnet\.(?:com|interest\.me)/tv/vod/(?:.*?\bclip_id=)?(?P<id>[0-9]+)'
+ _TESTS = [{
+ 'url': 'http://www.mnet.com/tv/vod/171008',
+ 'info_dict': {
+ 'id': '171008',
+ 'title': 'SS_이해인@히든박스',
+ 'description': 'md5:b9efa592c3918b615ba69fe9f8a05c55',
+ 'duration': 88,
+ 'upload_date': '20151231',
+ 'timestamp': 1451564040,
+ 'age_limit': 0,
+ 'thumbnails': 'mincount:5',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'ext': 'flv',
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'http://mnet.interest.me/tv/vod/172790',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.mnet.com/tv/vod/vod_view.asp?clip_id=172790&tabMenu=',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ info = self._download_json(
+ 'http://content.api.mnet.com/player/vodConfig?id=%s&ctype=CLIP' % video_id,
+ video_id, 'Downloading vod config JSON')['data']['info']
+
+ title = info['title']
+
+ rtmp_info = self._download_json(
+ info['cdn'], video_id, 'Downloading vod cdn JSON')
+
+ formats = [{
+ 'url': rtmp_info['serverurl'] + rtmp_info['fileurl'],
+ 'ext': 'flv',
+ 'page_url': url,
+ 'player_url': 'http://flvfile.mnet.com/service/player/201602/cjem_player_tv.swf?v=201602191318',
+ }]
+
+ description = info.get('ment')
+ duration = parse_duration(info.get('time'))
+ timestamp = parse_iso8601(info.get('date'), delimiter=' ')
+ age_limit = info.get('adult')
+ if age_limit is not None:
+ age_limit = 0 if age_limit == 'N' else 18
+ thumbnails = [{
+ 'id': thumb_format,
+ 'url': thumb['url'],
+ 'width': int_or_none(thumb.get('width')),
+ 'height': int_or_none(thumb.get('height')),
+ } for thumb_format, thumb in info.get('cover', {}).items() if thumb.get('url')]
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'duration': duration,
+ 'timestamp': timestamp,
+ 'age_limit': age_limit,
+ 'thumbnails': thumbnails,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/moevideo.py b/youtube_dl/extractor/moevideo.py
index d930b96..978d5d5 100644
--- a/youtube_dl/extractor/moevideo.py
+++ b/youtube_dl/extractor/moevideo.py
@@ -5,11 +5,11 @@ import json
import re
from .common import InfoExtractor
-from ..compat import compat_urllib_parse
from ..utils import (
ExtractorError,
int_or_none,
sanitized_Request,
+ urlencode_postdata,
)
@@ -77,7 +77,7 @@ class MoeVideoIE(InfoExtractor):
],
]
r_json = json.dumps(r)
- post = compat_urllib_parse.urlencode({'r': r_json})
+ post = urlencode_postdata({'r': r_json})
req = sanitized_Request(self._API_URL, post)
req.add_header('Content-type', 'application/x-www-form-urlencoded')
diff --git a/youtube_dl/extractor/moniker.py b/youtube_dl/extractor/moniker.py
index f6bf94f..b208820 100644
--- a/youtube_dl/extractor/moniker.py
+++ b/youtube_dl/extractor/moniker.py
@@ -5,11 +5,11 @@ import os.path
import re
from .common import InfoExtractor
-from ..compat import compat_urllib_parse
from ..utils import (
ExtractorError,
remove_start,
sanitized_Request,
+ urlencode_postdata,
)
@@ -88,7 +88,7 @@ class MonikerIE(InfoExtractor):
fields = re.findall(r'type="hidden" name="(.+?)"\s* value="?(.+?)">', orig_webpage)
data = dict(fields)
- post = compat_urllib_parse.urlencode(data)
+ post = urlencode_postdata(data)
headers = {
b'Content-Type': b'application/x-www-form-urlencoded',
}
diff --git a/youtube_dl/extractor/mooshare.py b/youtube_dl/extractor/mooshare.py
deleted file mode 100644
index 7cc7f05..0000000
--- a/youtube_dl/extractor/mooshare.py
+++ /dev/null
@@ -1,110 +0,0 @@
-from __future__ import unicode_literals
-
-import re
-
-from .common import InfoExtractor
-from ..compat import compat_urllib_parse
-from ..utils import (
- ExtractorError,
- sanitized_Request,
-)
-
-
-class MooshareIE(InfoExtractor):
- IE_NAME = 'mooshare'
- IE_DESC = 'Mooshare.biz'
- _VALID_URL = r'http://(?:www\.)?mooshare\.biz/(?P<id>[\da-z]{12})'
-
- _TESTS = [
- {
- 'url': 'http://mooshare.biz/8dqtk4bjbp8g',
- 'md5': '4e14f9562928aecd2e42c6f341c8feba',
- 'info_dict': {
- 'id': '8dqtk4bjbp8g',
- 'ext': 'mp4',
- 'title': 'Comedy Football 2011 - (part 1-2)',
- 'duration': 893,
- },
- },
- {
- 'url': 'http://mooshare.biz/aipjtoc4g95j',
- 'info_dict': {
- 'id': 'aipjtoc4g95j',
- 'ext': 'mp4',
- 'title': 'Orange Caramel Dashing Through the Snow',
- 'duration': 212,
- },
- 'params': {
- # rtmp download
- 'skip_download': True,
- }
- }
- ]
-
- def _real_extract(self, url):
- video_id = self._match_id(url)
- page = self._download_webpage(url, video_id, 'Downloading page')
-
- if re.search(r'>Video Not Found or Deleted<', page) is not None:
- raise ExtractorError('Video %s does not exist' % video_id, expected=True)
-
- hash_key = self._html_search_regex(r'<input type="hidden" name="hash" value="([^"]+)">', page, 'hash')
- title = self._html_search_regex(r'(?m)<div class="blockTitle">\s*<h2>Watch ([^<]+)</h2>', page, 'title')
-
- download_form = {
- 'op': 'download1',
- 'id': video_id,
- 'hash': hash_key,
- }
-
- request = sanitized_Request(
- 'http://mooshare.biz/%s' % video_id, compat_urllib_parse.urlencode(download_form))
- request.add_header('Content-Type', 'application/x-www-form-urlencoded')
-
- self._sleep(5, video_id)
-
- video_page = self._download_webpage(request, video_id, 'Downloading video page')
-
- thumbnail = self._html_search_regex(r'image:\s*"([^"]+)",', video_page, 'thumbnail', fatal=False)
- duration_str = self._html_search_regex(r'duration:\s*"(\d+)",', video_page, 'duration', fatal=False)
- duration = int(duration_str) if duration_str is not None else None
-
- formats = []
-
- # SD video
- mobj = re.search(r'(?m)file:\s*"(?P<url>[^"]+)",\s*provider:', video_page)
- if mobj is not None:
- formats.append({
- 'url': mobj.group('url'),
- 'format_id': 'sd',
- 'format': 'SD',
- })
-
- # HD video
- mobj = re.search(r'\'hd-2\': { file: \'(?P<url>[^\']+)\' },', video_page)
- if mobj is not None:
- formats.append({
- 'url': mobj.group('url'),
- 'format_id': 'hd',
- 'format': 'HD',
- })
-
- # rtmp video
- mobj = re.search(r'(?m)file: "(?P<playpath>[^"]+)",\s*streamer: "(?P<rtmpurl>rtmp://[^"]+)",', video_page)
- if mobj is not None:
- formats.append({
- 'url': mobj.group('rtmpurl'),
- 'play_path': mobj.group('playpath'),
- 'rtmp_live': False,
- 'ext': 'mp4',
- 'format_id': 'rtmp',
- 'format': 'HD',
- })
-
- return {
- 'id': video_id,
- 'title': title,
- 'thumbnail': thumbnail,
- 'duration': duration,
- 'formats': formats,
- }
diff --git a/youtube_dl/extractor/motherless.py b/youtube_dl/extractor/motherless.py
index 97d5da6..5e1a8a7 100644
--- a/youtube_dl/extractor/motherless.py
+++ b/youtube_dl/extractor/motherless.py
@@ -5,62 +5,73 @@ import re
from .common import InfoExtractor
from ..utils import (
+ ExtractorError,
str_to_int,
unified_strdate,
)
class MotherlessIE(InfoExtractor):
- _VALID_URL = r'http://(?:www\.)?motherless\.com/(?:g/[a-z0-9_]+/)?(?P<id>[A-Z0-9]+)'
- _TESTS = [
- {
- 'url': 'http://motherless.com/AC3FFE1',
- 'md5': '310f62e325a9fafe64f68c0bccb6e75f',
- 'info_dict': {
- 'id': 'AC3FFE1',
- 'ext': 'mp4',
- 'title': 'Fucked in the ass while playing PS3',
- 'categories': ['Gaming', 'anal', 'reluctant', 'rough', 'Wife'],
- 'upload_date': '20100913',
- 'uploader_id': 'famouslyfuckedup',
- 'thumbnail': 're:http://.*\.jpg',
- 'age_limit': 18,
- }
- },
- {
- 'url': 'http://motherless.com/532291B',
- 'md5': 'bc59a6b47d1f958e61fbd38a4d31b131',
- 'info_dict': {
- 'id': '532291B',
- 'ext': 'mp4',
- 'title': 'Amazing girl playing the omegle game, PERFECT!',
- 'categories': ['Amateur', 'webcam', 'omegle', 'pink', 'young', 'masturbate', 'teen', 'game', 'hairy'],
- 'upload_date': '20140622',
- 'uploader_id': 'Sulivana7x',
- 'thumbnail': 're:http://.*\.jpg',
- 'age_limit': 18,
- }
+ _VALID_URL = r'https?://(?:www\.)?motherless\.com/(?:g/[a-z0-9_]+/)?(?P<id>[A-Z0-9]+)'
+ _TESTS = [{
+ 'url': 'http://motherless.com/AC3FFE1',
+ 'md5': '310f62e325a9fafe64f68c0bccb6e75f',
+ 'info_dict': {
+ 'id': 'AC3FFE1',
+ 'ext': 'mp4',
+ 'title': 'Fucked in the ass while playing PS3',
+ 'categories': ['Gaming', 'anal', 'reluctant', 'rough', 'Wife'],
+ 'upload_date': '20100913',
+ 'uploader_id': 'famouslyfuckedup',
+ 'thumbnail': 're:http://.*\.jpg',
+ 'age_limit': 18,
+ }
+ }, {
+ 'url': 'http://motherless.com/532291B',
+ 'md5': 'bc59a6b47d1f958e61fbd38a4d31b131',
+ 'info_dict': {
+ 'id': '532291B',
+ 'ext': 'mp4',
+ 'title': 'Amazing girl playing the omegle game, PERFECT!',
+ 'categories': ['Amateur', 'webcam', 'omegle', 'pink', 'young', 'masturbate', 'teen',
+ 'game', 'hairy'],
+ 'upload_date': '20140622',
+ 'uploader_id': 'Sulivana7x',
+ 'thumbnail': 're:http://.*\.jpg',
+ 'age_limit': 18,
},
- {
- 'url': 'http://motherless.com/g/cosplay/633979F',
- 'md5': '0b2a43f447a49c3e649c93ad1fafa4a0',
- 'info_dict': {
- 'id': '633979F',
- 'ext': 'mp4',
- 'title': 'Turtlette',
- 'categories': ['superheroine heroine superher'],
- 'upload_date': '20140827',
- 'uploader_id': 'shade0230',
- 'thumbnail': 're:http://.*\.jpg',
- 'age_limit': 18,
- }
+ 'skip': '404',
+ }, {
+ 'url': 'http://motherless.com/g/cosplay/633979F',
+ 'md5': '0b2a43f447a49c3e649c93ad1fafa4a0',
+ 'info_dict': {
+ 'id': '633979F',
+ 'ext': 'mp4',
+ 'title': 'Turtlette',
+ 'categories': ['superheroine heroine superher'],
+ 'upload_date': '20140827',
+ 'uploader_id': 'shade0230',
+ 'thumbnail': 're:http://.*\.jpg',
+ 'age_limit': 18,
}
- ]
+ }, {
+ # no keywords
+ 'url': 'http://motherless.com/8B4BBC1',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
+ if any(p in webpage for p in (
+ '<title>404 - MOTHERLESS.COM<',
+ ">The page you're looking for cannot be found.<")):
+ raise ExtractorError('Video %s does not exist' % video_id, expected=True)
+
+ if '>The content you are trying to view is for friends only.' in webpage:
+ raise ExtractorError('Video %s is for friends only' % video_id, expected=True)
+
title = self._html_search_regex(
r'id="view-upload-title">\s+([^<]+)<', webpage, 'title')
video_url = self._html_search_regex(
@@ -86,7 +97,7 @@ class MotherlessIE(InfoExtractor):
r'"thumb-member-username">\s+<a href="/m/([^"]+)"',
webpage, 'uploader_id')
- categories = self._html_search_meta('keywords', webpage)
+ categories = self._html_search_meta('keywords', webpage, default=None)
if categories:
categories = [cat.strip() for cat in categories.split(',')]
diff --git a/youtube_dl/extractor/motorsport.py b/youtube_dl/extractor/motorsport.py
index c1a482d..370328b 100644
--- a/youtube_dl/extractor/motorsport.py
+++ b/youtube_dl/extractor/motorsport.py
@@ -9,7 +9,7 @@ from ..compat import (
class MotorsportIE(InfoExtractor):
IE_DESC = 'motorsport.com'
- _VALID_URL = r'http://www\.motorsport\.com/[^/?#]+/video/(?:[^/?#]+/)(?P<id>[^/]+)/?(?:$|[?#])'
+ _VALID_URL = r'https?://www\.motorsport\.com/[^/?#]+/video/(?:[^/?#]+/)(?P<id>[^/]+)/?(?:$|[?#])'
_TEST = {
'url': 'http://www.motorsport.com/f1/video/main-gallery/red-bull-racing-2014-rules-explained/',
'info_dict': {
diff --git a/youtube_dl/extractor/movieclips.py b/youtube_dl/extractor/movieclips.py
index 1564cb7..d0cb827 100644
--- a/youtube_dl/extractor/movieclips.py
+++ b/youtube_dl/extractor/movieclips.py
@@ -2,39 +2,48 @@
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..utils import sanitized_Request
+from ..utils import (
+ smuggle_url,
+ float_or_none,
+ parse_iso8601,
+ update_url_query,
+)
class MovieClipsIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www.)?movieclips\.com/videos/(?P<id>[^/?#]+)'
+ _VALID_URL = r'https?://(?:www.)?movieclips\.com/videos/.+-(?P<id>\d+)(?:\?|$)'
_TEST = {
- 'url': 'http://www.movieclips.com/videos/warcraft-trailer-1-561180739597?autoPlay=true&playlistId=5',
+ 'url': 'http://www.movieclips.com/videos/warcraft-trailer-1-561180739597',
+ 'md5': '42b5a0352d4933a7bd54f2104f481244',
'info_dict': {
'id': 'pKIGmG83AqD9',
- 'display_id': 'warcraft-trailer-1-561180739597',
'ext': 'mp4',
'title': 'Warcraft Trailer 1',
'description': 'Watch Trailer 1 from Warcraft (2016). Legendary’s WARCRAFT is a 3D epic adventure of world-colliding conflict based.',
'thumbnail': 're:^https?://.*\.jpg$',
+ 'timestamp': 1446843055,
+ 'upload_date': '20151106',
+ 'uploader': 'Movieclips',
},
'add_ie': ['ThePlatform'],
}
def _real_extract(self, url):
- display_id = self._match_id(url)
-
- req = sanitized_Request(url)
- # it doesn't work if it thinks the browser it's too old
- req.add_header('User-Agent', 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/43.0 (Chrome)')
- webpage = self._download_webpage(req, display_id)
- theplatform_link = self._html_search_regex(r'src="(http://player.theplatform.com/p/.*?)"', webpage, 'theplatform link')
- title = self._html_search_regex(r'<title[^>]*>([^>]+)-\s*\d+\s*|\s*Movieclips.com</title>', webpage, 'title')
- description = self._html_search_meta('description', webpage)
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+ video = next(v for v in self._parse_json(self._search_regex(
+ r'var\s+__REACT_ENGINE__\s*=\s*({.+});',
+ webpage, 'react engine'), video_id)['playlist']['videos'] if v['id'] == video_id)
return {
'_type': 'url_transparent',
- 'url': theplatform_link,
- 'title': title,
- 'display_id': display_id,
- 'description': description,
+ 'ie_key': 'ThePlatform',
+ 'url': smuggle_url(update_url_query(
+ video['contentUrl'], {'mbr': 'true'}), {'force_smil_url': True}),
+ 'title': self._og_search_title(webpage),
+ 'description': self._html_search_meta('description', webpage),
+ 'duration': float_or_none(video.get('duration')),
+ 'timestamp': parse_iso8601(video.get('dateCreated')),
+ 'thumbnail': video.get('defaultImage'),
+ 'uploader': video.get('provider'),
}
diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py
index ed06836..dd06395 100644
--- a/youtube_dl/extractor/mtv.py
+++ b/youtube_dl/extractor/mtv.py
@@ -4,8 +4,9 @@ import re
from .common import InfoExtractor
from ..compat import (
- compat_urllib_parse,
+ compat_urllib_parse_urlencode,
compat_str,
+ compat_xpath,
)
from ..utils import (
ExtractorError,
@@ -17,6 +18,7 @@ from ..utils import (
unescapeHTML,
url_basename,
RegexNotFoundError,
+ xpath_text,
)
@@ -83,9 +85,10 @@ class MTVServicesInfoExtractor(InfoExtractor):
rtmp_video_url = rendition.find('./src').text
if rtmp_video_url.endswith('siteunavail.png'):
continue
+ new_url = self._transform_rtmp_url(rtmp_video_url)
formats.append({
- 'ext': ext,
- 'url': self._transform_rtmp_url(rtmp_video_url),
+ 'ext': 'flv' if new_url.startswith('rtmp') else ext,
+ 'url': new_url,
'format_id': rendition.get('bitrate'),
'width': int(rendition.get('width')),
'height': int(rendition.get('height')),
@@ -130,11 +133,7 @@ class MTVServicesInfoExtractor(InfoExtractor):
message += item.text
raise ExtractorError(message, expected=True)
- description_node = itemdoc.find('description')
- if description_node is not None:
- description = description_node.text.strip()
- else:
- description = None
+ description = xpath_text(itemdoc, 'description')
title_el = None
if title_el is None:
@@ -142,9 +141,9 @@ class MTVServicesInfoExtractor(InfoExtractor):
itemdoc, './/{http://search.yahoo.com/mrss/}category',
'scheme', 'urn:mtvn:video_title')
if title_el is None:
- title_el = itemdoc.find('.//{http://search.yahoo.com/mrss/}title')
+ title_el = itemdoc.find(compat_xpath('.//{http://search.yahoo.com/mrss/}title'))
if title_el is None:
- title_el = itemdoc.find('.//title') or itemdoc.find('./title')
+ title_el = itemdoc.find(compat_xpath('.//title'))
if title_el.text is None:
title_el = None
@@ -174,7 +173,7 @@ class MTVServicesInfoExtractor(InfoExtractor):
data = {'uri': uri}
if self._LANG:
data['lang'] = self._LANG
- return compat_urllib_parse.urlencode(data)
+ return compat_urllib_parse_urlencode(data)
def _get_videos_info(self, uri):
video_id = self._id_from_uri(uri)
diff --git a/youtube_dl/extractor/musicplayon.py b/youtube_dl/extractor/musicplayon.py
index 50d92b5..2174e56 100644
--- a/youtube_dl/extractor/musicplayon.py
+++ b/youtube_dl/extractor/musicplayon.py
@@ -1,17 +1,21 @@
# encoding: utf-8
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
-from ..utils import int_or_none
+from ..compat import compat_urlparse
+from ..utils import (
+ int_or_none,
+ js_to_json,
+ mimetype2ext,
+)
class MusicPlayOnIE(InfoExtractor):
- _VALID_URL = r'https?://(?:.+?\.)?musicplayon\.com/play(?:-touch)?\?(?:v|pl=100&play)=(?P<id>\d+)'
+ _VALID_URL = r'https?://(?:.+?\.)?musicplayon\.com/play(?:-touch)?\?(?:v|pl=\d+&play)=(?P<id>\d+)'
- _TEST = {
+ _TESTS = [{
'url': 'http://en.musicplayon.com/play?v=433377',
+ 'md5': '00cdcdea1726abdf500d1e7fd6dd59bb',
'info_dict': {
'id': '433377',
'ext': 'mp4',
@@ -20,15 +24,16 @@ class MusicPlayOnIE(InfoExtractor):
'duration': 342,
'uploader': 'ultrafish',
},
- 'params': {
- # m3u8 download
- 'skip_download': True,
- },
- }
+ }, {
+ 'url': 'http://en.musicplayon.com/play?pl=102&play=442629',
+ 'only_matching': True,
+ }]
+
+ _URL_TEMPLATE = 'http://en.musicplayon.com/play?v=%s'
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
+ url = self._URL_TEMPLATE % video_id
page = self._download_webpage(url, video_id)
@@ -40,28 +45,14 @@ class MusicPlayOnIE(InfoExtractor):
uploader = self._html_search_regex(
r'<div>by&nbsp;<a href="[^"]+" class="purple">([^<]+)</a></div>', page, 'uploader', fatal=False)
- formats = [
- {
- 'url': 'http://media0-eu-nl.musicplayon.com/stream-mobile?id=%s&type=.mp4' % video_id,
- 'ext': 'mp4',
- }
- ]
-
- manifest = self._download_webpage(
- 'http://en.musicplayon.com/manifest.m3u8?v=%s' % video_id, video_id, 'Downloading manifest')
-
- for entry in manifest.split('#')[1:]:
- if entry.startswith('EXT-X-STREAM-INF:'):
- meta, url, _ = entry.split('\n')
- params = dict(param.split('=') for param in meta.split(',')[1:])
- formats.append({
- 'url': url,
- 'ext': 'mp4',
- 'tbr': int(params['BANDWIDTH']),
- 'width': int(params['RESOLUTION'].split('x')[1]),
- 'height': int(params['RESOLUTION'].split('x')[-1]),
- 'format_note': params['NAME'].replace('"', '').strip(),
- })
+ sources = self._parse_json(
+ self._search_regex(r'setup\[\'_sources\'\]\s*=\s*([^;]+);', page, 'video sources'),
+ video_id, transform_source=js_to_json)
+ formats = [{
+ 'url': compat_urlparse.urljoin(url, source['src']),
+ 'ext': mimetype2ext(source.get('type')),
+ 'format_note': source.get('data-res'),
+ } for source in sources]
return {
'id': video_id,
diff --git a/youtube_dl/extractor/muzu.py b/youtube_dl/extractor/muzu.py
deleted file mode 100644
index 1e9cf8d..0000000
--- a/youtube_dl/extractor/muzu.py
+++ /dev/null
@@ -1,65 +0,0 @@
-from __future__ import unicode_literals
-
-from .common import InfoExtractor
-from ..compat import (
- compat_urllib_parse,
-)
-
-
-class MuzuTVIE(InfoExtractor):
- _VALID_URL = r'https?://www\.muzu\.tv/(.+?)/(.+?)/(?P<id>\d+)'
- IE_NAME = 'muzu.tv'
-
- _TEST = {
- 'url': 'http://www.muzu.tv/defected/marcashken-featuring-sos-cat-walk-original-mix-music-video/1981454/',
- 'md5': '98f8b2c7bc50578d6a0364fff2bfb000',
- 'info_dict': {
- 'id': '1981454',
- 'ext': 'mp4',
- 'title': 'Cat Walk (Original Mix)',
- 'description': 'md5:90e868994de201b2570e4e5854e19420',
- 'uploader': 'MarcAshken featuring SOS',
- },
- }
-
- def _real_extract(self, url):
- video_id = self._match_id(url)
-
- info_data = compat_urllib_parse.urlencode({
- 'format': 'json',
- 'url': url,
- })
- info = self._download_json(
- 'http://www.muzu.tv/api/oembed/?%s' % info_data,
- video_id, 'Downloading video info')
-
- player_info = self._download_json(
- 'http://player.muzu.tv/player/playerInit?ai=%s' % video_id,
- video_id, 'Downloading player info')
- video_info = player_info['videos'][0]
- for quality in ['1080', '720', '480', '360']:
- if video_info.get('v%s' % quality):
- break
-
- data = compat_urllib_parse.urlencode({
- 'ai': video_id,
- # Even if each time you watch a video the hash changes,
- # it seems to work for different videos, and it will work
- # even if you use any non empty string as a hash
- 'viewhash': 'VBNff6djeV4HV5TRPW5kOHub2k',
- 'device': 'web',
- 'qv': quality,
- })
- video_url_info = self._download_json(
- 'http://player.muzu.tv/player/requestVideo?%s' % data,
- video_id, 'Downloading video url')
- video_url = video_url_info['url']
-
- return {
- 'id': video_id,
- 'title': info['title'],
- 'url': video_url,
- 'thumbnail': info['thumbnail_url'],
- 'description': info['description'],
- 'uploader': info['author_name'],
- }
diff --git a/youtube_dl/extractor/mwave.py b/youtube_dl/extractor/mwave.py
index 66b5231..a103e03 100644
--- a/youtube_dl/extractor/mwave.py
+++ b/youtube_dl/extractor/mwave.py
@@ -10,9 +10,10 @@ from ..utils import (
class MwaveIE(InfoExtractor):
_VALID_URL = r'https?://mwave\.interest\.me/mnettv/videodetail\.m\?searchVideoDetailVO\.clip_id=(?P<id>[0-9]+)'
+ _URL_TEMPLATE = 'http://mwave.interest.me/mnettv/videodetail.m?searchVideoDetailVO.clip_id=%s'
_TEST = {
'url': 'http://mwave.interest.me/mnettv/videodetail.m?searchVideoDetailVO.clip_id=168859',
- 'md5': 'c930e27b7720aaa3c9d0018dfc8ff6cc',
+ # md5 is unstable
'info_dict': {
'id': '168859',
'ext': 'flv',
@@ -56,3 +57,28 @@ class MwaveIE(InfoExtractor):
'view_count': int_or_none(vod_info.get('hit')),
'formats': formats,
}
+
+
+class MwaveMeetGreetIE(InfoExtractor):
+ _VALID_URL = r'https?://mwave\.interest\.me/meetgreet/view/(?P<id>\d+)'
+ _TEST = {
+ 'url': 'http://mwave.interest.me/meetgreet/view/256',
+ 'info_dict': {
+ 'id': '173294',
+ 'ext': 'flv',
+ 'title': '[MEET&GREET] Park BoRam',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'uploader': 'Mwave',
+ 'duration': 3634,
+ 'view_count': int,
+ }
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+ clip_id = self._html_search_regex(
+ r'<iframe[^>]+src="/mnettv/ifr_clip\.m\?searchVideoDetailVO\.clip_id=(\d+)',
+ webpage, 'clip ID')
+ clip_url = MwaveIE._URL_TEMPLATE % clip_id
+ return self.url_result(clip_url, 'Mwave', clip_id)
diff --git a/youtube_dl/extractor/myspace.py b/youtube_dl/extractor/myspace.py
index 83414a2..0d5238d 100644
--- a/youtube_dl/extractor/myspace.py
+++ b/youtube_dl/extractor/myspace.py
@@ -2,13 +2,13 @@
from __future__ import unicode_literals
import re
-import json
from .common import InfoExtractor
-from ..compat import (
- compat_str,
+from ..utils import (
+ ExtractorError,
+ int_or_none,
+ parse_iso8601,
)
-from ..utils import ExtractorError
class MySpaceIE(InfoExtractor):
@@ -24,6 +24,8 @@ class MySpaceIE(InfoExtractor):
'description': 'This country quartet was all smiles while playing a sold out show at the Pacific Amphitheatre in Orange County, California.',
'uploader': 'Five Minutes to the Stage',
'uploader_id': 'fiveminutestothestage',
+ 'timestamp': 1414108751,
+ 'upload_date': '20141023',
},
'params': {
# rtmp download
@@ -64,7 +66,7 @@ class MySpaceIE(InfoExtractor):
'ext': 'mp4',
'title': 'Starset - First Light',
'description': 'md5:2d5db6c9d11d527683bcda818d332414',
- 'uploader': 'Jacob Soren',
+ 'uploader': 'Yumi K',
'uploader_id': 'SorenPromotions',
'upload_date': '20140725',
}
@@ -78,6 +80,19 @@ class MySpaceIE(InfoExtractor):
player_url = self._search_regex(
r'playerSwf":"([^"?]*)', webpage, 'player URL')
+ def rtmp_format_from_stream_url(stream_url, width=None, height=None):
+ rtmp_url, play_path = stream_url.split(';', 1)
+ return {
+ 'format_id': 'rtmp',
+ 'url': rtmp_url,
+ 'play_path': play_path,
+ 'player_url': player_url,
+ 'protocol': 'rtmp',
+ 'ext': 'flv',
+ 'width': width,
+ 'height': height,
+ }
+
if mobj.group('mediatype').startswith('music/song'):
# songs don't store any useful info in the 'context' variable
song_data = self._search_regex(
@@ -93,8 +108,8 @@ class MySpaceIE(InfoExtractor):
return self._search_regex(
r'''data-%s=([\'"])(?P<data>.*?)\1''' % name,
song_data, name, default='', group='data')
- streamUrl = search_data('stream-url')
- if not streamUrl:
+ stream_url = search_data('stream-url')
+ if not stream_url:
vevo_id = search_data('vevo-id')
youtube_id = search_data('youtube-id')
if vevo_id:
@@ -106,36 +121,47 @@ class MySpaceIE(InfoExtractor):
else:
raise ExtractorError(
'Found song but don\'t know how to download it')
- info = {
+ return {
'id': video_id,
'title': self._og_search_title(webpage),
'uploader': search_data('artist-name'),
'uploader_id': search_data('artist-username'),
'thumbnail': self._og_search_thumbnail(webpage),
+ 'duration': int_or_none(search_data('duration')),
+ 'formats': [rtmp_format_from_stream_url(stream_url)]
}
else:
- context = json.loads(self._search_regex(
- r'context = ({.*?});', webpage, 'context'))
- video = context['video']
- streamUrl = video['streamUrl']
- info = {
- 'id': compat_str(video['mediaId']),
+ video = self._parse_json(self._search_regex(
+ r'context = ({.*?});', webpage, 'context'),
+ video_id)['video']
+ formats = []
+ hls_stream_url = video.get('hlsStreamUrl')
+ if hls_stream_url:
+ formats.append({
+ 'format_id': 'hls',
+ 'url': hls_stream_url,
+ 'protocol': 'm3u8_native',
+ 'ext': 'mp4',
+ })
+ stream_url = video.get('streamUrl')
+ if stream_url:
+ formats.append(rtmp_format_from_stream_url(
+ stream_url,
+ int_or_none(video.get('width')),
+ int_or_none(video.get('height'))))
+ self._sort_formats(formats)
+ return {
+ 'id': video_id,
'title': video['title'],
- 'description': video['description'],
- 'thumbnail': video['imageUrl'],
- 'uploader': video['artistName'],
- 'uploader_id': video['artistUsername'],
+ 'description': video.get('description'),
+ 'thumbnail': video.get('imageUrl'),
+ 'uploader': video.get('artistName'),
+ 'uploader_id': video.get('artistUsername'),
+ 'duration': int_or_none(video.get('duration')),
+ 'timestamp': parse_iso8601(video.get('dateAdded')),
+ 'formats': formats,
}
- rtmp_url, play_path = streamUrl.split(';', 1)
- info.update({
- 'url': rtmp_url,
- 'play_path': play_path,
- 'player_url': player_url,
- 'ext': 'flv',
- })
- return info
-
class MySpaceAlbumIE(InfoExtractor):
IE_NAME = 'MySpace:album'
diff --git a/youtube_dl/extractor/myspass.py b/youtube_dl/extractor/myspass.py
index f936b92..1ca7b1a 100644
--- a/youtube_dl/extractor/myspass.py
+++ b/youtube_dl/extractor/myspass.py
@@ -11,7 +11,7 @@ from ..utils import (
class MySpassIE(InfoExtractor):
- _VALID_URL = r'http://www\.myspass\.de/.*'
+ _VALID_URL = r'https?://www\.myspass\.de/.*'
_TEST = {
'url': 'http://www.myspass.de/myspass/shows/tvshows/absolute-mehrheit/Absolute-Mehrheit-vom-17022013-Die-Highlights-Teil-2--/11741/',
'md5': '0b49f4844a068f8b33f4b7c88405862b',
diff --git a/youtube_dl/extractor/myvideo.py b/youtube_dl/extractor/myvideo.py
index 1e21cf9..6d447a4 100644
--- a/youtube_dl/extractor/myvideo.py
+++ b/youtube_dl/extractor/myvideo.py
@@ -9,8 +9,8 @@ import json
from .common import InfoExtractor
from ..compat import (
compat_ord,
- compat_urllib_parse,
compat_urllib_parse_unquote,
+ compat_urllib_parse_urlencode,
)
from ..utils import (
ExtractorError,
@@ -20,7 +20,7 @@ from ..utils import (
class MyVideoIE(InfoExtractor):
_WORKING = False
- _VALID_URL = r'http://(?:www\.)?myvideo\.de/(?:[^/]+/)?watch/(?P<id>[0-9]+)/[^?/]+.*'
+ _VALID_URL = r'https?://(?:www\.)?myvideo\.de/(?:[^/]+/)?watch/(?P<id>[0-9]+)/[^?/]+.*'
IE_NAME = 'myvideo'
_TEST = {
'url': 'http://www.myvideo.de/watch/8229274/bowling_fail_or_win',
@@ -112,7 +112,7 @@ class MyVideoIE(InfoExtractor):
encxml = compat_urllib_parse_unquote(b)
if not params.get('domain'):
params['domain'] = 'www.myvideo.de'
- xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
+ xmldata_url = '%s?%s' % (encxml, compat_urllib_parse_urlencode(params))
if 'flash_playertype=MTV' in xmldata_url:
self._downloader.report_warning('avoiding MTV player')
xmldata_url = (
diff --git a/youtube_dl/extractor/myvidster.py b/youtube_dl/extractor/myvidster.py
index a94ab83..731c245 100644
--- a/youtube_dl/extractor/myvidster.py
+++ b/youtube_dl/extractor/myvidster.py
@@ -4,7 +4,7 @@ from .common import InfoExtractor
class MyVidsterIE(InfoExtractor):
- _VALID_URL = r'http://(?:www\.)?myvidster\.com/video/(?P<id>\d+)/'
+ _VALID_URL = r'https?://(?:www\.)?myvidster\.com/video/(?P<id>\d+)/'
_TEST = {
'url': 'http://www.myvidster.com/video/32059805/Hot_chemistry_with_raw_love_making',
diff --git a/youtube_dl/extractor/nationalgeographic.py b/youtube_dl/extractor/nationalgeographic.py
index 6fc9e7b..7225186 100644
--- a/youtube_dl/extractor/nationalgeographic.py
+++ b/youtube_dl/extractor/nationalgeographic.py
@@ -4,30 +4,40 @@ from .common import InfoExtractor
from ..utils import (
smuggle_url,
url_basename,
+ update_url_query,
)
class NationalGeographicIE(InfoExtractor):
- _VALID_URL = r'http://video\.nationalgeographic\.com/.*?'
+ IE_NAME = 'natgeo'
+ _VALID_URL = r'https?://video\.nationalgeographic\.com/.*?'
_TESTS = [
{
'url': 'http://video.nationalgeographic.com/video/news/150210-news-crab-mating-vin?source=featuredvideo',
+ 'md5': '730855d559abbad6b42c2be1fa584917',
'info_dict': {
- 'id': '4DmDACA6Qtk_',
- 'ext': 'flv',
+ 'id': '0000014b-70a1-dd8c-af7f-f7b559330001',
+ 'ext': 'mp4',
'title': 'Mating Crabs Busted by Sharks',
'description': 'md5:16f25aeffdeba55aaa8ec37e093ad8b3',
+ 'timestamp': 1423523799,
+ 'upload_date': '20150209',
+ 'uploader': 'NAGS',
},
'add_ie': ['ThePlatform'],
},
{
'url': 'http://video.nationalgeographic.com/wild/when-sharks-attack/the-real-jaws',
+ 'md5': '6a3105eb448c070503b3105fb9b320b5',
'info_dict': {
- 'id': '_JeBD_D7PlS5',
- 'ext': 'flv',
+ 'id': 'ngc-I0IauNSWznb_UV008GxSbwY35BZvgi2e',
+ 'ext': 'mp4',
'title': 'The Real Jaws',
'description': 'md5:8d3e09d9d53a85cd397b4b21b2c77be6',
+ 'timestamp': 1433772632,
+ 'upload_date': '20150608',
+ 'uploader': 'NAGS',
},
'add_ie': ['ThePlatform'],
},
@@ -37,18 +47,67 @@ class NationalGeographicIE(InfoExtractor):
name = url_basename(url)
webpage = self._download_webpage(url, name)
- feed_url = self._search_regex(
- r'data-feed-url="([^"]+)"', webpage, 'feed url')
guid = self._search_regex(
r'id="(?:videoPlayer|player-container)"[^>]+data-guid="([^"]+)"',
webpage, 'guid')
- feed = self._download_xml('%s?byGuid=%s' % (feed_url, guid), name)
- content = feed.find('.//{http://search.yahoo.com/mrss/}content')
- theplatform_id = url_basename(content.attrib.get('url'))
+ return {
+ '_type': 'url_transparent',
+ 'ie_key': 'ThePlatform',
+ 'url': smuggle_url(
+ 'http://link.theplatform.com/s/ngs/media/guid/2423130747/%s?mbr=true' % guid,
+ {'force_smil_url': True}),
+ 'id': guid,
+ }
- return self.url_result(smuggle_url(
- 'http://link.theplatform.com/s/ngs/%s?format=SMIL&formats=MPEG4&manifest=f4m' % theplatform_id,
- # For some reason, the normal links don't work and we must force
- # the use of f4m
- {'force_smil_url': True}))
+
+class NationalGeographicChannelIE(InfoExtractor):
+ IE_NAME = 'natgeo:channel'
+ _VALID_URL = r'https?://channel\.nationalgeographic\.com/(?:wild/)?[^/]+/videos/(?P<id>[^/?]+)'
+
+ _TESTS = [
+ {
+ 'url': 'http://channel.nationalgeographic.com/the-story-of-god-with-morgan-freeman/videos/uncovering-a-universal-knowledge/',
+ 'md5': '518c9aa655686cf81493af5cc21e2a04',
+ 'info_dict': {
+ 'id': 'nB5vIAfmyllm',
+ 'ext': 'mp4',
+ 'title': 'Uncovering a Universal Knowledge',
+ 'description': 'md5:1a89148475bf931b3661fcd6ddb2ae3a',
+ 'timestamp': 1458680907,
+ 'upload_date': '20160322',
+ 'uploader': 'NEWA-FNG-NGTV',
+ },
+ 'add_ie': ['ThePlatform'],
+ },
+ {
+ 'url': 'http://channel.nationalgeographic.com/wild/destination-wild/videos/the-stunning-red-bird-of-paradise/',
+ 'md5': 'c4912f656b4cbe58f3e000c489360989',
+ 'info_dict': {
+ 'id': '3TmMv9OvGwIR',
+ 'ext': 'mp4',
+ 'title': 'The Stunning Red Bird of Paradise',
+ 'description': 'md5:7bc8cd1da29686be4d17ad1230f0140c',
+ 'timestamp': 1459362152,
+ 'upload_date': '20160330',
+ 'uploader': 'NEWA-FNG-NGTV',
+ },
+ 'add_ie': ['ThePlatform'],
+ },
+ ]
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
+ release_url = self._search_regex(
+ r'video_auth_playlist_url\s*=\s*"([^"]+)"',
+ webpage, 'release url')
+
+ return {
+ '_type': 'url_transparent',
+ 'ie_key': 'ThePlatform',
+ 'url': smuggle_url(
+ update_url_query(release_url, {'mbr': 'true', 'switch': 'http'}),
+ {'force_smil_url': True}),
+ 'display_id': display_id,
+ }
diff --git a/youtube_dl/extractor/naver.py b/youtube_dl/extractor/naver.py
index 1f5fc21..6d6f69b 100644
--- a/youtube_dl/extractor/naver.py
+++ b/youtube_dl/extractor/naver.py
@@ -5,7 +5,7 @@ import re
from .common import InfoExtractor
from ..compat import (
- compat_urllib_parse,
+ compat_urllib_parse_urlencode,
compat_urlparse,
)
from ..utils import (
@@ -53,8 +53,8 @@ class NaverIE(InfoExtractor):
raise ExtractorError('couldn\'t extract vid and key')
vid = m_id.group(1)
key = m_id.group(2)
- query = compat_urllib_parse.urlencode({'vid': vid, 'inKey': key, })
- query_urls = compat_urllib_parse.urlencode({
+ query = compat_urllib_parse_urlencode({'vid': vid, 'inKey': key, })
+ query_urls = compat_urllib_parse_urlencode({
'masterVid': vid,
'protocol': 'p2p',
'inKey': key,
diff --git a/youtube_dl/extractor/nba.py b/youtube_dl/extractor/nba.py
index a071378..d896b0d 100644
--- a/youtube_dl/extractor/nba.py
+++ b/youtube_dl/extractor/nba.py
@@ -1,18 +1,26 @@
from __future__ import unicode_literals
+import functools
+import os.path
import re
from .common import InfoExtractor
+from ..compat import (
+ compat_urllib_parse_urlencode,
+ compat_urlparse,
+)
from ..utils import (
- parse_duration,
int_or_none,
+ OnDemandPagedList,
+ parse_duration,
+ remove_start,
xpath_text,
xpath_attr,
)
class NBAIE(InfoExtractor):
- _VALID_URL = r'https?://(?:watch\.|www\.)?nba\.com/(?P<path>(?:[^/]+/)?video/(?P<id>[^?]*?))/?(?:/index\.html)?(?:\?.*)?$'
+ _VALID_URL = r'https?://(?:watch\.|www\.)?nba\.com/(?P<path>(?:[^/]+/)+(?P<id>[^?]*?))/?(?:/index\.html)?(?:\?.*)?$'
_TESTS = [{
'url': 'http://www.nba.com/video/games/nets/2012/12/04/0021200253-okc-bkn-recap.nba/index.html',
'md5': '9e7729d3010a9c71506fd1248f74e4f4',
@@ -44,14 +52,101 @@ class NBAIE(InfoExtractor):
'timestamp': 1432134543,
'upload_date': '20150520',
}
+ }, {
+ 'url': 'http://www.nba.com/clippers/news/doc-rivers-were-not-trading-blake',
+ 'info_dict': {
+ 'id': '1455672027478-Doc_Feb16_720',
+ 'ext': 'mp4',
+ 'title': 'Practice: Doc Rivers - 2/16/16',
+ 'description': 'Head Coach Doc Rivers addresses the media following practice.',
+ 'upload_date': '20160217',
+ 'timestamp': 1455672000,
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'http://www.nba.com/timberwolves/wiggins-shootaround#',
+ 'info_dict': {
+ 'id': 'timberwolves',
+ 'title': 'Shootaround Access - Dec. 12 | Andrew Wiggins',
+ },
+ 'playlist_count': 30,
+ 'params': {
+ # Download the whole playlist takes too long time
+ 'playlist_items': '1-30',
+ },
+ }, {
+ 'url': 'http://www.nba.com/timberwolves/wiggins-shootaround#',
+ 'info_dict': {
+ 'id': 'Wigginsmp4',
+ 'ext': 'mp4',
+ 'title': 'Shootaround Access - Dec. 12 | Andrew Wiggins',
+ 'description': 'Wolves rookie Andrew Wiggins addresses the media after Friday\'s shootaround.',
+ 'upload_date': '20141212',
+ 'timestamp': 1418418600,
+ },
+ 'params': {
+ 'noplaylist': True,
+ # m3u8 download
+ 'skip_download': True,
+ },
}]
+ _PAGE_SIZE = 30
+
+ def _fetch_page(self, team, video_id, page):
+ search_url = 'http://searchapp2.nba.com/nba-search/query.jsp?' + compat_urllib_parse_urlencode({
+ 'type': 'teamvideo',
+ 'start': page * self._PAGE_SIZE + 1,
+ 'npp': (page + 1) * self._PAGE_SIZE + 1,
+ 'sort': 'recent',
+ 'output': 'json',
+ 'site': team,
+ })
+ results = self._download_json(
+ search_url, video_id, note='Download page %d of playlist data' % page)['results'][0]
+ for item in results:
+ yield self.url_result(compat_urlparse.urljoin('http://www.nba.com/', item['url']))
+
+ def _extract_playlist(self, orig_path, video_id, webpage):
+ team = orig_path.split('/')[0]
+
+ if self._downloader.params.get('noplaylist'):
+ self.to_screen('Downloading just video because of --no-playlist')
+ video_path = self._search_regex(
+ r'nbaVideoCore\.firstVideo\s*=\s*\'([^\']+)\';', webpage, 'video path')
+ video_url = 'http://www.nba.com/%s/video/%s' % (team, video_path)
+ return self.url_result(video_url)
+
+ self.to_screen('Downloading playlist - add --no-playlist to just download video')
+ playlist_title = self._og_search_title(webpage, fatal=False)
+ entries = OnDemandPagedList(
+ functools.partial(self._fetch_page, team, video_id),
+ self._PAGE_SIZE, use_cache=True)
+
+ return self.playlist_result(entries, team, playlist_title)
+
def _real_extract(self, url):
path, video_id = re.match(self._VALID_URL, url).groups()
+ orig_path = path
if path.startswith('nba/'):
path = path[3:]
+
+ if 'video/' not in path:
+ webpage = self._download_webpage(url, video_id)
+ path = remove_start(self._search_regex(r'data-videoid="([^"]+)"', webpage, 'video id'), '/')
+
+ if path == '{{id}}':
+ return self._extract_playlist(orig_path, video_id, webpage)
+
+ # See prepareContentId() of pkgCvp.js
+ if path.startswith('video/teams'):
+ path = 'video/channels/proxy/' + path[6:]
+
video_info = self._download_xml('http://www.nba.com/%s.xml' % path, video_id)
- video_id = xpath_text(video_info, 'slug')
+ video_id = os.path.splitext(xpath_text(video_info, 'slug'))[0]
title = xpath_text(video_info, 'headline')
description = xpath_text(video_info, 'description')
duration = parse_duration(xpath_text(video_info, 'length'))
diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py
index 2202cfa..f694e21 100644
--- a/youtube_dl/extractor/nbc.py
+++ b/youtube_dl/extractor/nbc.py
@@ -3,9 +3,8 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..compat import compat_HTTPError
+from .theplatform import ThePlatformIE
from ..utils import (
- ExtractorError,
find_xpath_attr,
lowercase_escape,
smuggle_url,
@@ -24,6 +23,9 @@ class NBCIE(InfoExtractor):
'ext': 'mp4',
'title': 'Jimmy Fallon Surprises Fans at Ben & Jerry\'s',
'description': 'Jimmy gives out free scoops of his new "Tonight Dough" ice cream flavor by surprising customers at the Ben & Jerry\'s scoop shop.',
+ 'timestamp': 1424246400,
+ 'upload_date': '20150218',
+ 'uploader': 'NBCU-COM',
},
'params': {
# m3u8 download
@@ -47,6 +49,9 @@ class NBCIE(InfoExtractor):
'ext': 'mp4',
'title': 'Star Wars Teaser',
'description': 'md5:0b40f9cbde5b671a7ff62fceccc4f442',
+ 'timestamp': 1417852800,
+ 'upload_date': '20141206',
+ 'uploader': 'NBCU-COM',
},
'params': {
# m3u8 download
@@ -58,6 +63,23 @@ class NBCIE(InfoExtractor):
# This video has expired but with an escaped embedURL
'url': 'http://www.nbc.com/parenthood/episode-guide/season-5/just-like-at-home/515',
'only_matching': True,
+ },
+ {
+ # HLS streams requires the 'hdnea3' cookie
+ 'url': 'http://www.nbc.com/Kings/video/goliath/n1806',
+ 'info_dict': {
+ 'id': 'n1806',
+ 'ext': 'mp4',
+ 'title': 'Goliath',
+ 'description': 'When an unknown soldier saves the life of the King\'s son in battle, he\'s thrust into the limelight and politics of the kingdom.',
+ 'timestamp': 1237100400,
+ 'upload_date': '20090315',
+ 'uploader': 'NBCU-COM',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ 'skip': 'Only works from US',
}
]
@@ -75,6 +97,7 @@ class NBCIE(InfoExtractor):
theplatform_url = 'http:' + theplatform_url
return {
'_type': 'url_transparent',
+ 'ie_key': 'ThePlatform',
'url': smuggle_url(theplatform_url, {'source_url': url}),
'id': video_id,
}
@@ -90,6 +113,9 @@ class NBCSportsVPlayerIE(InfoExtractor):
'ext': 'flv',
'description': 'md5:df390f70a9ba7c95ff1daace988f0d8d',
'title': 'Tyler Kalinoski hits buzzer-beater to lift Davidson',
+ 'timestamp': 1426270238,
+ 'upload_date': '20150313',
+ 'uploader': 'NBCU-SPORTS',
}
}, {
'url': 'http://vplayer.nbcsports.com/p/BxmELC/nbc_embedshare/select/_hqLjQ95yx8Z',
@@ -112,7 +138,7 @@ class NBCSportsVPlayerIE(InfoExtractor):
class NBCSportsIE(InfoExtractor):
# Does not include https because its certificate is invalid
- _VALID_URL = r'http://www\.nbcsports\.com//?(?:[^/]+/)+(?P<id>[0-9a-z-]+)'
+ _VALID_URL = r'https?://www\.nbcsports\.com//?(?:[^/]+/)+(?P<id>[0-9a-z-]+)'
_TEST = {
'url': 'http://www.nbcsports.com//college-basketball/ncaab/tom-izzo-michigan-st-has-so-much-respect-duke',
@@ -121,6 +147,9 @@ class NBCSportsIE(InfoExtractor):
'ext': 'flv',
'title': 'Tom Izzo, Michigan St. has \'so much respect\' for Duke',
'description': 'md5:ecb459c9d59e0766ac9c7d5d0eda8113',
+ 'uploader': 'NBCU-SPORTS',
+ 'upload_date': '20150330',
+ 'timestamp': 1427726529,
}
}
@@ -131,10 +160,37 @@ class NBCSportsIE(InfoExtractor):
NBCSportsVPlayerIE._extract_url(webpage), 'NBCSportsVPlayer')
-class NBCNewsIE(InfoExtractor):
- _VALID_URL = r'''(?x)https?://(?:www\.)?nbcnews\.com/
+class CSNNEIE(InfoExtractor):
+ _VALID_URL = r'https?://www\.csnne\.com/video/(?P<id>[0-9a-z-]+)'
+
+ _TEST = {
+ 'url': 'http://www.csnne.com/video/snc-evening-update-wright-named-red-sox-no-5-starter',
+ 'info_dict': {
+ 'id': 'yvBLLUgQ8WU0',
+ 'ext': 'mp4',
+ 'title': 'SNC evening update: Wright named Red Sox\' No. 5 starter.',
+ 'description': 'md5:1753cfee40d9352b19b4c9b3e589b9e3',
+ 'timestamp': 1459369979,
+ 'upload_date': '20160330',
+ 'uploader': 'NBCU-SPORTS',
+ }
+ }
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
+ return {
+ '_type': 'url_transparent',
+ 'ie_key': 'ThePlatform',
+ 'url': self._html_search_meta('twitter:player:stream', webpage),
+ 'display_id': display_id,
+ }
+
+
+class NBCNewsIE(ThePlatformIE):
+ _VALID_URL = r'''(?x)https?://(?:www\.)?(?:nbcnews|today|msnbc)\.com/
(?:video/.+?/(?P<id>\d+)|
- (?:watch|feature|nightly-news)/[^/]+/(?P<title>.+))
+ ([^/]+/)*(?:.*-)?(?P<mpx_id>[^/?]+))
'''
_TESTS = [
@@ -149,40 +205,92 @@ class NBCNewsIE(InfoExtractor):
},
},
{
- 'url': 'http://www.nbcnews.com/feature/edward-snowden-interview/how-twitter-reacted-snowden-interview-n117236',
- 'md5': 'b2421750c9f260783721d898f4c42063',
+ 'url': 'http://www.nbcnews.com/watch/nbcnews-com/how-twitter-reacted-to-the-snowden-interview-269389891880',
+ 'md5': 'af1adfa51312291a017720403826bb64',
'info_dict': {
- 'id': 'I1wpAI_zmhsQ',
+ 'id': '269389891880',
'ext': 'mp4',
'title': 'How Twitter Reacted To The Snowden Interview',
'description': 'md5:65a0bd5d76fe114f3c2727aa3a81fe64',
+ 'uploader': 'NBCU-NEWS',
+ 'timestamp': 1401363060,
+ 'upload_date': '20140529',
},
- 'add_ie': ['ThePlatform'],
},
{
'url': 'http://www.nbcnews.com/feature/dateline-full-episodes/full-episode-family-business-n285156',
'md5': 'fdbf39ab73a72df5896b6234ff98518a',
'info_dict': {
- 'id': 'Wjf9EDR3A_60',
+ 'id': '529953347624',
'ext': 'mp4',
'title': 'FULL EPISODE: Family Business',
'description': 'md5:757988edbaae9d7be1d585eb5d55cc04',
},
+ 'skip': 'This page is unavailable.',
},
{
'url': 'http://www.nbcnews.com/nightly-news/video/nightly-news-with-brian-williams-full-broadcast-february-4-394064451844',
- 'md5': 'b5dda8cddd8650baa0dcb616dd2cf60d',
+ 'md5': '73135a2e0ef819107bbb55a5a9b2a802',
'info_dict': {
- 'id': 'sekXqyTVnmN3',
+ 'id': '394064451844',
'ext': 'mp4',
'title': 'Nightly News with Brian Williams Full Broadcast (February 4)',
'description': 'md5:1c10c1eccbe84a26e5debb4381e2d3c5',
+ 'timestamp': 1423104900,
+ 'uploader': 'NBCU-NEWS',
+ 'upload_date': '20150205',
+ },
+ },
+ {
+ 'url': 'http://www.nbcnews.com/business/autos/volkswagen-11-million-vehicles-could-have-suspect-software-emissions-scandal-n431456',
+ 'md5': 'a49e173825e5fcd15c13fc297fced39d',
+ 'info_dict': {
+ 'id': '529953347624',
+ 'ext': 'mp4',
+ 'title': 'Volkswagen U.S. Chief:\xa0 We Have Totally Screwed Up',
+ 'description': 'md5:c8be487b2d80ff0594c005add88d8351',
+ 'upload_date': '20150922',
+ 'timestamp': 1442917800,
+ 'uploader': 'NBCU-NEWS',
+ },
+ },
+ {
+ 'url': 'http://www.today.com/video/see-the-aurora-borealis-from-space-in-stunning-new-nasa-video-669831235788',
+ 'md5': '118d7ca3f0bea6534f119c68ef539f71',
+ 'info_dict': {
+ 'id': '669831235788',
+ 'ext': 'mp4',
+ 'title': 'See the aurora borealis from space in stunning new NASA video',
+ 'description': 'md5:74752b7358afb99939c5f8bb2d1d04b1',
+ 'upload_date': '20160420',
+ 'timestamp': 1461152093,
+ 'uploader': 'NBCU-NEWS',
+ },
+ },
+ {
+ 'url': 'http://www.msnbc.com/all-in-with-chris-hayes/watch/the-chaotic-gop-immigration-vote-314487875924',
+ 'md5': '6d236bf4f3dddc226633ce6e2c3f814d',
+ 'info_dict': {
+ 'id': '314487875924',
+ 'ext': 'mp4',
+ 'title': 'The chaotic GOP immigration vote',
+ 'description': 'The Republican House votes on a border bill that has no chance of getting through the Senate or signed by the President and is drawing criticism from all sides.',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'timestamp': 1406937606,
+ 'upload_date': '20140802',
+ 'uploader': 'NBCU-NEWS',
+ 'categories': ['MSNBC/Topics/Franchise/Best of last night', 'MSNBC/Topics/General/Congress'],
},
},
{
'url': 'http://www.nbcnews.com/watch/dateline/full-episode--deadly-betrayal-386250819952',
'only_matching': True,
},
+ {
+ # From http://www.vulture.com/2016/06/letterman-couldnt-care-less-about-late-night.html
+ 'url': 'http://www.nbcnews.com/widget/video-embed/701714499682',
+ 'only_matching': True,
+ },
]
def _real_extract(self, url):
@@ -202,72 +310,28 @@ class NBCNewsIE(InfoExtractor):
}
else:
# "feature" and "nightly-news" pages use theplatform.com
- title = mobj.group('title')
- webpage = self._download_webpage(url, title)
- bootstrap_json = self._search_regex(
- r'var\s+(?:bootstrapJson|playlistData)\s*=\s*({.+});?\s*$',
- webpage, 'bootstrap json', flags=re.MULTILINE)
- bootstrap = self._parse_json(bootstrap_json, video_id)
- info = bootstrap['results'][0]['video']
- mpxid = info['mpxId']
-
- base_urls = [
- info['fallbackPlaylistUrl'],
- info['associatedPlaylistUrl'],
- ]
-
- for base_url in base_urls:
- if not base_url:
- continue
- playlist_url = base_url + '?form=MPXNBCNewsAPI'
-
- try:
- all_videos = self._download_json(playlist_url, title)
- except ExtractorError as ee:
- if isinstance(ee.cause, compat_HTTPError):
- continue
- raise
-
- if not all_videos or 'videos' not in all_videos:
- continue
-
- try:
- info = next(v for v in all_videos['videos'] if v['mpxId'] == mpxid)
- break
- except StopIteration:
- continue
-
- if info is None:
- raise ExtractorError('Could not find video in playlists')
+ video_id = mobj.group('mpx_id')
+ if not video_id.isdigit():
+ webpage = self._download_webpage(url, video_id)
+ info = None
+ bootstrap_json = self._search_regex(
+ [r'(?m)(?:var\s+(?:bootstrapJson|playlistData)|NEWS\.videoObj)\s*=\s*({.+});?\s*$',
+ r'videoObj\s*:\s*({.+})', r'data-video="([^"]+)"'],
+ webpage, 'bootstrap json', default=None)
+ bootstrap = self._parse_json(
+ bootstrap_json, video_id, transform_source=unescapeHTML)
+ if 'results' in bootstrap:
+ info = bootstrap['results'][0]['video']
+ elif 'video' in bootstrap:
+ info = bootstrap['video']
+ else:
+ info = bootstrap
+ video_id = info['mpxId']
return {
- '_type': 'url',
- # We get the best quality video
- 'url': info['videoAssets'][-1]['publicUrl'],
- 'ie_key': 'ThePlatform',
+ '_type': 'url_transparent',
+ 'id': video_id,
+ # http://feed.theplatform.com/f/2E2eJC/nbcnews also works
+ 'url': 'http://feed.theplatform.com/f/2E2eJC/nnd_NBCNews?byId=%s' % video_id,
+ 'ie_key': 'ThePlatformFeed',
}
-
-
-class MSNBCIE(InfoExtractor):
- # https URLs redirect to corresponding http ones
- _VALID_URL = r'http://www\.msnbc\.com/[^/]+/watch/(?P<id>[^/]+)'
- _TEST = {
- 'url': 'http://www.msnbc.com/all-in-with-chris-hayes/watch/the-chaotic-gop-immigration-vote-314487875924',
- 'md5': '6d236bf4f3dddc226633ce6e2c3f814d',
- 'info_dict': {
- 'id': 'n_hayes_Aimm_140801_272214',
- 'ext': 'mp4',
- 'title': 'The chaotic GOP immigration vote',
- 'description': 'The Republican House votes on a border bill that has no chance of getting through the Senate or signed by the President and is drawing criticism from all sides.',
- 'thumbnail': 're:^https?://.*\.jpg$',
- 'timestamp': 1406937606,
- 'upload_date': '20140802',
- 'categories': ['MSNBC/Topics/Franchise/Best of last night', 'MSNBC/Topics/General/Congress'],
- },
- }
-
- def _real_extract(self, url):
- video_id = self._match_id(url)
- webpage = self._download_webpage(url, video_id)
- embed_url = self._html_search_meta('embedURL', webpage)
- return self.url_result(embed_url)
diff --git a/youtube_dl/extractor/ndtv.py b/youtube_dl/extractor/ndtv.py
index 2a1ca80..96528f6 100644
--- a/youtube_dl/extractor/ndtv.py
+++ b/youtube_dl/extractor/ndtv.py
@@ -1,19 +1,18 @@
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
from ..utils import (
- month_by_name,
int_or_none,
+ remove_end,
+ unified_strdate,
)
class NDTVIE(InfoExtractor):
- _VALID_URL = r'^https?://(?:www\.)?ndtv\.com/video/player/[^/]*/[^/]*/(?P<id>[a-z0-9]+)'
+ _VALID_URL = r'https?://(?:www\.)?ndtv\.com/video/(?:[^/]+/)+[^/?^&]+-(?P<id>\d+)'
_TEST = {
- 'url': 'http://www.ndtv.com/video/player/news/ndtv-exclusive-don-t-need-character-certificate-from-rahul-gandhi-says-arvind-kejriwal/300710',
+ 'url': 'http://www.ndtv.com/video/news/news/ndtv-exclusive-don-t-need-character-certificate-from-rahul-gandhi-says-arvind-kejriwal-300710',
'md5': '39f992dbe5fb531c395d8bbedb1e5e88',
'info_dict': {
'id': '300710',
@@ -22,7 +21,7 @@ class NDTVIE(InfoExtractor):
'description': 'md5:ab2d4b4a6056c5cb4caa6d729deabf02',
'upload_date': '20131208',
'duration': 1327,
- 'thumbnail': 'http://i.ndtvimg.com/video/images/vod/medium/2013-12/big_300710_1386518307.jpg',
+ 'thumbnail': 're:https?://.*\.jpg',
},
}
@@ -30,36 +29,19 @@ class NDTVIE(InfoExtractor):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
+ title = remove_end(self._og_search_title(webpage), ' - NDTV')
+
filename = self._search_regex(
r"__filename='([^']+)'", webpage, 'video filename')
- video_url = ('http://bitcast-b.bitgravity.com/ndtvod/23372/ndtv/%s' %
- filename)
+ video_url = 'http://bitcast-b.bitgravity.com/ndtvod/23372/ndtv/%s' % filename
duration = int_or_none(self._search_regex(
r"__duration='([^']+)'", webpage, 'duration', fatal=False))
- date_m = re.search(r'''(?x)
- <p\s+class="vod_dateline">\s*
- Published\s+On:\s*
- (?P<monthname>[A-Za-z]+)\s+(?P<day>[0-9]+),\s*(?P<year>[0-9]+)
- ''', webpage)
- upload_date = None
-
- if date_m is not None:
- month = month_by_name(date_m.group('monthname'))
- if month is not None:
- upload_date = '%s%02d%02d' % (
- date_m.group('year'), month, int(date_m.group('day')))
-
- description = self._og_search_description(webpage)
- READ_MORE = ' (Read more)'
- if description.endswith(READ_MORE):
- description = description[:-len(READ_MORE)]
+ upload_date = unified_strdate(self._html_search_meta(
+ 'publish-date', webpage, 'upload date', fatal=False))
- title = self._og_search_title(webpage)
- TITLE_SUFFIX = ' - NDTV'
- if title.endswith(TITLE_SUFFIX):
- title = title[:-len(TITLE_SUFFIX)]
+ description = remove_end(self._og_search_description(webpage), ' (Read more)')
return {
'id': video_id,
diff --git a/youtube_dl/extractor/nerdist.py b/youtube_dl/extractor/nerdist.py
deleted file mode 100644
index c6dc34b..0000000
--- a/youtube_dl/extractor/nerdist.py
+++ /dev/null
@@ -1,80 +0,0 @@
-# encoding: utf-8
-from __future__ import unicode_literals
-
-from .common import InfoExtractor
-
-from ..utils import (
- determine_ext,
- parse_iso8601,
- xpath_text,
-)
-
-
-class NerdistIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?nerdist\.com/vepisode/(?P<id>[^/?#]+)'
- _TEST = {
- 'url': 'http://www.nerdist.com/vepisode/exclusive-which-dc-characters-w',
- 'md5': '3698ed582931b90d9e81e02e26e89f23',
- 'info_dict': {
- 'display_id': 'exclusive-which-dc-characters-w',
- 'id': 'RPHpvJyr',
- 'ext': 'mp4',
- 'title': 'Your TEEN TITANS Revealed! Who\'s on the show?',
- 'thumbnail': 're:^https?://.*/thumbs/.*\.jpg$',
- 'description': 'Exclusive: Find out which DC Comics superheroes will star in TEEN TITANS Live-Action TV Show on Nerdist News with Jessica Chobot!',
- 'uploader': 'Eric Diaz',
- 'upload_date': '20150202',
- 'timestamp': 1422892808,
- }
- }
-
- def _real_extract(self, url):
- display_id = self._match_id(url)
- webpage = self._download_webpage(url, display_id)
-
- video_id = self._search_regex(
- r'''(?x)<script\s+(?:type="text/javascript"\s+)?
- src="https?://content\.nerdist\.com/players/([a-zA-Z0-9_]+)-''',
- webpage, 'video ID')
- timestamp = parse_iso8601(self._html_search_meta(
- 'shareaholic:article_published_time', webpage, 'upload date'))
- uploader = self._html_search_meta(
- 'shareaholic:article_author_name', webpage, 'article author')
-
- doc = self._download_xml(
- 'http://content.nerdist.com/jw6/%s.xml' % video_id, video_id)
- video_info = doc.find('.//item')
- title = xpath_text(video_info, './title', fatal=True)
- description = xpath_text(video_info, './description')
- thumbnail = xpath_text(
- video_info, './{http://rss.jwpcdn.com/}image', 'thumbnail')
-
- formats = []
- for source in video_info.findall('./{http://rss.jwpcdn.com/}source'):
- vurl = source.attrib['file']
- ext = determine_ext(vurl)
- if ext == 'm3u8':
- formats.extend(self._extract_m3u8_formats(
- vurl, video_id, entry_protocol='m3u8_native', ext='mp4',
- preference=0))
- elif ext == 'smil':
- formats.extend(self._extract_smil_formats(
- vurl, video_id, fatal=False
- ))
- else:
- formats.append({
- 'format_id': ext,
- 'url': vurl,
- })
- self._sort_formats(formats)
-
- return {
- 'id': video_id,
- 'display_id': display_id,
- 'title': title,
- 'description': description,
- 'thumbnail': thumbnail,
- 'timestamp': timestamp,
- 'formats': formats,
- 'uploader': uploader,
- }
diff --git a/youtube_dl/extractor/neteasemusic.py b/youtube_dl/extractor/neteasemusic.py
index 7830616..978a058 100644
--- a/youtube_dl/extractor/neteasemusic.py
+++ b/youtube_dl/extractor/neteasemusic.py
@@ -8,7 +8,7 @@ import re
from .common import InfoExtractor
from ..compat import (
- compat_urllib_parse,
+ compat_urllib_parse_urlencode,
compat_str,
compat_itertools_count,
)
@@ -89,6 +89,7 @@ class NetEaseMusicIE(NetEaseMusicBaseIE):
'timestamp': 1431878400,
'description': 'md5:a10a54589c2860300d02e1de821eb2ef',
},
+ 'skip': 'Blocked outside Mainland China',
}, {
'note': 'No lyrics translation.',
'url': 'http://music.163.com/#/song?id=29822014',
@@ -101,6 +102,7 @@ class NetEaseMusicIE(NetEaseMusicBaseIE):
'timestamp': 1419523200,
'description': 'md5:a4d8d89f44656af206b7b2555c0bce6c',
},
+ 'skip': 'Blocked outside Mainland China',
}, {
'note': 'No lyrics.',
'url': 'http://music.163.com/song?id=17241424',
@@ -112,6 +114,7 @@ class NetEaseMusicIE(NetEaseMusicBaseIE):
'upload_date': '20080211',
'timestamp': 1202745600,
},
+ 'skip': 'Blocked outside Mainland China',
}, {
'note': 'Has translated name.',
'url': 'http://music.163.com/#/song?id=22735043',
@@ -124,7 +127,8 @@ class NetEaseMusicIE(NetEaseMusicBaseIE):
'upload_date': '20100127',
'timestamp': 1264608000,
'alt_title': '说出愿望吧(Genie)',
- }
+ },
+ 'skip': 'Blocked outside Mainland China',
}]
def _process_lyrics(self, lyrics_info):
@@ -153,7 +157,7 @@ class NetEaseMusicIE(NetEaseMusicBaseIE):
'ids': '[%s]' % song_id
}
info = self.query_api(
- 'song/detail?' + compat_urllib_parse.urlencode(params),
+ 'song/detail?' + compat_urllib_parse_urlencode(params),
song_id, 'Downloading song info')['songs'][0]
formats = self.extract_formats(info)
@@ -192,6 +196,7 @@ class NetEaseMusicAlbumIE(NetEaseMusicBaseIE):
'title': 'B\'day',
},
'playlist_count': 23,
+ 'skip': 'Blocked outside Mainland China',
}
def _real_extract(self, url):
@@ -223,6 +228,7 @@ class NetEaseMusicSingerIE(NetEaseMusicBaseIE):
'title': '张惠妹 - aMEI;阿密特',
},
'playlist_count': 50,
+ 'skip': 'Blocked outside Mainland China',
}, {
'note': 'Singer has translated name.',
'url': 'http://music.163.com/#/artist?id=124098',
@@ -231,6 +237,7 @@ class NetEaseMusicSingerIE(NetEaseMusicBaseIE):
'title': '李昇基 - 이승기',
},
'playlist_count': 50,
+ 'skip': 'Blocked outside Mainland China',
}]
def _real_extract(self, url):
@@ -266,6 +273,7 @@ class NetEaseMusicListIE(NetEaseMusicBaseIE):
'description': 'md5:12fd0819cab2965b9583ace0f8b7b022'
},
'playlist_count': 99,
+ 'skip': 'Blocked outside Mainland China',
}, {
'note': 'Toplist/Charts sample',
'url': 'http://music.163.com/#/discover/toplist?id=3733003',
@@ -275,6 +283,7 @@ class NetEaseMusicListIE(NetEaseMusicBaseIE):
'description': 'md5:73ec782a612711cadc7872d9c1e134fc',
},
'playlist_count': 50,
+ 'skip': 'Blocked outside Mainland China',
}]
def _real_extract(self, url):
@@ -314,6 +323,7 @@ class NetEaseMusicMvIE(NetEaseMusicBaseIE):
'creator': '白雅言',
'upload_date': '20150520',
},
+ 'skip': 'Blocked outside Mainland China',
}
def _real_extract(self, url):
@@ -357,6 +367,7 @@ class NetEaseMusicProgramIE(NetEaseMusicBaseIE):
'upload_date': '20150613',
'duration': 900,
},
+ 'skip': 'Blocked outside Mainland China',
}, {
'note': 'This program has accompanying songs.',
'url': 'http://music.163.com/#/program?id=10141022',
@@ -366,6 +377,7 @@ class NetEaseMusicProgramIE(NetEaseMusicBaseIE):
'description': 'md5:8d594db46cc3e6509107ede70a4aaa3b',
},
'playlist_count': 4,
+ 'skip': 'Blocked outside Mainland China',
}, {
'note': 'This program has accompanying songs.',
'url': 'http://music.163.com/#/program?id=10141022',
@@ -379,7 +391,8 @@ class NetEaseMusicProgramIE(NetEaseMusicBaseIE):
},
'params': {
'noplaylist': True
- }
+ },
+ 'skip': 'Blocked outside Mainland China',
}]
def _real_extract(self, url):
@@ -438,6 +451,7 @@ class NetEaseMusicDjRadioIE(NetEaseMusicBaseIE):
'description': 'md5:766220985cbd16fdd552f64c578a6b15'
},
'playlist_mincount': 40,
+ 'skip': 'Blocked outside Mainland China',
}
_PAGE_SIZE = 1000
diff --git a/youtube_dl/extractor/newgrounds.py b/youtube_dl/extractor/newgrounds.py
index cd117b0..7059403 100644
--- a/youtube_dl/extractor/newgrounds.py
+++ b/youtube_dl/extractor/newgrounds.py
@@ -7,8 +7,8 @@ from .common import InfoExtractor
class NewgroundsIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?newgrounds\.com/audio/listen/(?P<id>[0-9]+)'
- _TEST = {
+ _VALID_URL = r'https?://(?:www\.)?newgrounds\.com/(?:audio/listen|portal/view)/(?P<id>[0-9]+)'
+ _TESTS = [{
'url': 'http://www.newgrounds.com/audio/listen/549479',
'md5': 'fe6033d297591288fa1c1f780386f07a',
'info_dict': {
@@ -17,7 +17,16 @@ class NewgroundsIE(InfoExtractor):
'title': 'B7 - BusMode',
'uploader': 'Burn7',
}
- }
+ }, {
+ 'url': 'http://www.newgrounds.com/portal/view/673111',
+ 'md5': '3394735822aab2478c31b1004fe5e5bc',
+ 'info_dict': {
+ 'id': '673111',
+ 'ext': 'mp4',
+ 'title': 'Dancin',
+ 'uploader': 'Squirrelman82',
+ },
+ }]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
@@ -25,9 +34,11 @@ class NewgroundsIE(InfoExtractor):
webpage = self._download_webpage(url, music_id)
title = self._html_search_regex(
- r',"name":"([^"]+)",', webpage, 'music title')
+ r'<title>([^>]+)</title>', webpage, 'title')
+
uploader = self._html_search_regex(
- r',"artist":"([^"]+)",', webpage, 'music uploader')
+ [r',"artist":"([^"]+)",', r'[\'"]owner[\'"]\s*:\s*[\'"]([^\'"]+)[\'"],'],
+ webpage, 'uploader')
music_url_json_string = self._html_search_regex(
r'({"url":"[^"]+"),', webpage, 'music url') + '}'
diff --git a/youtube_dl/extractor/newstube.py b/youtube_dl/extractor/newstube.py
index 5a9e73c..0092b85 100644
--- a/youtube_dl/extractor/newstube.py
+++ b/youtube_dl/extractor/newstube.py
@@ -4,24 +4,24 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import ExtractorError
+from ..utils import (
+ ExtractorError,
+ int_or_none,
+)
class NewstubeIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?newstube\.ru/media/(?P<id>.+)'
_TEST = {
'url': 'http://www.newstube.ru/media/telekanal-cnn-peremestil-gorod-slavyansk-v-krym',
+ 'md5': '801eef0c2a9f4089fa04e4fe3533abdc',
'info_dict': {
'id': '728e0ef2-e187-4012-bac0-5a081fdcb1f6',
- 'ext': 'flv',
+ 'ext': 'mp4',
'title': 'Телеканал CNN переместил город Славянск в Крым',
'description': 'md5:419a8c9f03442bc0b0a794d689360335',
'duration': 31.05,
},
- 'params': {
- # rtmp download
- 'skip_download': True,
- },
}
def _real_extract(self, url):
@@ -62,7 +62,6 @@ class NewstubeIE(InfoExtractor):
server = media_location.find(ns('./Server')).text
app = media_location.find(ns('./App')).text
media_id = stream_info.find(ns('./Id')).text
- quality_id = stream_info.find(ns('./QualityId')).text
name = stream_info.find(ns('./Name')).text
width = int(stream_info.find(ns('./Width')).text)
height = int(stream_info.find(ns('./Height')).text)
@@ -74,12 +73,38 @@ class NewstubeIE(InfoExtractor):
'rtmp_conn': ['S:%s' % session_id, 'S:%s' % media_id, 'S:n2'],
'page_url': url,
'ext': 'flv',
- 'format_id': quality_id,
- 'format_note': name,
+ 'format_id': 'rtmp' + ('-%s' % name if name else ''),
'width': width,
'height': height,
})
+ sources_data = self._download_json(
+ 'http://www.newstube.ru/player2/getsources?guid=%s' % video_guid,
+ video_guid, fatal=False)
+ if sources_data:
+ for source in sources_data.get('Sources', []):
+ source_url = source.get('Src')
+ if not source_url:
+ continue
+ height = int_or_none(source.get('Height'))
+ f = {
+ 'format_id': 'http' + ('-%dp' % height if height else ''),
+ 'url': source_url,
+ 'width': int_or_none(source.get('Width')),
+ 'height': height,
+ }
+ source_type = source.get('Type')
+ if source_type:
+ mobj = re.search(r'codecs="([^,]+),\s*([^"]+)"', source_type)
+ if mobj:
+ vcodec, acodec = mobj.groups()
+ f.update({
+ 'vcodec': vcodec,
+ 'acodec': acodec,
+ })
+ formats.append(f)
+
+ self._check_formats(formats, video_guid)
self._sort_formats(formats)
return {
diff --git a/youtube_dl/extractor/nextmedia.py b/youtube_dl/extractor/nextmedia.py
index d168845..aae7aee 100644
--- a/youtube_dl/extractor/nextmedia.py
+++ b/youtube_dl/extractor/nextmedia.py
@@ -7,7 +7,7 @@ from ..utils import parse_iso8601
class NextMediaIE(InfoExtractor):
IE_DESC = '蘋果日報'
- _VALID_URL = r'http://hk.apple.nextmedia.com/[^/]+/[^/]+/(?P<date>\d+)/(?P<id>\d+)'
+ _VALID_URL = r'https?://hk.apple.nextmedia.com/[^/]+/[^/]+/(?P<date>\d+)/(?P<id>\d+)'
_TESTS = [{
'url': 'http://hk.apple.nextmedia.com/realtime/news/20141108/53109199',
'md5': 'dff9fad7009311c421176d1ac90bfe4f',
@@ -68,7 +68,7 @@ class NextMediaIE(InfoExtractor):
class NextMediaActionNewsIE(NextMediaIE):
IE_DESC = '蘋果日報 - 動新聞'
- _VALID_URL = r'http://hk.dv.nextmedia.com/actionnews/[^/]+/(?P<date>\d+)/(?P<id>\d+)/\d+'
+ _VALID_URL = r'https?://hk.dv.nextmedia.com/actionnews/[^/]+/(?P<date>\d+)/(?P<id>\d+)/\d+'
_TESTS = [{
'url': 'http://hk.dv.nextmedia.com/actionnews/hit/20150121/19009428/20061460',
'md5': '05fce8ffeed7a5e00665d4b7cf0f9201',
@@ -93,7 +93,7 @@ class NextMediaActionNewsIE(NextMediaIE):
class AppleDailyIE(NextMediaIE):
IE_DESC = '臺灣蘋果日報'
- _VALID_URL = r'http://(www|ent).appledaily.com.tw/(?:animation|appledaily|enews|realtimenews)/[^/]+/[^/]+/(?P<date>\d+)/(?P<id>\d+)(/.*)?'
+ _VALID_URL = r'https?://(www|ent).appledaily.com.tw/(?:animation|appledaily|enews|realtimenews)/[^/]+/[^/]+/(?P<date>\d+)/(?P<id>\d+)(/.*)?'
_TESTS = [{
'url': 'http://ent.appledaily.com.tw/enews/article/entertainment/20150128/36354694',
'md5': 'a843ab23d150977cc55ef94f1e2c1e4d',
diff --git a/youtube_dl/extractor/nextmovie.py b/youtube_dl/extractor/nextmovie.py
index 657ae77..9ccd7d7 100644
--- a/youtube_dl/extractor/nextmovie.py
+++ b/youtube_dl/extractor/nextmovie.py
@@ -2,7 +2,7 @@
from __future__ import unicode_literals
from .mtv import MTVServicesInfoExtractor
-from ..compat import compat_urllib_parse
+from ..compat import compat_urllib_parse_urlencode
class NextMovieIE(MTVServicesInfoExtractor):
@@ -20,7 +20,7 @@ class NextMovieIE(MTVServicesInfoExtractor):
}]
def _get_feed_query(self, uri):
- return compat_urllib_parse.urlencode({
+ return compat_urllib_parse_urlencode({
'feed': '1505',
'mgid': uri,
})
diff --git a/youtube_dl/extractor/nfb.py b/youtube_dl/extractor/nfb.py
index 5bd15f7..adcc636 100644
--- a/youtube_dl/extractor/nfb.py
+++ b/youtube_dl/extractor/nfb.py
@@ -1,8 +1,14 @@
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..compat import compat_urllib_parse
-from ..utils import sanitized_Request
+from ..utils import (
+ clean_html,
+ determine_ext,
+ int_or_none,
+ qualities,
+ urlencode_postdata,
+ xpath_text,
+)
class NFBIE(InfoExtractor):
@@ -14,12 +20,12 @@ class NFBIE(InfoExtractor):
'url': 'https://www.nfb.ca/film/qallunaat_why_white_people_are_funny',
'info_dict': {
'id': 'qallunaat_why_white_people_are_funny',
- 'ext': 'mp4',
+ 'ext': 'flv',
'title': 'Qallunaat! Why White People Are Funny ',
- 'description': 'md5:836d8aff55e087d04d9f6df554d4e038',
+ 'description': 'md5:6b8e32dde3abf91e58857b174916620c',
'duration': 3128,
+ 'creator': 'Mark Sandiford',
'uploader': 'Mark Sandiford',
- 'uploader_id': 'mark-sandiford',
},
'params': {
# rtmp download
@@ -29,65 +35,78 @@ class NFBIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
- page = self._download_webpage(
- 'https://www.nfb.ca/film/%s' % video_id, video_id,
- 'Downloading film page')
- uploader_id = self._html_search_regex(r'<a class="director-link" href="/explore-all-directors/([^/]+)/"',
- page, 'director id', fatal=False)
- uploader = self._html_search_regex(r'<em class="director-name" itemprop="name">([^<]+)</em>',
- page, 'director name', fatal=False)
-
- request = sanitized_Request(
+ config = self._download_xml(
'https://www.nfb.ca/film/%s/player_config' % video_id,
- compat_urllib_parse.urlencode({'getConfig': 'true'}).encode('ascii'))
- request.add_header('Content-Type', 'application/x-www-form-urlencoded')
- request.add_header('X-NFB-Referer', 'http://www.nfb.ca/medias/flash/NFBVideoPlayer.swf')
-
- config = self._download_xml(request, video_id, 'Downloading player config XML')
+ video_id, 'Downloading player config XML',
+ data=urlencode_postdata({'getConfig': 'true'}),
+ headers={
+ 'Content-Type': 'application/x-www-form-urlencoded',
+ 'X-NFB-Referer': 'http://www.nfb.ca/medias/flash/NFBVideoPlayer.swf'
+ })
- title = None
- description = None
- thumbnail = None
- duration = None
- formats = []
-
- def extract_thumbnail(media):
- thumbnails = {}
- for asset in media.findall('assets/asset'):
- thumbnails[asset.get('quality')] = asset.find('default/url').text
- if not thumbnails:
- return None
- if 'high' in thumbnails:
- return thumbnails['high']
- return list(thumbnails.values())[0]
+ title, description, thumbnail, duration, uploader, author = [None] * 6
+ thumbnails, formats = [[]] * 2
+ subtitles = {}
for media in config.findall('./player/stream/media'):
if media.get('type') == 'posterImage':
- thumbnail = extract_thumbnail(media)
+ quality_key = qualities(('low', 'high'))
+ thumbnails = []
+ for asset in media.findall('assets/asset'):
+ asset_url = xpath_text(asset, 'default/url', default=None)
+ if not asset_url:
+ continue
+ quality = asset.get('quality')
+ thumbnails.append({
+ 'url': asset_url,
+ 'id': quality,
+ 'preference': quality_key(quality),
+ })
elif media.get('type') == 'video':
- duration = int(media.get('duration'))
- title = media.find('title').text
- description = media.find('description').text
- # It seems assets always go from lower to better quality, so no need to sort
+ title = xpath_text(media, 'title', fatal=True)
for asset in media.findall('assets/asset'):
- for x in asset:
+ quality = asset.get('quality')
+ height = int_or_none(self._search_regex(
+ r'^(\d+)[pP]$', quality or '', 'height', default=None))
+ for node in asset:
+ streamer = xpath_text(node, 'streamerURI', default=None)
+ if not streamer:
+ continue
+ play_path = xpath_text(node, 'url', default=None)
+ if not play_path:
+ continue
formats.append({
- 'url': x.find('streamerURI').text,
- 'app': x.find('streamerURI').text.split('/', 3)[3],
- 'play_path': x.find('url').text,
+ 'url': streamer,
+ 'app': streamer.split('/', 3)[3],
+ 'play_path': play_path,
'rtmp_live': False,
- 'ext': 'mp4',
- 'format_id': '%s-%s' % (x.tag, asset.get('quality')),
+ 'ext': 'flv',
+ 'format_id': '%s-%s' % (node.tag, quality) if quality else node.tag,
+ 'height': height,
})
+ self._sort_formats(formats)
+ description = clean_html(xpath_text(media, 'description'))
+ uploader = xpath_text(media, 'author')
+ duration = int_or_none(media.get('duration'))
+ for subtitle in media.findall('./subtitles/subtitle'):
+ subtitle_url = xpath_text(subtitle, 'url', default=None)
+ if not subtitle_url:
+ continue
+ lang = xpath_text(subtitle, 'lang', default='en')
+ subtitles.setdefault(lang, []).append({
+ 'url': subtitle_url,
+ 'ext': (subtitle.get('format') or determine_ext(subtitle_url)).lower(),
+ })
return {
'id': video_id,
'title': title,
'description': description,
- 'thumbnail': thumbnail,
+ 'thumbnails': thumbnails,
'duration': duration,
+ 'creator': uploader,
'uploader': uploader,
- 'uploader_id': uploader_id,
'formats': formats,
+ 'subtitles': subtitles,
}
diff --git a/youtube_dl/extractor/nhl.py b/youtube_dl/extractor/nhl.py
index 8d5ce46..b04d211 100644
--- a/youtube_dl/extractor/nhl.py
+++ b/youtube_dl/extractor/nhl.py
@@ -7,11 +7,16 @@ import os
from .common import InfoExtractor
from ..compat import (
compat_urlparse,
- compat_urllib_parse,
- compat_urllib_parse_urlparse
+ compat_urllib_parse_urlencode,
+ compat_urllib_parse_urlparse,
+ compat_str,
)
from ..utils import (
unified_strdate,
+ determine_ext,
+ int_or_none,
+ parse_iso8601,
+ parse_duration,
)
@@ -38,7 +43,7 @@ class NHLBaseInfoExtractor(InfoExtractor):
parsed_url = compat_urllib_parse_urlparse(initial_video_url)
filename, ext = os.path.splitext(parsed_url.path)
path = '%s_sd%s' % (filename, ext)
- data = compat_urllib_parse.urlencode({
+ data = compat_urllib_parse_urlencode({
'type': 'fvod',
'path': compat_urlparse.urlunparse(parsed_url[:2] + (path,) + parsed_url[3:])
})
@@ -70,8 +75,8 @@ class NHLBaseInfoExtractor(InfoExtractor):
return ret
-class NHLIE(NHLBaseInfoExtractor):
- IE_NAME = 'nhl.com'
+class NHLVideocenterIE(NHLBaseInfoExtractor):
+ IE_NAME = 'nhl.com:videocenter'
_VALID_URL = r'https?://video(?P<team>\.[^.]*)?\.nhl\.com/videocenter/(?:console|embed)?(?:\?(?:.*?[?&])?)(?:id|hlg|playlist)=(?P<id>[-0-9a-zA-Z,]+)'
_TESTS = [{
@@ -186,8 +191,8 @@ class NHLNewsIE(NHLBaseInfoExtractor):
return self._real_extract_video(video_id)
-class NHLVideocenterIE(NHLBaseInfoExtractor):
- IE_NAME = 'nhl.com:videocenter'
+class NHLVideocenterCategoryIE(NHLBaseInfoExtractor):
+ IE_NAME = 'nhl.com:videocenter:category'
IE_DESC = 'NHL videocenter category'
_VALID_URL = r'https?://video\.(?P<team>[^.]*)\.nhl\.com/videocenter/(console\?[^(id=)]*catid=(?P<catid>[0-9]+)(?![&?]id=).*?)?$'
_TEST = {
@@ -211,7 +216,7 @@ class NHLVideocenterIE(NHLBaseInfoExtractor):
r'tab0"[^>]*?>(.*?)</td>',
webpage, 'playlist title', flags=re.DOTALL).lower().capitalize()
- data = compat_urllib_parse.urlencode({
+ data = compat_urllib_parse_urlencode({
'cid': cat_id,
# This is the default value
'count': 12,
@@ -236,3 +241,86 @@ class NHLVideocenterIE(NHLBaseInfoExtractor):
'id': cat_id,
'entries': [self._extract_video(v) for v in videos],
}
+
+
+class NHLIE(InfoExtractor):
+ IE_NAME = 'nhl.com'
+ _VALID_URL = r'https?://(?:www\.)?nhl\.com/([^/]+/)*c-(?P<id>\d+)'
+ _TESTS = [{
+ # type=video
+ 'url': 'https://www.nhl.com/video/anisimov-cleans-up-mess/t-277752844/c-43663503',
+ 'md5': '0f7b9a8f986fb4b4eeeece9a56416eaf',
+ 'info_dict': {
+ 'id': '43663503',
+ 'ext': 'mp4',
+ 'title': 'Anisimov cleans up mess',
+ 'description': 'md5:a02354acdfe900e940ce40706939ca63',
+ 'timestamp': 1461288600,
+ 'upload_date': '20160422',
+ },
+ }, {
+ # type=article
+ 'url': 'https://www.nhl.com/news/dennis-wideman-suspended/c-278258934',
+ 'md5': '1f39f4ea74c1394dea110699a25b366c',
+ 'info_dict': {
+ 'id': '40784403',
+ 'ext': 'mp4',
+ 'title': 'Wideman suspended by NHL',
+ 'description': 'Flames defenseman Dennis Wideman was banned 20 games for violation of Rule 40 (Physical Abuse of Officials)',
+ 'upload_date': '20160204',
+ 'timestamp': 1454544904,
+ },
+ }]
+
+ def _real_extract(self, url):
+ tmp_id = self._match_id(url)
+ video_data = self._download_json(
+ 'https://nhl.bamcontent.com/nhl/id/v1/%s/details/web-v1.json' % tmp_id,
+ tmp_id)
+ if video_data.get('type') == 'article':
+ video_data = video_data['media']
+
+ video_id = compat_str(video_data['id'])
+ title = video_data['title']
+
+ formats = []
+ for playback in video_data.get('playbacks', []):
+ playback_url = playback.get('url')
+ if not playback_url:
+ continue
+ ext = determine_ext(playback_url)
+ if ext == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ playback_url, video_id, 'mp4', 'm3u8_native',
+ m3u8_id=playback.get('name', 'hls'), fatal=False))
+ else:
+ height = int_or_none(playback.get('height'))
+ formats.append({
+ 'format_id': playback.get('name', 'http' + ('-%dp' % height if height else '')),
+ 'url': playback_url,
+ 'width': int_or_none(playback.get('width')),
+ 'height': height,
+ })
+ self._sort_formats(formats, ('preference', 'width', 'height', 'tbr', 'format_id'))
+
+ thumbnails = []
+ for thumbnail_id, thumbnail_data in video_data.get('image', {}).get('cuts', {}).items():
+ thumbnail_url = thumbnail_data.get('src')
+ if not thumbnail_url:
+ continue
+ thumbnails.append({
+ 'id': thumbnail_id,
+ 'url': thumbnail_url,
+ 'width': int_or_none(thumbnail_data.get('width')),
+ 'height': int_or_none(thumbnail_data.get('height')),
+ })
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': video_data.get('description'),
+ 'timestamp': parse_iso8601(video_data.get('date')),
+ 'duration': parse_duration(video_data.get('duration')),
+ 'thumbnails': thumbnails,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/nick.py b/youtube_dl/extractor/nick.py
index b62819a..e960137 100644
--- a/youtube_dl/extractor/nick.py
+++ b/youtube_dl/extractor/nick.py
@@ -2,7 +2,8 @@
from __future__ import unicode_literals
from .mtv import MTVServicesInfoExtractor
-from ..compat import compat_urllib_parse
+from ..compat import compat_urllib_parse_urlencode
+from ..utils import update_url_query
class NickIE(MTVServicesInfoExtractor):
@@ -54,10 +55,33 @@ class NickIE(MTVServicesInfoExtractor):
}]
def _get_feed_query(self, uri):
- return compat_urllib_parse.urlencode({
+ return compat_urllib_parse_urlencode({
'feed': 'nick_arc_player_prime',
'mgid': uri,
})
def _extract_mgid(self, webpage):
return self._search_regex(r'data-contenturi="([^"]+)', webpage, 'mgid')
+
+
+class NickDeIE(MTVServicesInfoExtractor):
+ IE_NAME = 'nick.de'
+ _VALID_URL = r'https?://(?:www\.)?nick\.de/(?:playlist|shows)/(?:[^/]+/)*(?P<id>[^/?#&]+)'
+ _TESTS = [{
+ 'url': 'http://www.nick.de/playlist/3773-top-videos/videos/episode/17306-zu-wasser-und-zu-land-rauchende-erdnusse',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.nick.de/shows/342-icarly',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ mrss_url = update_url_query(self._search_regex(
+ r'data-mrss=(["\'])(?P<url>http.+?)\1', webpage, 'mrss url', group='url'),
+ {'siteKey': 'nick.de'})
+
+ return self._get_videos_info_from_url(mrss_url, video_id)
diff --git a/youtube_dl/extractor/niconico.py b/youtube_dl/extractor/niconico.py
index 586e52a..dd75a48 100644
--- a/youtube_dl/extractor/niconico.py
+++ b/youtube_dl/extractor/niconico.py
@@ -7,11 +7,10 @@ import datetime
from .common import InfoExtractor
from ..compat import (
- compat_urllib_parse,
+ compat_urllib_parse_urlencode,
compat_urlparse,
)
from ..utils import (
- encode_dict,
ExtractorError,
int_or_none,
parse_duration,
@@ -19,6 +18,7 @@ from ..utils import (
sanitized_Request,
xpath_text,
determine_ext,
+ urlencode_postdata,
)
@@ -101,7 +101,7 @@ class NiconicoIE(InfoExtractor):
'mail': username,
'password': password,
}
- login_data = compat_urllib_parse.urlencode(encode_dict(login_form_strs)).encode('utf-8')
+ login_data = urlencode_postdata(login_form_strs)
request = sanitized_Request(
'https://secure.nicovideo.jp/secure/login', login_data)
login_results = self._download_webpage(
@@ -141,7 +141,7 @@ class NiconicoIE(InfoExtractor):
r'\'thumbPlayKey\'\s*:\s*\'(.*?)\'', ext_player_info, 'thumbPlayKey')
# Get flv info
- flv_info_data = compat_urllib_parse.urlencode({
+ flv_info_data = compat_urllib_parse_urlencode({
'k': thumb_play_key,
'v': video_id
})
diff --git a/youtube_dl/extractor/noco.py b/youtube_dl/extractor/noco.py
index d440313..06f2bda 100644
--- a/youtube_dl/extractor/noco.py
+++ b/youtube_dl/extractor/noco.py
@@ -8,7 +8,6 @@ import hashlib
from .common import InfoExtractor
from ..compat import (
compat_str,
- compat_urllib_parse,
compat_urlparse,
)
from ..utils import (
@@ -18,11 +17,12 @@ from ..utils import (
float_or_none,
parse_iso8601,
sanitized_Request,
+ urlencode_postdata,
)
class NocoIE(InfoExtractor):
- _VALID_URL = r'http://(?:(?:www\.)?noco\.tv/emission/|player\.noco\.tv/\?idvideo=)(?P<id>\d+)'
+ _VALID_URL = r'https?://(?:(?:www\.)?noco\.tv/emission/|player\.noco\.tv/\?idvideo=)(?P<id>\d+)'
_LOGIN_URL = 'http://noco.tv/do.php'
_API_URL_TEMPLATE = 'https://api.noco.tv/1.1/%s?ts=%s&tk=%s'
_SUB_LANG_TEMPLATE = '&sub_lang=%s'
@@ -75,7 +75,7 @@ class NocoIE(InfoExtractor):
'username': username,
'password': password,
}
- request = sanitized_Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
+ request = sanitized_Request(self._LOGIN_URL, urlencode_postdata(login_form))
request.add_header('Content-Type', 'application/x-www-form-urlencoded; charset=UTF-8')
login = self._download_json(request, None, 'Logging in as %s' % username)
diff --git a/youtube_dl/extractor/normalboots.py b/youtube_dl/extractor/normalboots.py
index 5952d13..af44c3b 100644
--- a/youtube_dl/extractor/normalboots.py
+++ b/youtube_dl/extractor/normalboots.py
@@ -2,6 +2,7 @@
from __future__ import unicode_literals
from .common import InfoExtractor
+from .screenwavemedia import ScreenwaveMediaIE
from ..utils import (
unified_strdate,
@@ -9,10 +10,9 @@ from ..utils import (
class NormalbootsIE(InfoExtractor):
- _VALID_URL = r'http://(?:www\.)?normalboots\.com/video/(?P<id>[0-9a-z-]*)/?$'
+ _VALID_URL = r'https?://(?:www\.)?normalboots\.com/video/(?P<id>[0-9a-z-]*)/?$'
_TEST = {
'url': 'http://normalboots.com/video/home-alone-games-jontron/',
- 'md5': '8bf6de238915dd501105b44ef5f1e0f6',
'info_dict': {
'id': 'home-alone-games-jontron',
'ext': 'mp4',
@@ -22,9 +22,10 @@ class NormalbootsIE(InfoExtractor):
'upload_date': '20140125',
},
'params': {
- # rtmp download
+ # m3u8 download
'skip_download': True,
},
+ 'add_ie': ['ScreenwaveMedia'],
}
def _real_extract(self, url):
@@ -38,16 +39,15 @@ class NormalbootsIE(InfoExtractor):
r'<span style="text-transform:uppercase; font-size:inherit;">[A-Za-z]+, (?P<date>.*)</span>',
webpage, 'date', fatal=False))
- player_url = self._html_search_regex(
- r'<iframe\swidth="[0-9]+"\sheight="[0-9]+"\ssrc="(?P<url>[\S]+)"',
- webpage, 'player url')
- player_page = self._download_webpage(player_url, video_id)
- video_url = self._html_search_regex(
- r"file:\s'(?P<file>[^']+\.mp4)'", player_page, 'file')
+ screenwavemedia_url = self._html_search_regex(
+ ScreenwaveMediaIE.EMBED_PATTERN, webpage, 'screenwave URL',
+ group='url')
return {
+ '_type': 'url_transparent',
'id': video_id,
- 'url': video_url,
+ 'url': screenwavemedia_url,
+ 'ie_key': ScreenwaveMediaIE.ie_key(),
'title': self._og_search_title(webpage),
'description': self._og_search_description(webpage),
'thumbnail': self._og_search_thumbnail(webpage),
diff --git a/youtube_dl/extractor/nova.py b/youtube_dl/extractor/nova.py
index 3f9c776..17671ad 100644
--- a/youtube_dl/extractor/nova.py
+++ b/youtube_dl/extractor/nova.py
@@ -12,7 +12,7 @@ from ..utils import (
class NovaIE(InfoExtractor):
IE_DESC = 'TN.cz, Prásk.tv, Nova.cz, Novaplus.cz, FANDA.tv, Krásná.cz and Doma.cz'
- _VALID_URL = 'http://(?:[^.]+\.)?(?P<site>tv(?:noviny)?|tn|novaplus|vymena|fanda|krasna|doma|prask)\.nova\.cz/(?:[^/]+/)+(?P<id>[^/]+?)(?:\.html|/|$)'
+ _VALID_URL = r'https?://(?:[^.]+\.)?(?P<site>tv(?:noviny)?|tn|novaplus|vymena|fanda|krasna|doma|prask)\.nova\.cz/(?:[^/]+/)+(?P<id>[^/]+?)(?:\.html|/|$)'
_TESTS = [{
'url': 'http://tvnoviny.nova.cz/clanek/novinky/co-na-sebe-sportaci-praskli-vime-jestli-pujde-hrdlicka-na-materskou.html?utm_source=tvnoviny&utm_medium=cpfooter&utm_campaign=novaplus',
'info_dict': {
diff --git a/youtube_dl/extractor/novamov.py b/youtube_dl/extractor/novamov.py
index d68c1ad..3bbd473 100644
--- a/youtube_dl/extractor/novamov.py
+++ b/youtube_dl/extractor/novamov.py
@@ -7,7 +7,6 @@ from ..compat import compat_urlparse
from ..utils import (
ExtractorError,
NO_DEFAULT,
- encode_dict,
sanitized_Request,
urlencode_postdata,
)
@@ -17,7 +16,14 @@ class NovaMovIE(InfoExtractor):
IE_NAME = 'novamov'
IE_DESC = 'NovaMov'
- _VALID_URL_TEMPLATE = r'http://(?:(?:www\.)?%(host)s/(?:file|video|mobile/#/videos)/|(?:(?:embed|www)\.)%(host)s/embed\.php\?(?:.*?&)?v=)(?P<id>[a-z\d]{13})'
+ _VALID_URL_TEMPLATE = r'''(?x)
+ http://
+ (?:
+ (?:www\.)?%(host)s/(?:file|video|mobile/\#/videos)/|
+ (?:(?:embed|www)\.)%(host)s/embed(?:\.php|/)?\?(?:.*?&)?\bv=
+ )
+ (?P<id>[a-z\d]{13})
+ '''
_VALID_URL = _VALID_URL_TEMPLATE % {'host': 'novamov\.com'}
_HOST = 'www.novamov.com'
@@ -28,17 +34,7 @@ class NovaMovIE(InfoExtractor):
_DESCRIPTION_REGEX = r'(?s)<div class="v_tab blockborder rounded5" id="v_tab1">\s*<h3>[^<]+</h3><p>([^<]+)</p>'
_URL_TEMPLATE = 'http://%s/video/%s'
- _TEST = {
- 'url': 'http://www.novamov.com/video/4rurhn9x446jj',
- 'md5': '7205f346a52bbeba427603ba10d4b935',
- 'info_dict': {
- 'id': '4rurhn9x446jj',
- 'ext': 'flv',
- 'title': 'search engine optimization',
- 'description': 'search engine optimization is used to rank the web page in the google search engine'
- },
- 'skip': '"Invalid token" errors abound (in web interface as well as youtube-dl, there is nothing we can do about it.)'
- }
+ _TEST = None
def _check_existence(self, webpage, video_id):
if re.search(self._FILE_DELETED_REGEX, webpage) is not None:
@@ -73,7 +69,7 @@ class NovaMovIE(InfoExtractor):
if not post_url.startswith('http'):
post_url = compat_urlparse.urljoin(url, post_url)
request = sanitized_Request(
- post_url, urlencode_postdata(encode_dict(fields)))
+ post_url, urlencode_postdata(fields))
request.add_header('Content-Type', 'application/x-www-form-urlencoded')
request.add_header('Referer', post_url)
webpage = self._download_webpage(
@@ -82,7 +78,7 @@ class NovaMovIE(InfoExtractor):
filekey = extract_filekey()
- title = self._html_search_regex(self._TITLE_REGEX, webpage, 'title', fatal=False)
+ title = self._html_search_regex(self._TITLE_REGEX, webpage, 'title')
description = self._html_search_regex(self._DESCRIPTION_REGEX, webpage, 'description', default='', fatal=False)
api_response = self._download_webpage(
@@ -188,3 +184,29 @@ class CloudTimeIE(NovaMovIE):
_TITLE_REGEX = r'<div[^>]+class=["\']video_det["\'][^>]*>\s*<strong>([^<]+)</strong>'
_TEST = None
+
+
+class AuroraVidIE(NovaMovIE):
+ IE_NAME = 'auroravid'
+ IE_DESC = 'AuroraVid'
+
+ _VALID_URL = NovaMovIE._VALID_URL_TEMPLATE % {'host': 'auroravid\.to'}
+
+ _HOST = 'www.auroravid.to'
+
+ _FILE_DELETED_REGEX = r'This file no longer exists on our servers!<'
+
+ _TESTS = [{
+ 'url': 'http://www.auroravid.to/video/4rurhn9x446jj',
+ 'md5': '7205f346a52bbeba427603ba10d4b935',
+ 'info_dict': {
+ 'id': '4rurhn9x446jj',
+ 'ext': 'flv',
+ 'title': 'search engine optimization',
+ 'description': 'search engine optimization is used to rank the web page in the google search engine'
+ },
+ 'skip': '"Invalid token" errors abound (in web interface as well as youtube-dl, there is nothing we can do about it.)'
+ }, {
+ 'url': 'http://www.auroravid.to/embed/?v=4rurhn9x446jj',
+ 'only_matching': True,
+ }]
diff --git a/youtube_dl/extractor/nowness.py b/youtube_dl/extractor/nowness.py
index 446f590..74860eb 100644
--- a/youtube_dl/extractor/nowness.py
+++ b/youtube_dl/extractor/nowness.py
@@ -63,8 +63,11 @@ class NownessIE(NownessBaseIE):
'title': 'Candor: The Art of Gesticulation',
'description': 'Candor: The Art of Gesticulation',
'thumbnail': 're:^https?://.*\.jpg',
- 'uploader': 'Nowness',
+ 'timestamp': 1446745676,
+ 'upload_date': '20151105',
+ 'uploader_id': '2385340575001',
},
+ 'add_ie': ['BrightcoveNew'],
}, {
'url': 'https://cn.nowness.com/story/kasper-bjorke-ft-jaakko-eino-kalevi-tnr',
'md5': 'e79cf125e387216f86b2e0a5b5c63aa3',
@@ -74,8 +77,11 @@ class NownessIE(NownessBaseIE):
'title': 'Kasper Bjørke ft. Jaakko Eino Kalevi: TNR',
'description': 'Kasper Bjørke ft. Jaakko Eino Kalevi: TNR',
'thumbnail': 're:^https?://.*\.jpg',
- 'uploader': 'Nowness',
+ 'timestamp': 1407315371,
+ 'upload_date': '20140806',
+ 'uploader_id': '2385340575001',
},
+ 'add_ie': ['BrightcoveNew'],
}, {
# vimeo
'url': 'https://www.nowness.com/series/nowness-picks/jean-luc-godard-supercut',
@@ -90,6 +96,7 @@ class NownessIE(NownessBaseIE):
'uploader': 'Cinema Sem Lei',
'uploader_id': 'cinemasemlei',
},
+ 'add_ie': ['Vimeo'],
}]
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/noz.py b/youtube_dl/extractor/noz.py
index 0ffb44b..c47a33d 100644
--- a/youtube_dl/extractor/noz.py
+++ b/youtube_dl/extractor/noz.py
@@ -2,10 +2,15 @@
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..compat import compat_urllib_parse_unquote
+from ..compat import (
+ compat_urllib_parse_unquote,
+ compat_xpath,
+)
from ..utils import (
int_or_none,
+ find_xpath_attr,
xpath_text,
+ update_url_query,
)
@@ -45,18 +50,33 @@ class NozIE(InfoExtractor):
duration = int_or_none(xpath_text(
doc, './/article/movie/file/duration'))
formats = []
- for qnode in doc.findall('.//article/movie/file/qualities/qual'):
- video_node = qnode.find('./html_urls/video_url[@format="video/mp4"]')
- if video_node is None:
- continue # auto
- formats.append({
- 'url': video_node.text,
- 'format_name': xpath_text(qnode, './name'),
- 'format_id': xpath_text(qnode, './id'),
- 'height': int_or_none(xpath_text(qnode, './height')),
- 'width': int_or_none(xpath_text(qnode, './width')),
- 'tbr': int_or_none(xpath_text(qnode, './bitrate'), scale=1000),
- })
+ for qnode in doc.findall(compat_xpath('.//article/movie/file/qualities/qual')):
+ http_url_ele = find_xpath_attr(
+ qnode, './html_urls/video_url', 'format', 'video/mp4')
+ http_url = http_url_ele.text if http_url_ele is not None else None
+ if http_url:
+ formats.append({
+ 'url': http_url,
+ 'format_name': xpath_text(qnode, './name'),
+ 'format_id': '%s-%s' % ('http', xpath_text(qnode, './id')),
+ 'height': int_or_none(xpath_text(qnode, './height')),
+ 'width': int_or_none(xpath_text(qnode, './width')),
+ 'tbr': int_or_none(xpath_text(qnode, './bitrate'), scale=1000),
+ })
+ else:
+ f4m_url = xpath_text(qnode, 'url_hd2')
+ if f4m_url:
+ formats.extend(self._extract_f4m_formats(
+ update_url_query(f4m_url, {'hdcore': '3.4.0'}),
+ video_id, f4m_id='hds', fatal=False))
+ m3u8_url_ele = find_xpath_attr(
+ qnode, './html_urls/video_url',
+ 'format', 'application/vnd.apple.mpegurl')
+ m3u8_url = m3u8_url_ele.text if m3u8_url_ele is not None else None
+ if m3u8_url:
+ formats.extend(self._extract_m3u8_formats(
+ m3u8_url, video_id, 'mp4', 'm3u8_native',
+ m3u8_id='hls', fatal=False))
self._sort_formats(formats)
return {
diff --git a/youtube_dl/extractor/npr.py b/youtube_dl/extractor/npr.py
index 125c701..1777aa1 100644
--- a/youtube_dl/extractor/npr.py
+++ b/youtube_dl/extractor/npr.py
@@ -1,7 +1,7 @@
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..compat import compat_urllib_parse
+from ..compat import compat_urllib_parse_urlencode
from ..utils import (
int_or_none,
qualities,
@@ -9,7 +9,7 @@ from ..utils import (
class NprIE(InfoExtractor):
- _VALID_URL = r'http://(?:www\.)?npr\.org/player/v2/mediaPlayer\.html\?.*\bid=(?P<id>\d+)'
+ _VALID_URL = r'https?://(?:www\.)?npr\.org/player/v2/mediaPlayer\.html\?.*\bid=(?P<id>\d+)'
_TESTS = [{
'url': 'http://www.npr.org/player/v2/mediaPlayer.html?id=449974205',
'info_dict': {
@@ -38,7 +38,7 @@ class NprIE(InfoExtractor):
playlist_id = self._match_id(url)
config = self._download_json(
- 'http://api.npr.org/query?%s' % compat_urllib_parse.urlencode({
+ 'http://api.npr.org/query?%s' % compat_urllib_parse_urlencode({
'id': playlist_id,
'fields': 'titles,audio,show',
'format': 'json',
diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py
index a126f50..6ded5bd 100644
--- a/youtube_dl/extractor/nrk.py
+++ b/youtube_dl/extractor/nrk.py
@@ -4,90 +4,222 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..compat import compat_urlparse
+from ..compat import compat_urllib_parse_unquote
from ..utils import (
- determine_ext,
ExtractorError,
- float_or_none,
+ int_or_none,
+ parse_age_limit,
parse_duration,
- unified_strdate,
)
-class NRKIE(InfoExtractor):
- _VALID_URL = r'(?:nrk:|https?://(?:www\.)?nrk\.no/video/PS\*)(?P<id>\d+)'
-
- _TESTS = [
- {
- 'url': 'http://www.nrk.no/video/PS*150533',
- 'md5': 'bccd850baebefe23b56d708a113229c2',
- 'info_dict': {
- 'id': '150533',
- 'ext': 'flv',
- 'title': 'Dompap og andre fugler i Piip-Show',
- 'description': 'md5:d9261ba34c43b61c812cb6b0269a5c8f',
- 'duration': 263,
- }
- },
- {
- 'url': 'http://www.nrk.no/video/PS*154915',
- 'md5': '0b1493ba1aae7d9579a5ad5531bc395a',
- 'info_dict': {
- 'id': '154915',
- 'ext': 'flv',
- 'title': 'Slik høres internett ut når du er blind',
- 'description': 'md5:a621f5cc1bd75c8d5104cb048c6b8568',
- 'duration': 20,
- }
- },
- ]
+class NRKBaseIE(InfoExtractor):
+ def _extract_formats(self, manifest_url, video_id, fatal=True):
+ formats = []
+ formats.extend(self._extract_f4m_formats(
+ manifest_url + '?hdcore=3.5.0&plugin=aasp-3.5.0.151.81',
+ video_id, f4m_id='hds', fatal=fatal))
+ formats.extend(self._extract_m3u8_formats(manifest_url.replace(
+ 'akamaihd.net/z/', 'akamaihd.net/i/').replace('/manifest.f4m', '/master.m3u8'),
+ video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=fatal))
+ return formats
def _real_extract(self, url):
video_id = self._match_id(url)
data = self._download_json(
- 'http://v8.psapi.nrk.no/mediaelement/%s' % video_id,
- video_id, 'Downloading media JSON')
+ 'http://%s/mediaelement/%s' % (self._API_HOST, video_id),
+ video_id, 'Downloading mediaelement JSON')
+
+ title = data.get('fullTitle') or data.get('mainTitle') or data['title']
+ video_id = data.get('id') or video_id
+
+ entries = []
+
+ media_assets = data.get('mediaAssets')
+ if media_assets and isinstance(media_assets, list):
+ def video_id_and_title(idx):
+ return ((video_id, title) if len(media_assets) == 1
+ else ('%s-%d' % (video_id, idx), '%s (Part %d)' % (title, idx)))
+ for num, asset in enumerate(media_assets, 1):
+ asset_url = asset.get('url')
+ if not asset_url:
+ continue
+ formats = self._extract_formats(asset_url, video_id, fatal=False)
+ if not formats:
+ continue
+ self._sort_formats(formats)
+ entry_id, entry_title = video_id_and_title(num)
+ duration = parse_duration(asset.get('duration'))
+ subtitles = {}
+ for subtitle in ('webVtt', 'timedText'):
+ subtitle_url = asset.get('%sSubtitlesUrl' % subtitle)
+ if subtitle_url:
+ subtitles.setdefault('no', []).append({
+ 'url': compat_urllib_parse_unquote(subtitle_url)
+ })
+ entries.append({
+ 'id': asset.get('carrierId') or entry_id,
+ 'title': entry_title,
+ 'duration': duration,
+ 'subtitles': subtitles,
+ 'formats': formats,
+ })
- media_url = data.get('mediaUrl')
+ if not entries:
+ media_url = data.get('mediaUrl')
+ if media_url:
+ formats = self._extract_formats(media_url, video_id)
+ self._sort_formats(formats)
+ duration = parse_duration(data.get('duration'))
+ entries = [{
+ 'id': video_id,
+ 'title': title,
+ 'duration': duration,
+ 'formats': formats,
+ }]
- if not media_url:
- if data['usageRights']['isGeoBlocked']:
+ if not entries:
+ if data.get('usageRights', {}).get('isGeoBlocked'):
raise ExtractorError(
'NRK har ikke rettigheter til å vise dette programmet utenfor Norge',
expected=True)
- if determine_ext(media_url) == 'f4m':
- formats = self._extract_f4m_formats(
- media_url + '?hdcore=3.5.0&plugin=aasp-3.5.0.151.81', video_id, f4m_id='hds')
- else:
- formats = [{
- 'url': media_url,
- 'ext': 'flv',
- }]
-
- duration = parse_duration(data.get('duration'))
+ conviva = data.get('convivaStatistics') or {}
+ series = conviva.get('seriesName') or data.get('seriesTitle')
+ episode = conviva.get('episodeName') or data.get('episodeNumberOrDate')
+ thumbnails = None
images = data.get('images')
- if images:
- thumbnails = images['webImages']
- thumbnails.sort(key=lambda image: image['pixelWidth'])
- thumbnail = thumbnails[-1]['imageUrl']
- else:
- thumbnail = None
-
- return {
- 'id': video_id,
- 'title': data['title'],
- 'description': data['description'],
- 'duration': duration,
- 'thumbnail': thumbnail,
- 'formats': formats,
+ if images and isinstance(images, dict):
+ web_images = images.get('webImages')
+ if isinstance(web_images, list):
+ thumbnails = [{
+ 'url': image['imageUrl'],
+ 'width': int_or_none(image.get('width')),
+ 'height': int_or_none(image.get('height')),
+ } for image in web_images if image.get('imageUrl')]
+
+ description = data.get('description')
+
+ common_info = {
+ 'description': description,
+ 'series': series,
+ 'episode': episode,
+ 'age_limit': parse_age_limit(data.get('legalAge')),
+ 'thumbnails': thumbnails,
+ }
+
+ vcodec = 'none' if data.get('mediaType') == 'Audio' else None
+
+ # TODO: extract chapters when https://github.com/rg3/youtube-dl/pull/9409 is merged
+
+ for entry in entries:
+ entry.update(common_info)
+ for f in entry['formats']:
+ f['vcodec'] = vcodec
+
+ return self.playlist_result(entries, video_id, title, description)
+
+
+class NRKIE(NRKBaseIE):
+ _VALID_URL = r'(?:nrk:|https?://(?:www\.)?nrk\.no/video/PS\*)(?P<id>\d+)'
+ _API_HOST = 'v8.psapi.nrk.no'
+ _TESTS = [{
+ # video
+ 'url': 'http://www.nrk.no/video/PS*150533',
+ 'md5': '2f7f6eeb2aacdd99885f355428715cfa',
+ 'info_dict': {
+ 'id': '150533',
+ 'ext': 'mp4',
+ 'title': 'Dompap og andre fugler i Piip-Show',
+ 'description': 'md5:d9261ba34c43b61c812cb6b0269a5c8f',
+ 'duration': 263,
}
+ }, {
+ # audio
+ 'url': 'http://www.nrk.no/video/PS*154915',
+ # MD5 is unstable
+ 'info_dict': {
+ 'id': '154915',
+ 'ext': 'flv',
+ 'title': 'Slik høres internett ut når du er blind',
+ 'description': 'md5:a621f5cc1bd75c8d5104cb048c6b8568',
+ 'duration': 20,
+ }
+ }]
+
+
+class NRKTVIE(NRKBaseIE):
+ IE_DESC = 'NRK TV and NRK Radio'
+ _VALID_URL = r'https?://(?:tv|radio)\.nrk(?:super)?\.no/(?:serie/[^/]+|program)/(?P<id>[a-zA-Z]{4}\d{8})(?:/\d{2}-\d{2}-\d{4})?(?:#del=(?P<part_id>\d+))?'
+ _API_HOST = 'psapi-we.nrk.no'
+
+ _TESTS = [{
+ 'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014',
+ 'md5': '4e9ca6629f09e588ed240fb11619922a',
+ 'info_dict': {
+ 'id': 'MUHH48000314AA',
+ 'ext': 'mp4',
+ 'title': '20 spørsmål 23.05.2014',
+ 'description': 'md5:bdea103bc35494c143c6a9acdd84887a',
+ 'duration': 1741,
+ },
+ }, {
+ 'url': 'https://tv.nrk.no/program/mdfp15000514',
+ 'md5': '43d0be26663d380603a9cf0c24366531',
+ 'info_dict': {
+ 'id': 'MDFP15000514CA',
+ 'ext': 'mp4',
+ 'title': 'Grunnlovsjubiléet - Stor ståhei for ingenting 24.05.2014',
+ 'description': 'md5:89290c5ccde1b3a24bb8050ab67fe1db',
+ 'duration': 4605,
+ },
+ }, {
+ # single playlist video
+ 'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015#del=2',
+ 'md5': 'adbd1dbd813edaf532b0a253780719c2',
+ 'info_dict': {
+ 'id': 'MSPO40010515-part2',
+ 'ext': 'flv',
+ 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 2:2)',
+ 'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26',
+ },
+ 'skip': 'Only works from Norway',
+ }, {
+ 'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015',
+ 'playlist': [{
+ 'md5': '9480285eff92d64f06e02a5367970a7a',
+ 'info_dict': {
+ 'id': 'MSPO40010515-part1',
+ 'ext': 'flv',
+ 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 1:2)',
+ 'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26',
+ },
+ }, {
+ 'md5': 'adbd1dbd813edaf532b0a253780719c2',
+ 'info_dict': {
+ 'id': 'MSPO40010515-part2',
+ 'ext': 'flv',
+ 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 2:2)',
+ 'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26',
+ },
+ }],
+ 'info_dict': {
+ 'id': 'MSPO40010515',
+ 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn',
+ 'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26',
+ 'duration': 6947.52,
+ },
+ 'skip': 'Only works from Norway',
+ }, {
+ 'url': 'https://radio.nrk.no/serie/dagsnytt/NPUB21019315/12-07-2015#',
+ 'only_matching': True,
+ }]
class NRKPlaylistIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?nrk\.no/(?!video)(?:[^/]+/)+(?P<id>[^/]+)'
+ _VALID_URL = r'https?://(?:www\.)?nrk\.no/(?!video|skole)(?:[^/]+/)+(?P<id>[^/]+)'
_TESTS = [{
'url': 'http://www.nrk.no/troms/gjenopplev-den-historiske-solformorkelsen-1.12270763',
@@ -126,177 +258,36 @@ class NRKPlaylistIE(InfoExtractor):
entries, playlist_id, playlist_title, playlist_description)
-class NRKTVIE(InfoExtractor):
- IE_DESC = 'NRK TV and NRK Radio'
- _VALID_URL = r'(?P<baseurl>https?://(?:tv|radio)\.nrk(?:super)?\.no/)(?:serie/[^/]+|program)/(?P<id>[a-zA-Z]{4}\d{8})(?:/\d{2}-\d{2}-\d{4})?(?:#del=(?P<part_id>\d+))?'
+class NRKSkoleIE(InfoExtractor):
+ IE_DESC = 'NRK Skole'
+ _VALID_URL = r'https?://(?:www\.)?nrk\.no/skole/?\?.*\bmediaId=(?P<id>\d+)'
- _TESTS = [
- {
- 'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014',
- 'info_dict': {
- 'id': 'MUHH48000314',
- 'ext': 'mp4',
- 'title': '20 spørsmål',
- 'description': 'md5:bdea103bc35494c143c6a9acdd84887a',
- 'upload_date': '20140523',
- 'duration': 1741.52,
- },
- 'params': {
- # m3u8 download
- 'skip_download': True,
- },
- },
- {
- 'url': 'https://tv.nrk.no/program/mdfp15000514',
- 'info_dict': {
- 'id': 'mdfp15000514',
- 'ext': 'mp4',
- 'title': 'Grunnlovsjubiléet - Stor ståhei for ingenting',
- 'description': 'md5:654c12511f035aed1e42bdf5db3b206a',
- 'upload_date': '20140524',
- 'duration': 4605.08,
- },
- 'params': {
- # m3u8 download
- 'skip_download': True,
- },
- },
- {
- # single playlist video
- 'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015#del=2',
- 'md5': 'adbd1dbd813edaf532b0a253780719c2',
- 'info_dict': {
- 'id': 'MSPO40010515-part2',
- 'ext': 'flv',
- 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 2:2)',
- 'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26',
- 'upload_date': '20150106',
- },
- 'skip': 'Only works from Norway',
- },
- {
- 'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015',
- 'playlist': [
- {
- 'md5': '9480285eff92d64f06e02a5367970a7a',
- 'info_dict': {
- 'id': 'MSPO40010515-part1',
- 'ext': 'flv',
- 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 1:2)',
- 'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26',
- 'upload_date': '20150106',
- },
- },
- {
- 'md5': 'adbd1dbd813edaf532b0a253780719c2',
- 'info_dict': {
- 'id': 'MSPO40010515-part2',
- 'ext': 'flv',
- 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 2:2)',
- 'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26',
- 'upload_date': '20150106',
- },
- },
- ],
- 'info_dict': {
- 'id': 'MSPO40010515',
- 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn',
- 'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26',
- 'upload_date': '20150106',
- 'duration': 6947.5199999999995,
- },
- 'skip': 'Only works from Norway',
+ _TESTS = [{
+ 'url': 'https://www.nrk.no/skole/?page=search&q=&mediaId=14099',
+ 'md5': '6bc936b01f9dd8ed45bc58b252b2d9b6',
+ 'info_dict': {
+ 'id': '6021',
+ 'ext': 'mp4',
+ 'title': 'Genetikk og eneggede tvillinger',
+ 'description': 'md5:3aca25dcf38ec30f0363428d2b265f8d',
+ 'duration': 399,
},
- {
- 'url': 'https://radio.nrk.no/serie/dagsnytt/NPUB21019315/12-07-2015#',
- 'only_matching': True,
- }
- ]
-
- def _extract_f4m(self, manifest_url, video_id):
- return self._extract_f4m_formats(
- manifest_url + '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124', video_id, f4m_id='hds')
+ }, {
+ 'url': 'https://www.nrk.no/skole/?page=objectives&subject=naturfag&objective=K15114&mediaId=19355',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
- part_id = mobj.group('part_id')
- base_url = mobj.group('baseurl')
-
- webpage = self._download_webpage(url, video_id)
-
- title = self._html_search_meta(
- 'title', webpage, 'title')
- description = self._html_search_meta(
- 'description', webpage, 'description')
-
- thumbnail = self._html_search_regex(
- r'data-posterimage="([^"]+)"',
- webpage, 'thumbnail', fatal=False)
- upload_date = unified_strdate(self._html_search_meta(
- 'rightsfrom', webpage, 'upload date', fatal=False))
- duration = float_or_none(self._html_search_regex(
- r'data-duration="([^"]+)"',
- webpage, 'duration', fatal=False))
-
- # playlist
- parts = re.findall(
- r'<a href="#del=(\d+)"[^>]+data-argument="([^"]+)">([^<]+)</a>', webpage)
- if parts:
- entries = []
- for current_part_id, stream_url, part_title in parts:
- if part_id and current_part_id != part_id:
- continue
- video_part_id = '%s-part%s' % (video_id, current_part_id)
- formats = self._extract_f4m(stream_url, video_part_id)
- entries.append({
- 'id': video_part_id,
- 'title': part_title,
- 'description': description,
- 'thumbnail': thumbnail,
- 'upload_date': upload_date,
- 'formats': formats,
- })
- if part_id:
- if entries:
- return entries[0]
- else:
- playlist = self.playlist_result(entries, video_id, title, description)
- playlist.update({
- 'thumbnail': thumbnail,
- 'upload_date': upload_date,
- 'duration': duration,
- })
- return playlist
+ video_id = self._match_id(url)
- formats = []
+ webpage = self._download_webpage(
+ 'https://mimir.nrk.no/plugin/1.0/static?mediaId=%s' % video_id,
+ video_id)
- f4m_url = re.search(r'data-media="([^"]+)"', webpage)
- if f4m_url:
- formats.extend(self._extract_f4m(f4m_url.group(1), video_id))
-
- m3u8_url = re.search(r'data-hls-media="([^"]+)"', webpage)
- if m3u8_url:
- formats.extend(self._extract_m3u8_formats(m3u8_url.group(1), video_id, 'mp4', m3u8_id='hls'))
- self._sort_formats(formats)
-
- subtitles_url = self._html_search_regex(
- r'data-subtitlesurl\s*=\s*(["\'])(?P<url>.+?)\1',
- webpage, 'subtitle URL', default=None, group='url')
- subtitles = {}
- if subtitles_url:
- subtitles['no'] = [{
- 'ext': 'ttml',
- 'url': compat_urlparse.urljoin(base_url, subtitles_url),
- }]
-
- return {
- 'id': video_id,
- 'title': title,
- 'description': description,
- 'thumbnail': thumbnail,
- 'upload_date': upload_date,
- 'duration': duration,
- 'formats': formats,
- 'subtitles': subtitles,
- }
+ nrk_id = self._parse_json(
+ self._search_regex(
+ r'<script[^>]+type=["\']application/json["\'][^>]*>({.+?})</script>',
+ webpage, 'application json'),
+ video_id)['activeMedia']['psId']
+
+ return self.url_result('nrk:%s' % nrk_id)
diff --git a/youtube_dl/extractor/ntvru.py b/youtube_dl/extractor/ntvru.py
index 2cd924d..0895d7e 100644
--- a/youtube_dl/extractor/ntvru.py
+++ b/youtube_dl/extractor/ntvru.py
@@ -11,7 +11,7 @@ from ..utils import (
class NTVRuIE(InfoExtractor):
IE_NAME = 'ntv.ru'
- _VALID_URL = r'http://(?:www\.)?ntv\.ru/(?P<id>.+)'
+ _VALID_URL = r'https?://(?:www\.)?ntv\.ru/(?P<id>.+)'
_TESTS = [
{
diff --git a/youtube_dl/extractor/nuvid.py b/youtube_dl/extractor/nuvid.py
index 9fa7cef..ab6bfcd 100644
--- a/youtube_dl/extractor/nuvid.py
+++ b/youtube_dl/extractor/nuvid.py
@@ -5,8 +5,6 @@ import re
from .common import InfoExtractor
from ..utils import (
parse_duration,
- sanitized_Request,
- unified_strdate,
)
@@ -20,7 +18,6 @@ class NuvidIE(InfoExtractor):
'ext': 'mp4',
'title': 'Horny babes show their awesome bodeis and',
'duration': 129,
- 'upload_date': '20140508',
'age_limit': 18,
}
}
@@ -28,28 +25,31 @@ class NuvidIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
- formats = []
+ page_url = 'http://m.nuvid.com/video/%s' % video_id
+ webpage = self._download_webpage(
+ page_url, video_id, 'Downloading video page')
+ # When dwnld_speed exists and has a value larger than the MP4 file's
+ # bitrate, Nuvid returns the MP4 URL
+ # It's unit is 100bytes/millisecond, see mobile-nuvid-min.js for the algorithm
+ self._set_cookie('nuvid.com', 'dwnld_speed', '10.0')
+ mp4_webpage = self._download_webpage(
+ page_url, video_id, 'Downloading video page for MP4 format')
- for dwnld_speed, format_id in [(0, '3gp'), (5, 'mp4')]:
- request = sanitized_Request(
- 'http://m.nuvid.com/play/%s' % video_id)
- request.add_header('Cookie', 'skip_download_page=1; dwnld_speed=%d; adv_show=1' % dwnld_speed)
- webpage = self._download_webpage(
- request, video_id, 'Downloading %s page' % format_id)
- video_url = self._html_search_regex(
- r'<a\s+href="([^"]+)"\s+class="b_link">', webpage, '%s video URL' % format_id, fatal=False)
- if not video_url:
- continue
+ html5_video_re = r'(?s)<(?:video|audio)[^<]*(?:>.*?<source[^>]*)?\s+src=["\'](.*?)["\']',
+ video_url = self._html_search_regex(html5_video_re, webpage, video_id)
+ mp4_video_url = self._html_search_regex(html5_video_re, mp4_webpage, video_id)
+ formats = [{
+ 'url': video_url,
+ }]
+ if mp4_video_url != video_url:
formats.append({
- 'url': video_url,
- 'format_id': format_id,
+ 'url': mp4_video_url,
})
- webpage = self._download_webpage(
- 'http://m.nuvid.com/video/%s' % video_id, video_id, 'Downloading video page')
title = self._html_search_regex(
[r'<span title="([^"]+)">',
- r'<div class="thumb-holder video">\s*<h5[^>]*>([^<]+)</h5>'], webpage, 'title').strip()
+ r'<div class="thumb-holder video">\s*<h5[^>]*>([^<]+)</h5>',
+ r'<span[^>]+class="title_thumb">([^<]+)</span>'], webpage, 'title').strip()
thumbnails = [
{
'url': thumb_url,
@@ -57,9 +57,8 @@ class NuvidIE(InfoExtractor):
]
thumbnail = thumbnails[0]['url'] if thumbnails else None
duration = parse_duration(self._html_search_regex(
- r'<i class="fa fa-clock-o"></i>\s*(\d{2}:\d{2})', webpage, 'duration', fatal=False))
- upload_date = unified_strdate(self._html_search_regex(
- r'<i class="fa fa-user"></i>\s*(\d{4}-\d{2}-\d{2})', webpage, 'upload date', fatal=False))
+ [r'<i class="fa fa-clock-o"></i>\s*(\d{2}:\d{2})',
+ r'<span[^>]+class="view_time">([^<]+)</span>'], webpage, 'duration', fatal=False))
return {
'id': video_id,
@@ -67,7 +66,6 @@ class NuvidIE(InfoExtractor):
'thumbnails': thumbnails,
'thumbnail': thumbnail,
'duration': duration,
- 'upload_date': upload_date,
'age_limit': 18,
'formats': formats,
}
diff --git a/youtube_dl/extractor/nytimes.py b/youtube_dl/extractor/nytimes.py
index 7f254b8..681683e 100644
--- a/youtube_dl/extractor/nytimes.py
+++ b/youtube_dl/extractor/nytimes.py
@@ -18,8 +18,9 @@ class NYTimesBaseIE(InfoExtractor):
description = video_data.get('summary')
duration = float_or_none(video_data.get('duration'), 1000)
- uploader = video_data['byline']
- timestamp = parse_iso8601(video_data['publication_date'][:-8])
+ uploader = video_data.get('byline')
+ publication_date = video_data.get('publication_date')
+ timestamp = parse_iso8601(publication_date[:-8]) if publication_date else None
def get_file_size(file_size):
if isinstance(file_size, int):
@@ -37,7 +38,7 @@ class NYTimesBaseIE(InfoExtractor):
'width': int_or_none(video.get('width')),
'height': int_or_none(video.get('height')),
'filesize': get_file_size(video.get('fileSize')),
- } for video in video_data['renditions']
+ } for video in video_data['renditions'] if video.get('url')
]
self._sort_formats(formats)
@@ -46,7 +47,7 @@ class NYTimesBaseIE(InfoExtractor):
'url': 'http://www.nytimes.com/%s' % image['url'],
'width': int_or_none(image.get('width')),
'height': int_or_none(image.get('height')),
- } for image in video_data['images']
+ } for image in video_data.get('images', []) if image.get('url')
]
return {
diff --git a/youtube_dl/extractor/odnoklassniki.py b/youtube_dl/extractor/odnoklassniki.py
index f9e064a..986708e 100644
--- a/youtube_dl/extractor/odnoklassniki.py
+++ b/youtube_dl/extractor/odnoklassniki.py
@@ -2,7 +2,11 @@
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..compat import compat_urllib_parse_unquote
+from ..compat import (
+ compat_parse_qs,
+ compat_urllib_parse_unquote,
+ compat_urllib_parse_urlparse,
+)
from ..utils import (
ExtractorError,
unified_strdate,
@@ -32,7 +36,7 @@ class OdnoklassnikiIE(InfoExtractor):
'skip': 'Video has been blocked',
}, {
# metadataUrl
- 'url': 'http://ok.ru/video/63567059965189-0',
+ 'url': 'http://ok.ru/video/63567059965189-0?fromTime=5',
'md5': '9676cf86eff5391d35dea675d224e131',
'info_dict': {
'id': '63567059965189-0',
@@ -44,6 +48,7 @@ class OdnoklassnikiIE(InfoExtractor):
'uploader': '☭ Андрей Мещанинов ☭',
'like_count': int,
'age_limit': 0,
+ 'start_time': 5,
},
}, {
# YouTube embed (metadataUrl, provider == USER_YOUTUBE)
@@ -61,6 +66,22 @@ class OdnoklassnikiIE(InfoExtractor):
'age_limit': 0,
},
}, {
+ # YouTube embed (metadata, provider == USER_YOUTUBE, no metadata.movie.title field)
+ 'url': 'http://ok.ru/video/62036049272859-0',
+ 'info_dict': {
+ 'id': '62036049272859-0',
+ 'ext': 'mp4',
+ 'title': 'МУЗЫКА ДОЖДЯ .',
+ 'description': 'md5:6f1867132bd96e33bf53eda1091e8ed0',
+ 'upload_date': '20120106',
+ 'uploader_id': '473534735899',
+ 'uploader': 'МARINA D',
+ 'age_limit': 0,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
'url': 'http://ok.ru/web-api/video/moviePlayer/20079905452',
'only_matching': True,
}, {
@@ -78,6 +99,9 @@ class OdnoklassnikiIE(InfoExtractor):
}]
def _real_extract(self, url):
+ start_time = int_or_none(compat_parse_qs(
+ compat_urllib_parse_urlparse(url).query).get('fromTime', [None])[0])
+
video_id = self._match_id(url)
webpage = self._download_webpage(
@@ -106,7 +130,14 @@ class OdnoklassnikiIE(InfoExtractor):
video_id, 'Downloading metadata JSON')
movie = metadata['movie']
- title = movie['title']
+
+ # Some embedded videos may not contain title in movie dict (e.g.
+ # http://ok.ru/video/62036049272859-0) thus we allow missing title
+ # here and it's going to be extracted later by an extractor that
+ # will process the actual embed.
+ provider = metadata.get('provider')
+ title = movie['title'] if provider == 'UPLOADED_ODKL' else movie.get('title')
+
thumbnail = movie.get('poster')
duration = int_or_none(movie.get('duration'))
@@ -135,9 +166,10 @@ class OdnoklassnikiIE(InfoExtractor):
'uploader_id': uploader_id,
'like_count': like_count,
'age_limit': age_limit,
+ 'start_time': start_time,
}
- if metadata.get('provider') == 'USER_YOUTUBE':
+ if provider == 'USER_YOUTUBE':
info.update({
'_type': 'url_transparent',
'url': movie['contentId'],
diff --git a/youtube_dl/extractor/once.py b/youtube_dl/extractor/once.py
new file mode 100644
index 0000000..1bf96ea
--- /dev/null
+++ b/youtube_dl/extractor/once.py
@@ -0,0 +1,42 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class OnceIE(InfoExtractor):
+ _VALID_URL = r'https?://.+?\.unicornmedia\.com/now/[^/]+/[^/]+/(?P<domain_id>[^/]+)/(?P<application_id>[^/]+)/(?:[^/]+/)?(?P<media_item_id>[^/]+)/content\.(?:once|m3u8|mp4)'
+ ADAPTIVE_URL_TEMPLATE = 'http://once.unicornmedia.com/now/master/playlist/%s/%s/%s/content.m3u8'
+ PROGRESSIVE_URL_TEMPLATE = 'http://once.unicornmedia.com/now/media/progressive/%s/%s/%s/%s/content.mp4'
+
+ def _extract_once_formats(self, url):
+ domain_id, application_id, media_item_id = re.match(
+ OnceIE._VALID_URL, url).groups()
+ formats = self._extract_m3u8_formats(
+ self.ADAPTIVE_URL_TEMPLATE % (
+ domain_id, application_id, media_item_id),
+ media_item_id, 'mp4', m3u8_id='hls', fatal=False)
+ progressive_formats = []
+ for adaptive_format in formats:
+ # Prevent advertisement from embedding into m3u8 playlist (see
+ # https://github.com/rg3/youtube-dl/issues/8893#issuecomment-199912684)
+ adaptive_format['url'] = re.sub(
+ r'\badsegmentlength=\d+', r'adsegmentlength=0', adaptive_format['url'])
+ rendition_id = self._search_regex(
+ r'/now/media/playlist/[^/]+/[^/]+/([^/]+)',
+ adaptive_format['url'], 'redition id', default=None)
+ if rendition_id:
+ progressive_format = adaptive_format.copy()
+ progressive_format.update({
+ 'url': self.PROGRESSIVE_URL_TEMPLATE % (
+ domain_id, application_id, rendition_id, media_item_id),
+ 'format_id': adaptive_format['format_id'].replace(
+ 'hls', 'http'),
+ 'protocol': 'http',
+ })
+ progressive_formats.append(progressive_format)
+ self._check_formats(progressive_formats, media_item_id)
+ formats.extend(progressive_formats)
+ return formats
diff --git a/youtube_dl/extractor/onionstudios.py b/youtube_dl/extractor/onionstudios.py
index 0f1f448..d7b13a0 100644
--- a/youtube_dl/extractor/onionstudios.py
+++ b/youtube_dl/extractor/onionstudios.py
@@ -4,7 +4,10 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import determine_ext
+from ..utils import (
+ determine_ext,
+ int_or_none,
+)
class OnionStudiosIE(InfoExtractor):
@@ -17,7 +20,7 @@ class OnionStudiosIE(InfoExtractor):
'id': '2937',
'ext': 'mp4',
'title': 'Hannibal charges forward, stops for a cocktail',
- 'description': 'md5:545299bda6abf87e5ec666548c6a9448',
+ 'description': 'md5:e786add7f280b7f0fe237b64cc73df76',
'thumbnail': 're:^https?://.*\.jpg$',
'uploader': 'The A.V. Club',
'uploader_id': 'TheAVClub',
@@ -42,9 +45,19 @@ class OnionStudiosIE(InfoExtractor):
formats = []
for src in re.findall(r'<source[^>]+src="([^"]+)"', webpage):
- if determine_ext(src) != 'm3u8': # m3u8 always results in 403
+ ext = determine_ext(src)
+ if ext == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ src, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
+ else:
+ height = int_or_none(self._search_regex(
+ r'/(\d+)\.%s' % ext, src, 'height', default=None))
formats.append({
+ 'format_id': ext + ('-%sp' % height if height else ''),
'url': src,
+ 'height': height,
+ 'ext': ext,
+ 'preference': 1,
})
self._sort_formats(formats)
@@ -52,7 +65,7 @@ class OnionStudiosIE(InfoExtractor):
r'share_title\s*=\s*(["\'])(?P<title>[^\1]+?)\1',
webpage, 'title', group='title')
description = self._search_regex(
- r'share_description\s*=\s*(["\'])(?P<description>[^\1]+?)\1',
+ r'share_description\s*=\s*(["\'])(?P<description>[^\'"]+?)\1',
webpage, 'description', default=None, group='description')
thumbnail = self._search_regex(
r'poster\s*=\s*(["\'])(?P<thumbnail>[^\1]+?)\1',
diff --git a/youtube_dl/extractor/ooyala.py b/youtube_dl/extractor/ooyala.py
index 20b9842..2038a6b 100644
--- a/youtube_dl/extractor/ooyala.py
+++ b/youtube_dl/extractor/ooyala.py
@@ -8,78 +8,88 @@ from ..utils import (
float_or_none,
ExtractorError,
unsmuggle_url,
+ determine_ext,
)
-from ..compat import compat_urllib_parse
+from ..compat import compat_urllib_parse_urlencode
class OoyalaBaseIE(InfoExtractor):
_PLAYER_BASE = 'http://player.ooyala.com/'
_CONTENT_TREE_BASE = _PLAYER_BASE + 'player_api/v1/content_tree/'
- _AUTHORIZATION_URL_TEMPLATE = _PLAYER_BASE + 'sas/player_api/v1/authorization/embed_code/%s/%s?'
+ _AUTHORIZATION_URL_TEMPLATE = _PLAYER_BASE + 'sas/player_api/v2/authorization/embed_code/%s/%s?'
def _extract(self, content_tree_url, video_id, domain='example.org'):
content_tree = self._download_json(content_tree_url, video_id)['content_tree']
metadata = content_tree[list(content_tree)[0]]
embed_code = metadata['embed_code']
pcode = metadata.get('asset_pcode') or embed_code
- video_info = {
- 'id': embed_code,
- 'title': metadata['title'],
- 'description': metadata.get('description'),
- 'thumbnail': metadata.get('thumbnail_image') or metadata.get('promo_image'),
- 'duration': float_or_none(metadata.get('duration'), 1000),
- }
+ title = metadata['title']
+
+ auth_data = self._download_json(
+ self._AUTHORIZATION_URL_TEMPLATE % (pcode, embed_code) +
+ compat_urllib_parse_urlencode({
+ 'domain': domain,
+ 'supportedFormats': 'mp4,rtmp,m3u8,hds',
+ }), video_id)
+
+ cur_auth_data = auth_data['authorization_data'][embed_code]
urls = []
formats = []
- for supported_format in ('mp4', 'm3u8', 'hds', 'rtmp'):
- auth_data = self._download_json(
- self._AUTHORIZATION_URL_TEMPLATE % (pcode, embed_code) +
- compat_urllib_parse.urlencode({
- 'domain': domain,
- 'supportedFormats': supported_format
- }),
- video_id, 'Downloading %s JSON' % supported_format)
-
- cur_auth_data = auth_data['authorization_data'][embed_code]
-
- if cur_auth_data['authorized']:
- for stream in cur_auth_data['streams']:
- url = base64.b64decode(
- stream['url']['data'].encode('ascii')).decode('utf-8')
- if url in urls:
- continue
- urls.append(url)
- delivery_type = stream['delivery_type']
- if delivery_type == 'hls' or '.m3u8' in url:
- formats.extend(self._extract_m3u8_formats(
- url, embed_code, 'mp4', 'm3u8_native',
- m3u8_id='hls', fatal=False))
- elif delivery_type == 'hds' or '.f4m' in url:
- formats.extend(self._extract_f4m_formats(
- url + '?hdcore=3.7.0', embed_code, f4m_id='hds', fatal=False))
- elif '.smil' in url:
- formats.extend(self._extract_smil_formats(
- url, embed_code, fatal=False))
- else:
- formats.append({
- 'url': url,
- 'ext': stream.get('delivery_type'),
- 'vcodec': stream.get('video_codec'),
- 'format_id': delivery_type,
- 'width': int_or_none(stream.get('width')),
- 'height': int_or_none(stream.get('height')),
- 'abr': int_or_none(stream.get('audio_bitrate')),
- 'vbr': int_or_none(stream.get('video_bitrate')),
- 'fps': float_or_none(stream.get('framerate')),
- })
- else:
- raise ExtractorError('%s said: %s' % (
- self.IE_NAME, cur_auth_data['message']), expected=True)
+ if cur_auth_data['authorized']:
+ for stream in cur_auth_data['streams']:
+ s_url = base64.b64decode(
+ stream['url']['data'].encode('ascii')).decode('utf-8')
+ if s_url in urls:
+ continue
+ urls.append(s_url)
+ ext = determine_ext(s_url, None)
+ delivery_type = stream['delivery_type']
+ if delivery_type == 'hls' or ext == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ s_url, embed_code, 'mp4', 'm3u8_native',
+ m3u8_id='hls', fatal=False))
+ elif delivery_type == 'hds' or ext == 'f4m':
+ formats.extend(self._extract_f4m_formats(
+ s_url + '?hdcore=3.7.0', embed_code, f4m_id='hds', fatal=False))
+ elif ext == 'smil':
+ formats.extend(self._extract_smil_formats(
+ s_url, embed_code, fatal=False))
+ else:
+ formats.append({
+ 'url': s_url,
+ 'ext': ext or stream.get('delivery_type'),
+ 'vcodec': stream.get('video_codec'),
+ 'format_id': delivery_type,
+ 'width': int_or_none(stream.get('width')),
+ 'height': int_or_none(stream.get('height')),
+ 'abr': int_or_none(stream.get('audio_bitrate')),
+ 'vbr': int_or_none(stream.get('video_bitrate')),
+ 'fps': float_or_none(stream.get('framerate')),
+ })
+ else:
+ raise ExtractorError('%s said: %s' % (
+ self.IE_NAME, cur_auth_data['message']), expected=True)
self._sort_formats(formats)
- video_info['formats'] = formats
- return video_info
+ subtitles = {}
+ for lang, sub in metadata.get('closed_captions_vtt', {}).get('captions', {}).items():
+ sub_url = sub.get('url')
+ if not sub_url:
+ continue
+ subtitles[lang] = [{
+ 'url': sub_url,
+ }]
+
+ return {
+ 'id': embed_code,
+ 'title': title,
+ 'description': metadata.get('description'),
+ 'thumbnail': metadata.get('thumbnail_image') or metadata.get('promo_image'),
+ 'duration': float_or_none(metadata.get('duration'), 1000),
+ 'subtitles': subtitles,
+ 'formats': formats,
+ }
class OoyalaIE(OoyalaBaseIE):
@@ -96,6 +106,8 @@ class OoyalaIE(OoyalaBaseIE):
'description': 'How badly damaged does a drive have to be to defeat Russell and his crew? Apparently, smashed to bits.',
'duration': 853.386,
},
+ # The video in the original webpage now uses PlayWire
+ 'skip': 'Ooyala said: movie expired',
}, {
# Only available for ipad
'url': 'http://player.ooyala.com/player.js?embedCode=x1b3lqZDq9y_7kMyC2Op5qo-p077tXD0',
diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py
new file mode 100644
index 0000000..6415b8f
--- /dev/null
+++ b/youtube_dl/extractor/openload.py
@@ -0,0 +1,130 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_chr
+from ..utils import (
+ determine_ext,
+ encode_base_n,
+ ExtractorError,
+ mimetype2ext,
+)
+
+
+class OpenloadIE(InfoExtractor):
+ _VALID_URL = r'https://openload.(?:co|io)/(?:f|embed)/(?P<id>[a-zA-Z0-9-_]+)'
+
+ _TESTS = [{
+ 'url': 'https://openload.co/f/kUEfGclsU9o',
+ 'md5': 'bf1c059b004ebc7a256f89408e65c36e',
+ 'info_dict': {
+ 'id': 'kUEfGclsU9o',
+ 'ext': 'mp4',
+ 'title': 'skyrim_no-audio_1080.mp4',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ },
+ }, {
+ 'url': 'https://openload.co/embed/kUEfGclsU9o/skyrim_no-audio_1080.mp4',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://openload.io/f/ZAn6oz-VZGE/',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://openload.co/f/_-ztPaZtMhM/',
+ 'only_matching': True,
+ }, {
+ # unavailable via https://openload.co/f/Sxz5sADo82g/, different layout
+ # for title and ext
+ 'url': 'https://openload.co/embed/Sxz5sADo82g/',
+ 'only_matching': True,
+ }]
+
+ @staticmethod
+ def openload_level2_debase(m):
+ radix, num = int(m.group(1)) + 27, int(m.group(2))
+ return '"' + encode_base_n(num, radix) + '"'
+
+ @classmethod
+ def openload_level2(cls, txt):
+ # The function name is ǃ \u01c3
+ # Using escaped unicode literals does not work in Python 3.2
+ return re.sub(r'ǃ\((\d+),(\d+)\)', cls.openload_level2_debase, txt, re.UNICODE).replace('"+"', '')
+
+ # Openload uses a variant of aadecode
+ # openload_decode and related functions are originally written by
+ # vitas@matfyz.cz and released with public domain
+ # See https://github.com/rg3/youtube-dl/issues/8489
+ @classmethod
+ def openload_decode(cls, txt):
+ symbol_table = [
+ ('_', '(゚Д゚) [゚Θ゚]'),
+ ('a', '(゚Д゚) [゚ω゚ノ]'),
+ ('b', '(゚Д゚) [゚Θ゚ノ]'),
+ ('c', '(゚Д゚) [\'c\']'),
+ ('d', '(゚Д゚) [゚ー゚ノ]'),
+ ('e', '(゚Д゚) [゚Д゚ノ]'),
+ ('f', '(゚Д゚) [1]'),
+
+ ('o', '(゚Д゚) [\'o\']'),
+ ('u', '(o゚ー゚o)'),
+ ('c', '(゚Д゚) [\'c\']'),
+
+ ('7', '((゚ー゚) + (o^_^o))'),
+ ('6', '((o^_^o) +(o^_^o) +(c^_^o))'),
+ ('5', '((゚ー゚) + (゚Θ゚))'),
+ ('4', '(-~3)'),
+ ('3', '(-~-~1)'),
+ ('2', '(-~1)'),
+ ('1', '(-~0)'),
+ ('0', '((c^_^o)-(c^_^o))'),
+ ]
+ delim = '(゚Д゚)[゚ε゚]+'
+ ret = ''
+ for aachar in txt.split(delim):
+ for val, pat in symbol_table:
+ aachar = aachar.replace(pat, val)
+ aachar = aachar.replace('+ ', '')
+ m = re.match(r'^\d+', aachar)
+ if m:
+ ret += compat_chr(int(m.group(0), 8))
+ else:
+ m = re.match(r'^u([\da-f]+)', aachar)
+ if m:
+ ret += compat_chr(int(m.group(1), 16))
+ return cls.openload_level2(ret)
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ if 'File not found' in webpage:
+ raise ExtractorError('File not found', expected=True)
+
+ code = self._search_regex(
+ r'</video>\s*</div>\s*<script[^>]+>[^>]+</script>\s*<script[^>]+>([^<]+)</script>',
+ webpage, 'JS code')
+
+ decoded = self.openload_decode(code)
+
+ video_url = self._search_regex(
+ r'return\s+"(https?://[^"]+)"', decoded, 'video URL')
+
+ title = self._og_search_title(webpage, default=None) or self._search_regex(
+ r'<span[^>]+class=["\']title["\'][^>]*>([^<]+)', webpage,
+ 'title', default=None) or self._html_search_meta(
+ 'description', webpage, 'title', fatal=True)
+
+ ext = mimetype2ext(self._search_regex(
+ r'window\.vt\s*=\s*(["\'])(?P<mimetype>.+?)\1', decoded,
+ 'mimetype', default=None, group='mimetype')) or determine_ext(
+ video_url, 'mp4')
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'ext': ext,
+ 'thumbnail': self._og_search_thumbnail(webpage, default=None),
+ 'url': video_url,
+ }
diff --git a/youtube_dl/extractor/ora.py b/youtube_dl/extractor/ora.py
index 8545fb1..1d42be3 100644
--- a/youtube_dl/extractor/ora.py
+++ b/youtube_dl/extractor/ora.py
@@ -12,8 +12,8 @@ from ..utils import (
class OraTVIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?ora\.tv/([^/]+/)*(?P<id>[^/\?#]+)'
- _TEST = {
+ _VALID_URL = r'https?://(?:www\.)?(?:ora\.tv|unsafespeech\.com)/([^/]+/)*(?P<id>[^/\?#]+)'
+ _TESTS = [{
'url': 'https://www.ora.tv/larrykingnow/2015/12/16/vine-youtube-stars-zach-king-king-bach-on-their-viral-videos-0_36jupg6090pq',
'md5': 'fa33717591c631ec93b04b0e330df786',
'info_dict': {
@@ -22,7 +22,10 @@ class OraTVIE(InfoExtractor):
'title': 'Vine & YouTube Stars Zach King & King Bach On Their Viral Videos!',
'description': 'md5:ebbc5b1424dd5dba7be7538148287ac1',
}
- }
+ }, {
+ 'url': 'http://www.unsafespeech.com/video/2016/5/10/student-self-censorship-and-the-thought-police-on-university-campuses-0_6622bnkppw4d',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
display_id = self._match_id(url)
diff --git a/youtube_dl/extractor/orf.py b/youtube_dl/extractor/orf.py
index 958eb39..4e3864f 100644
--- a/youtube_dl/extractor/orf.py
+++ b/youtube_dl/extractor/orf.py
@@ -137,7 +137,7 @@ class ORFTVthekIE(InfoExtractor):
class ORFOE1IE(InfoExtractor):
IE_NAME = 'orf:oe1'
IE_DESC = 'Radio Österreich 1'
- _VALID_URL = r'http://oe1\.orf\.at/(?:programm/|konsole.*?#\?track_id=)(?P<id>[0-9]+)'
+ _VALID_URL = r'https?://oe1\.orf\.at/(?:programm/|konsole.*?#\?track_id=)(?P<id>[0-9]+)'
# Audios on ORF radio are only available for 7 days, so we can't add tests.
_TEST = {
@@ -171,7 +171,7 @@ class ORFOE1IE(InfoExtractor):
class ORFFM4IE(InfoExtractor):
IE_NAME = 'orf:fm4'
IE_DESC = 'radio FM4'
- _VALID_URL = r'http://fm4\.orf\.at/(?:7tage/?#|player/)(?P<date>[0-9]+)/(?P<show>\w+)'
+ _VALID_URL = r'https?://fm4\.orf\.at/(?:7tage/?#|player/)(?P<date>[0-9]+)/(?P<show>\w+)'
_TEST = {
'url': 'http://fm4.orf.at/player/20160110/IS/',
@@ -185,6 +185,7 @@ class ORFFM4IE(InfoExtractor):
'timestamp': 1452456073,
'upload_date': '20160110',
},
+ 'skip': 'Live streams on FM4 got deleted soon',
}
def _real_extract(self, url):
@@ -222,7 +223,7 @@ class ORFFM4IE(InfoExtractor):
class ORFIPTVIE(InfoExtractor):
IE_NAME = 'orf:iptv'
IE_DESC = 'iptv.ORF.at'
- _VALID_URL = r'http://iptv\.orf\.at/(?:#/)?stories/(?P<id>\d+)'
+ _VALID_URL = r'https?://iptv\.orf\.at/(?:#/)?stories/(?P<id>\d+)'
_TEST = {
'url': 'http://iptv.orf.at/stories/2275236/',
diff --git a/youtube_dl/extractor/patreon.py b/youtube_dl/extractor/patreon.py
index ec8876c..2297506 100644
--- a/youtube_dl/extractor/patreon.py
+++ b/youtube_dl/extractor/patreon.py
@@ -65,7 +65,7 @@ class PatreonIE(InfoExtractor):
request = sanitized_Request(
'https://www.patreon.com/processLogin',
- compat_urllib_parse.urlencode(login_form).encode('utf-8')
+ compat_urllib_parse_urlencode(login_form).encode('utf-8')
)
login_page = self._download_webpage(request, None, note='Logging in as %s' % username)
diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py
index f43e3a1..81918ac 100644
--- a/youtube_dl/extractor/pbs.py
+++ b/youtube_dl/extractor/pbs.py
@@ -196,7 +196,7 @@ class PBSIE(InfoExtractor):
_TESTS = [
{
'url': 'http://www.pbs.org/tpt/constitution-usa-peter-sagal/watch/a-more-perfect-union/',
- 'md5': 'ce1888486f0908d555a8093cac9a7362',
+ 'md5': '173dc391afd361fa72eab5d3d918968d',
'info_dict': {
'id': '2365006249',
'ext': 'mp4',
@@ -204,13 +204,10 @@ class PBSIE(InfoExtractor):
'description': 'md5:36f341ae62e251b8f5bd2b754b95a071',
'duration': 3190,
},
- 'params': {
- 'skip_download': True, # requires ffmpeg
- },
},
{
'url': 'http://www.pbs.org/wgbh/pages/frontline/losing-iraq/',
- 'md5': '143c98aa54a346738a3d78f54c925321',
+ 'md5': '6f722cb3c3982186d34b0f13374499c7',
'info_dict': {
'id': '2365297690',
'ext': 'mp4',
@@ -218,9 +215,6 @@ class PBSIE(InfoExtractor):
'description': 'md5:4d3eaa01f94e61b3e73704735f1196d9',
'duration': 5050,
},
- 'params': {
- 'skip_download': True, # requires ffmpeg
- }
},
{
'url': 'http://www.pbs.org/newshour/bb/education-jan-june12-cyberschools_02-23/',
@@ -244,9 +238,6 @@ class PBSIE(InfoExtractor):
'duration': 6559,
'thumbnail': 're:^https?://.*\.jpg$',
},
- 'params': {
- 'skip_download': True, # requires ffmpeg
- },
},
{
'url': 'http://www.pbs.org/wgbh/nova/earth/killer-typhoon.html',
@@ -262,9 +253,6 @@ class PBSIE(InfoExtractor):
'upload_date': '20140122',
'age_limit': 10,
},
- 'params': {
- 'skip_download': True, # requires ffmpeg
- },
},
{
'url': 'http://www.pbs.org/wgbh/pages/frontline/united-states-of-secrets/',
@@ -290,6 +278,7 @@ class PBSIE(InfoExtractor):
},
{
'url': 'http://www.pbs.org/video/2365245528/',
+ 'md5': '115223d41bd55cda8ae5cd5ed4e11497',
'info_dict': {
'id': '2365245528',
'display_id': '2365245528',
@@ -299,15 +288,13 @@ class PBSIE(InfoExtractor):
'duration': 6851,
'thumbnail': 're:^https?://.*\.jpg$',
},
- 'params': {
- 'skip_download': True, # requires ffmpeg
- },
},
{
# Video embedded in iframe containing angle brackets as attribute's value (e.g.
# "<iframe style='position: absolute;<br />\ntop: 0; left: 0;' ...", see
# https://github.com/rg3/youtube-dl/issues/7059)
'url': 'http://www.pbs.org/food/features/a-chefs-life-season-3-episode-5-prickly-business/',
+ 'md5': '84ced42850d78f1d4650297356e95e6f',
'info_dict': {
'id': '2365546844',
'display_id': 'a-chefs-life-season-3-episode-5-prickly-business',
@@ -317,9 +304,6 @@ class PBSIE(InfoExtractor):
'duration': 1480,
'thumbnail': 're:^https?://.*\.jpg$',
},
- 'params': {
- 'skip_download': True, # requires ffmpeg
- },
},
{
# Frontline video embedded via flp2012.js
@@ -340,6 +324,7 @@ class PBSIE(InfoExtractor):
{
# Serves hd only via wigget/partnerplayer page
'url': 'http://www.pbs.org/video/2365641075/',
+ 'md5': 'acfd4c400b48149a44861cb16dd305cf',
'info_dict': {
'id': '2365641075',
'ext': 'mp4',
@@ -348,9 +333,6 @@ class PBSIE(InfoExtractor):
'thumbnail': 're:^https?://.*\.jpg$',
'formats': 'mincount:8',
},
- 'params': {
- 'skip_download': True, # requires ffmpeg
- },
},
{
'url': 'http://player.pbs.org/widget/partnerplayer/2365297708/?start=0&end=0&chapterbar=false&endscreen=false&topbar=true',
@@ -494,6 +476,7 @@ class PBSIE(InfoExtractor):
info = video_info
formats = []
+ http_url = None
for num, redirect in enumerate(redirects):
redirect_id = redirect.get('eeid')
@@ -514,13 +497,32 @@ class PBSIE(InfoExtractor):
if determine_ext(format_url) == 'm3u8':
formats.extend(self._extract_m3u8_formats(
- format_url, display_id, 'mp4', preference=1, m3u8_id='hls'))
+ format_url, display_id, 'mp4', m3u8_id='hls', fatal=False))
else:
formats.append({
'url': format_url,
'format_id': redirect_id,
})
+ if re.search(r'^https?://.*(?:\d+k|baseline)', format_url):
+ http_url = format_url
self._remove_duplicate_formats(formats)
+ m3u8_formats = list(filter(
+ lambda f: f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none' and f.get('resolution') != 'multiple',
+ formats))
+ if http_url:
+ for m3u8_format in m3u8_formats:
+ bitrate = self._search_regex(r'(\d+k)', m3u8_format['url'], 'bitrate', default=None)
+ # extract only the formats that we know that they will be available as http format.
+ # https://projects.pbs.org/confluence/display/coveapi/COVE+Video+Specifications
+ if not bitrate or bitrate not in ('400k', '800k', '1200k', '2500k'):
+ continue
+ f = m3u8_format.copy()
+ f.update({
+ 'url': re.sub(r'\d+k|baseline', bitrate, http_url),
+ 'format_id': m3u8_format['format_id'].replace('hls', 'http'),
+ 'protocol': 'http',
+ })
+ formats.append(f)
self._sort_formats(formats)
rating_str = info.get('rating')
@@ -535,6 +537,19 @@ class PBSIE(InfoExtractor):
'ext': 'ttml',
'url': closed_captions_url,
}]
+ mobj = re.search(r'/(\d+)_Encoded\.dfxp', closed_captions_url)
+ if mobj:
+ ttml_caption_suffix, ttml_caption_id = mobj.group(0, 1)
+ ttml_caption_id = int(ttml_caption_id)
+ subtitles['en'].extend([{
+ 'url': closed_captions_url.replace(
+ ttml_caption_suffix, '/%d_Encoded.srt' % (ttml_caption_id + 1)),
+ 'ext': 'srt',
+ }, {
+ 'url': closed_captions_url.replace(
+ ttml_caption_suffix, '/%d_Encoded.vtt' % (ttml_caption_id + 2)),
+ 'ext': 'vtt',
+ }])
# info['title'] is often incomplete (e.g. 'Full Episode', 'Episode 5', etc)
# Try turning it to 'program - title' naming scheme if possible
diff --git a/youtube_dl/extractor/people.py b/youtube_dl/extractor/people.py
new file mode 100644
index 0000000..9ecdbc1
--- /dev/null
+++ b/youtube_dl/extractor/people.py
@@ -0,0 +1,32 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class PeopleIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?people\.com/people/videos/0,,(?P<id>\d+),00\.html'
+
+ _TEST = {
+ 'url': 'http://www.people.com/people/videos/0,,20995451,00.html',
+ 'info_dict': {
+ 'id': 'ref:20995451',
+ 'ext': 'mp4',
+ 'title': 'Astronaut Love Triangle Victim Speaks Out: “The Crime in 2007 Hasn’t Defined Us”',
+ 'description': 'Colleen Shipman speaks to PEOPLE for the first time about life after the attack',
+ 'thumbnail': 're:^https?://.*\.jpg',
+ 'duration': 246.318,
+ 'timestamp': 1458720585,
+ 'upload_date': '20160323',
+ 'uploader_id': '416418724',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ 'add_ie': ['BrightcoveNew'],
+ }
+
+ def _real_extract(self, url):
+ return self.url_result(
+ 'http://players.brightcove.net/416418724/default_default/index.html?videoId=ref:%s'
+ % self._match_id(url), 'BrightcoveNew')
diff --git a/youtube_dl/extractor/periscope.py b/youtube_dl/extractor/periscope.py
index 514e9b4..c23b314 100644
--- a/youtube_dl/extractor/periscope.py
+++ b/youtube_dl/extractor/periscope.py
@@ -2,11 +2,15 @@
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..utils import parse_iso8601
+from ..utils import (
+ parse_iso8601,
+ unescapeHTML,
+)
class PeriscopeIE(InfoExtractor):
IE_DESC = 'Periscope'
+ IE_NAME = 'periscope'
_VALID_URL = r'https?://(?:www\.)?periscope\.tv/[^/]+/(?P<id>[^/?#]+)'
# Alive example URLs can be found here http://onperiscope.com/
_TESTS = [{
@@ -41,8 +45,11 @@ class PeriscopeIE(InfoExtractor):
broadcast = broadcast_data['broadcast']
status = broadcast['status']
- uploader = broadcast.get('user_display_name') or broadcast_data.get('user', {}).get('display_name')
- uploader_id = broadcast.get('user_id') or broadcast_data.get('user', {}).get('id')
+ user = broadcast_data.get('user', {})
+
+ uploader = broadcast.get('user_display_name') or user.get('display_name')
+ uploader_id = (broadcast.get('username') or user.get('username') or
+ broadcast.get('user_id') or user.get('id'))
title = '%s - %s' % (uploader, status) if uploader else status
state = broadcast.get('state').lower()
@@ -79,3 +86,43 @@ class PeriscopeIE(InfoExtractor):
'thumbnails': thumbnails,
'formats': formats,
}
+
+
+class PeriscopeUserIE(InfoExtractor):
+ _VALID_URL = r'https?://www\.periscope\.tv/(?P<id>[^/]+)/?$'
+ IE_DESC = 'Periscope user videos'
+ IE_NAME = 'periscope:user'
+
+ _TEST = {
+ 'url': 'https://www.periscope.tv/LularoeHusbandMike/',
+ 'info_dict': {
+ 'id': 'LularoeHusbandMike',
+ 'title': 'LULAROE HUSBAND MIKE',
+ 'description': 'md5:6cf4ec8047768098da58e446e82c82f0',
+ },
+ # Periscope only shows videos in the last 24 hours, so it's possible to
+ # get 0 videos
+ 'playlist_mincount': 0,
+ }
+
+ def _real_extract(self, url):
+ user_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, user_id)
+
+ data_store = self._parse_json(
+ unescapeHTML(self._search_regex(
+ r'data-store=(["\'])(?P<data>.+?)\1',
+ webpage, 'data store', default='{}', group='data')),
+ user_id)
+
+ user = data_store.get('User', {}).get('user', {})
+ title = user.get('display_name') or user.get('username')
+ description = user.get('description')
+
+ entries = [
+ self.url_result(
+ 'https://www.periscope.tv/%s/%s' % (user_id, broadcast['id']))
+ for broadcast in data_store.get('UserBroadcastHistory', {}).get('broadcasts', [])]
+
+ return self.playlist_result(entries, user_id, title, description)
diff --git a/youtube_dl/extractor/philharmoniedeparis.py b/youtube_dl/extractor/philharmoniedeparis.py
index 6e60e5f..f1008ae 100644
--- a/youtube_dl/extractor/philharmoniedeparis.py
+++ b/youtube_dl/extractor/philharmoniedeparis.py
@@ -12,7 +12,7 @@ from ..utils import (
class PhilharmonieDeParisIE(InfoExtractor):
IE_DESC = 'Philharmonie de Paris'
- _VALID_URL = r'http://live\.philharmoniedeparis\.fr/(?:[Cc]oncert/|misc/Playlist\.ashx\?id=)(?P<id>\d+)'
+ _VALID_URL = r'https?://live\.philharmoniedeparis\.fr/(?:[Cc]oncert/|misc/Playlist\.ashx\?id=)(?P<id>\d+)'
_TESTS = [{
'url': 'http://live.philharmoniedeparis.fr/concert/1032066.html',
'info_dict': {
diff --git a/youtube_dl/extractor/photobucket.py b/youtube_dl/extractor/photobucket.py
index 788411c..6c8bbe1 100644
--- a/youtube_dl/extractor/photobucket.py
+++ b/youtube_dl/extractor/photobucket.py
@@ -8,7 +8,7 @@ from ..compat import compat_urllib_parse_unquote
class PhotobucketIE(InfoExtractor):
- _VALID_URL = r'http://(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
+ _VALID_URL = r'https?://(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
_TEST = {
'url': 'http://media.photobucket.com/user/rachaneronas/media/TiredofLinkBuildingTryBacklinkMyDomaincom_zpsc0c3b9fa.mp4.html?filters[term]=search&filters[primary]=videos&filters[secondary]=images&sort=1&o=0',
'md5': '7dabfb92b0a31f6c16cebc0f8e60ff99',
diff --git a/youtube_dl/extractor/planetaplay.py b/youtube_dl/extractor/planetaplay.py
deleted file mode 100644
index 06505e9..0000000
--- a/youtube_dl/extractor/planetaplay.py
+++ /dev/null
@@ -1,61 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import re
-
-from .common import InfoExtractor
-from ..utils import ExtractorError
-
-
-class PlanetaPlayIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?planetaplay\.com/\?sng=(?P<id>[0-9]+)'
- _API_URL = 'http://planetaplay.com/action/playlist/?sng={0:}'
- _THUMBNAIL_URL = 'http://planetaplay.com/img/thumb/{thumb:}'
- _TEST = {
- 'url': 'http://planetaplay.com/?sng=3586',
- 'md5': '9d569dceb7251a4e01355d5aea60f9db',
- 'info_dict': {
- 'id': '3586',
- 'ext': 'flv',
- 'title': 'md5:e829428ee28b1deed00de90de49d1da1',
- },
- 'skip': 'Not accessible from Travis CI server',
- }
-
- _SONG_FORMATS = {
- 'lq': (0, 'http://www.planetaplay.com/videoplayback/{med_hash:}'),
- 'hq': (1, 'http://www.planetaplay.com/videoplayback/hi/{med_hash:}'),
- }
-
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
-
- response = self._download_json(
- self._API_URL.format(video_id), video_id)['response']
- try:
- data = response.get('data')[0]
- except IndexError:
- raise ExtractorError(
- '%s: failed to get the playlist' % self.IE_NAME, expected=True)
-
- title = '{song_artists:} - {sng_name:}'.format(**data)
- thumbnail = self._THUMBNAIL_URL.format(**data)
-
- formats = []
- for format_id, (quality, url_template) in self._SONG_FORMATS.items():
- formats.append({
- 'format_id': format_id,
- 'url': url_template.format(**data),
- 'quality': quality,
- 'ext': 'flv',
- })
-
- self._sort_formats(formats)
-
- return {
- 'id': video_id,
- 'title': title,
- 'formats': formats,
- 'thumbnail': thumbnail,
- }
diff --git a/youtube_dl/extractor/played.py b/youtube_dl/extractor/played.py
index 2856af9..57c875e 100644
--- a/youtube_dl/extractor/played.py
+++ b/youtube_dl/extractor/played.py
@@ -5,10 +5,10 @@ import re
import os.path
from .common import InfoExtractor
-from ..compat import compat_urllib_parse
from ..utils import (
ExtractorError,
sanitized_Request,
+ urlencode_postdata,
)
@@ -40,7 +40,7 @@ class PlayedIE(InfoExtractor):
self._sleep(2, video_id)
- post = compat_urllib_parse.urlencode(data)
+ post = urlencode_postdata(data)
headers = {
b'Content-Type': b'application/x-www-form-urlencoded',
}
diff --git a/youtube_dl/extractor/playtvak.py b/youtube_dl/extractor/playtvak.py
index e360404..1e8096a 100644
--- a/youtube_dl/extractor/playtvak.py
+++ b/youtube_dl/extractor/playtvak.py
@@ -4,7 +4,7 @@ from __future__ import unicode_literals
from .common import InfoExtractor
from ..compat import (
compat_urlparse,
- compat_urllib_parse,
+ compat_urllib_parse_urlencode,
)
from ..utils import (
ExtractorError,
@@ -106,7 +106,7 @@ class PlaytvakIE(InfoExtractor):
})
info_url = compat_urlparse.urlunparse(
- parsed_url._replace(query=compat_urllib_parse.urlencode(qs, True)))
+ parsed_url._replace(query=compat_urllib_parse_urlencode(qs, True)))
json_info = self._download_json(
info_url, video_id,
diff --git a/youtube_dl/extractor/playwire.py b/youtube_dl/extractor/playwire.py
index 6d138ef..0bc7431 100644
--- a/youtube_dl/extractor/playwire.py
+++ b/youtube_dl/extractor/playwire.py
@@ -4,9 +4,8 @@ import re
from .common import InfoExtractor
from ..utils import (
- xpath_text,
+ dict_get,
float_or_none,
- int_or_none,
)
@@ -23,6 +22,19 @@ class PlaywireIE(InfoExtractor):
'duration': 145.94,
},
}, {
+ # m3u8 in f4m
+ 'url': 'http://config.playwire.com/21772/videos/v2/4840492/zeus.json',
+ 'info_dict': {
+ 'id': '4840492',
+ 'ext': 'mp4',
+ 'title': 'ITV EL SHOW FULL',
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }, {
+ # Multiple resolutions while bitrates missing
'url': 'http://cdn.playwire.com/11625/embed/85228.html',
'only_matching': True,
}, {
@@ -48,25 +60,10 @@ class PlaywireIE(InfoExtractor):
thumbnail = content.get('poster')
src = content['media']['f4m']
- f4m = self._download_xml(src, video_id)
- base_url = xpath_text(f4m, './{http://ns.adobe.com/f4m/1.0}baseURL', 'base url', fatal=True)
- formats = []
- for media in f4m.findall('./{http://ns.adobe.com/f4m/1.0}media'):
- media_url = media.get('url')
- if not media_url:
- continue
- tbr = int_or_none(media.get('bitrate'))
- width = int_or_none(media.get('width'))
- height = int_or_none(media.get('height'))
- f = {
- 'url': '%s/%s' % (base_url, media.attrib['url']),
- 'tbr': tbr,
- 'width': width,
- 'height': height,
- }
- if not (tbr or width or height):
- f['quality'] = 1 if '-hd.' in media_url else 0
- formats.append(f)
+ formats = self._extract_f4m_formats(src, video_id, m3u8_id='hls')
+ for a_format in formats:
+ if not dict_get(a_format, ['tbr', 'width', 'height']):
+ a_format['quality'] = 1 if '-hd.' in a_format['url'] else 0
self._sort_formats(formats)
return {
diff --git a/youtube_dl/extractor/pluralsight.py b/youtube_dl/extractor/pluralsight.py
index 12e1c28..9aab776 100644
--- a/youtube_dl/extractor/pluralsight.py
+++ b/youtube_dl/extractor/pluralsight.py
@@ -8,7 +8,6 @@ import collections
from .common import InfoExtractor
from ..compat import (
compat_str,
- compat_urllib_parse,
compat_urlparse,
)
from ..utils import (
@@ -17,6 +16,7 @@ from ..utils import (
parse_duration,
qualities,
sanitized_Request,
+ urlencode_postdata,
)
@@ -64,8 +64,8 @@ class PluralsightIE(PluralsightBaseIE):
login_form = self._hidden_inputs(login_page)
login_form.update({
- 'Username': username.encode('utf-8'),
- 'Password': password.encode('utf-8'),
+ 'Username': username,
+ 'Password': password,
})
post_url = self._search_regex(
@@ -76,7 +76,7 @@ class PluralsightIE(PluralsightBaseIE):
post_url = compat_urlparse.urljoin(self._LOGIN_URL, post_url)
request = sanitized_Request(
- post_url, compat_urllib_parse.urlencode(login_form).encode('utf-8'))
+ post_url, urlencode_postdata(login_form))
request.add_header('Content-Type', 'application/x-www-form-urlencoded')
response = self._download_webpage(
@@ -279,13 +279,18 @@ class PluralsightCourseIE(PluralsightBaseIE):
course_id, 'Downloading course data JSON')
entries = []
- for module in course_data:
+ for num, module in enumerate(course_data, 1):
for clip in module.get('clips', []):
player_parameters = clip.get('playerParameters')
if not player_parameters:
continue
- entries.append(self.url_result(
- '%s/training/player?%s' % (self._API_BASE, player_parameters),
- 'Pluralsight'))
+ entries.append({
+ '_type': 'url_transparent',
+ 'url': '%s/training/player?%s' % (self._API_BASE, player_parameters),
+ 'ie_key': PluralsightIE.ie_key(),
+ 'chapter': module.get('title'),
+ 'chapter_number': num,
+ 'chapter_id': module.get('moduleRef'),
+ })
return self.playlist_result(entries, course_id, title, description)
diff --git a/youtube_dl/extractor/porn91.py b/youtube_dl/extractor/porn91.py
index 3e15533..9894f32 100644
--- a/youtube_dl/extractor/porn91.py
+++ b/youtube_dl/extractor/porn91.py
@@ -1,7 +1,10 @@
# encoding: utf-8
from __future__ import unicode_literals
-from ..compat import compat_urllib_parse
+from ..compat import (
+ compat_urllib_parse_unquote,
+ compat_urllib_parse_urlencode,
+)
from .common import InfoExtractor
from ..utils import (
parse_duration,
@@ -28,9 +31,10 @@ class Porn91IE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
- url = 'http://91porn.com/view_video.php?viewkey=%s' % video_id
self._set_cookie('91porn.com', 'language', 'cn_CN')
- webpage = self._download_webpage(url, video_id, 'get HTML content')
+
+ webpage = self._download_webpage(
+ 'http://91porn.com/view_video.php?viewkey=%s' % video_id, video_id)
if '作为游客,你每天只可观看10个视频' in webpage:
raise ExtractorError('91 Porn says: Daily limit 10 videos exceeded', expected=True)
@@ -46,7 +50,7 @@ class Porn91IE(InfoExtractor):
r'so.addVariable\(\'seccode\',\'([^\']+)\'', webpage, 'sec code')
max_vid = self._search_regex(
r'so.addVariable\(\'max_vid\',\'(\d+)\'', webpage, 'max vid')
- url_params = compat_urllib_parse.urlencode({
+ url_params = compat_urllib_parse_urlencode({
'VID': file_id,
'mp4': '1',
'seccode': sec_code,
@@ -54,8 +58,9 @@ class Porn91IE(InfoExtractor):
})
info_cn = self._download_webpage(
'http://91porn.com/getfile.php?' + url_params, video_id,
- 'get real video url')
- video_url = self._search_regex(r'file=([^&]+)&', info_cn, 'url')
+ 'Downloading real video url')
+ video_url = compat_urllib_parse_unquote(self._search_regex(
+ r'file=([^&]+)&', info_cn, 'url'))
duration = parse_duration(self._search_regex(
r'时长:\s*</span>\s*(\d+:\d+)', webpage, 'duration', fatal=False))
diff --git a/youtube_dl/extractor/pornhd.py b/youtube_dl/extractor/pornhd.py
index 57c78ba..8df12ee 100644
--- a/youtube_dl/extractor/pornhd.py
+++ b/youtube_dl/extractor/pornhd.py
@@ -1,19 +1,32 @@
from __future__ import unicode_literals
import re
-import json
from .common import InfoExtractor
from ..utils import (
+ ExtractorError,
int_or_none,
js_to_json,
- qualities,
)
class PornHdIE(InfoExtractor):
- _VALID_URL = r'http://(?:www\.)?pornhd\.com/(?:[a-z]{2,4}/)?videos/(?P<id>\d+)(?:/(?P<display_id>.+))?'
- _TEST = {
+ _VALID_URL = r'https?://(?:www\.)?pornhd\.com/(?:[a-z]{2,4}/)?videos/(?P<id>\d+)(?:/(?P<display_id>.+))?'
+ _TESTS = [{
+ 'url': 'http://www.pornhd.com/videos/9864/selfie-restroom-masturbation-fun-with-chubby-cutie-hd-porn-video',
+ 'md5': 'c8b964b1f0a4b5f7f28ae3a5c9f86ad5',
+ 'info_dict': {
+ 'id': '9864',
+ 'display_id': 'selfie-restroom-masturbation-fun-with-chubby-cutie-hd-porn-video',
+ 'ext': 'mp4',
+ 'title': 'Restroom selfie masturbation',
+ 'description': 'md5:3748420395e03e31ac96857a8f125b2b',
+ 'thumbnail': 're:^https?://.*\.jpg',
+ 'view_count': int,
+ 'age_limit': 18,
+ }
+ }, {
+ # removed video
'url': 'http://www.pornhd.com/videos/1962/sierra-day-gets-his-cum-all-over-herself-hd-porn-video',
'md5': '956b8ca569f7f4d8ec563e2c41598441',
'info_dict': {
@@ -25,8 +38,9 @@ class PornHdIE(InfoExtractor):
'thumbnail': 're:^https?://.*\.jpg',
'view_count': int,
'age_limit': 18,
- }
- }
+ },
+ 'skip': 'Not available anymore',
+ }]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
@@ -38,28 +52,38 @@ class PornHdIE(InfoExtractor):
title = self._html_search_regex(
[r'<span[^>]+class=["\']video-name["\'][^>]*>([^<]+)',
r'<title>(.+?) - .*?[Pp]ornHD.*?</title>'], webpage, 'title')
- description = self._html_search_regex(
- r'<div class="description">([^<]+)</div>', webpage, 'description', fatal=False)
- view_count = int_or_none(self._html_search_regex(
- r'(\d+) views\s*</span>', webpage, 'view count', fatal=False))
- thumbnail = self._search_regex(
- r"'poster'\s*:\s*'([^']+)'", webpage, 'thumbnail', fatal=False)
- quality = qualities(['sd', 'hd'])
- sources = json.loads(js_to_json(self._search_regex(
+ sources = self._parse_json(js_to_json(self._search_regex(
r"(?s)'sources'\s*:\s*(\{.+?\})\s*\}[;,)]",
- webpage, 'sources')))
+ webpage, 'sources', default='{}')), video_id)
+
+ if not sources:
+ message = self._html_search_regex(
+ r'(?s)<(div|p)[^>]+class="no-video"[^>]*>(?P<value>.+?)</\1',
+ webpage, 'error message', group='value')
+ raise ExtractorError('%s said: %s' % (self.IE_NAME, message), expected=True)
+
formats = []
- for qname, video_url in sources.items():
+ for format_id, video_url in sources.items():
if not video_url:
continue
+ height = int_or_none(self._search_regex(
+ r'^(\d+)[pP]', format_id, 'height', default=None))
formats.append({
'url': video_url,
- 'format_id': qname,
- 'quality': quality(qname),
+ 'format_id': format_id,
+ 'height': height,
})
self._sort_formats(formats)
+ description = self._html_search_regex(
+ r'<(div|p)[^>]+class="description"[^>]*>(?P<value>[^<]+)</\1',
+ webpage, 'description', fatal=False, group='value')
+ view_count = int_or_none(self._html_search_regex(
+ r'(\d+) views\s*<', webpage, 'view count', fatal=False))
+ thumbnail = self._search_regex(
+ r"'poster'\s*:\s*'([^']+)'", webpage, 'thumbnail', fatal=False)
+
return {
'id': video_id,
'display_id': display_id,
diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py
index 5a55c25..6d57e1d 100644
--- a/youtube_dl/extractor/pornhub.py
+++ b/youtube_dl/extractor/pornhub.py
@@ -1,10 +1,13 @@
+# coding: utf-8
from __future__ import unicode_literals
+import itertools
import os
import re
from .common import InfoExtractor
from ..compat import (
+ compat_HTTPError,
compat_urllib_parse_unquote,
compat_urllib_parse_unquote_plus,
compat_urllib_parse_urlparse,
@@ -12,6 +15,7 @@ from ..compat import (
from ..utils import (
ExtractorError,
int_or_none,
+ orderedSet,
sanitized_Request,
str_to_int,
)
@@ -36,7 +40,25 @@ class PornHubIE(InfoExtractor):
'dislike_count': int,
'comment_count': int,
'age_limit': 18,
- }
+ },
+ }, {
+ # non-ASCII title
+ 'url': 'http://www.pornhub.com/view_video.php?viewkey=1331683002',
+ 'info_dict': {
+ 'id': '1331683002',
+ 'ext': 'mp4',
+ 'title': '重庆婷婷女王足交',
+ 'uploader': 'cj397186295',
+ 'duration': 1753,
+ 'view_count': int,
+ 'like_count': int,
+ 'dislike_count': int,
+ 'comment_count': int,
+ 'age_limit': 18,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
}, {
'url': 'http://www.pornhub.com/view_video.php?viewkey=ph557bbb6676d2d',
'only_matching': True,
@@ -73,19 +95,25 @@ class PornHubIE(InfoExtractor):
'PornHub said: %s' % error_msg,
expected=True, video_id=video_id)
+ # video_title from flashvars contains whitespace instead of non-ASCII (see
+ # http://www.pornhub.com/view_video.php?viewkey=1331683002), not relying
+ # on that anymore.
+ title = self._html_search_meta(
+ 'twitter:title', webpage, default=None) or self._search_regex(
+ (r'<h1[^>]+class=["\']title["\'][^>]*>(?P<title>[^<]+)',
+ r'<div[^>]+data-video-title=(["\'])(?P<title>.+?)\1',
+ r'shareTitle\s*=\s*(["\'])(?P<title>.+?)\1'),
+ webpage, 'title', group='title')
+
flashvars = self._parse_json(
self._search_regex(
- r'var\s+flashv1ars_\d+\s*=\s*({.+?});', webpage, 'flashvars', default='{}'),
+ r'var\s+flashvars_\d+\s*=\s*({.+?});', webpage, 'flashvars', default='{}'),
video_id)
if flashvars:
- video_title = flashvars.get('video_title')
thumbnail = flashvars.get('image_url')
duration = int_or_none(flashvars.get('video_duration'))
else:
- video_title, thumbnail, duration = [None] * 3
-
- if not video_title:
- video_title = self._html_search_regex(r'<h1 [^>]+>([^<]+)', webpage, 'title')
+ title, thumbnail, duration = [None] * 3
video_uploader = self._html_search_regex(
r'(?s)From:&nbsp;.+?<(?:a href="/users/|a href="/channels/|span class="username)[^>]+>(.+?)<',
@@ -134,7 +162,7 @@ class PornHubIE(InfoExtractor):
return {
'id': video_id,
'uploader': video_uploader,
- 'title': video_title,
+ 'title': title,
'thumbnail': thumbnail,
'duration': duration,
'view_count': view_count,
@@ -149,9 +177,12 @@ class PornHubIE(InfoExtractor):
class PornHubPlaylistBaseIE(InfoExtractor):
def _extract_entries(self, webpage):
return [
- self.url_result('http://www.pornhub.com/%s' % video_url, PornHubIE.ie_key())
- for video_url in set(re.findall(
- r'href="/?(view_video\.php\?.*\bviewkey=[\da-z]+[^"]*)"', webpage))
+ self.url_result(
+ 'http://www.pornhub.com/%s' % video_url,
+ PornHubIE.ie_key(), video_title=title)
+ for video_url, title in orderedSet(re.findall(
+ r'href="/?(view_video\.php\?.*\bviewkey=[\da-z]+[^"]*)"[^>]*\s+title="([^"]+)"',
+ webpage))
]
def _real_extract(self, url):
@@ -185,16 +216,31 @@ class PornHubPlaylistIE(PornHubPlaylistBaseIE):
class PornHubUserVideosIE(PornHubPlaylistBaseIE):
_VALID_URL = r'https?://(?:www\.)?pornhub\.com/users/(?P<id>[^/]+)/videos'
_TESTS = [{
- 'url': 'http://www.pornhub.com/users/rushandlia/videos',
+ 'url': 'http://www.pornhub.com/users/zoe_ph/videos/public',
'info_dict': {
- 'id': 'rushandlia',
+ 'id': 'zoe_ph',
},
- 'playlist_mincount': 13,
+ 'playlist_mincount': 171,
+ }, {
+ 'url': 'http://www.pornhub.com/users/rushandlia/videos',
+ 'only_matching': True,
}]
def _real_extract(self, url):
user_id = self._match_id(url)
- webpage = self._download_webpage(url, user_id)
-
- return self.playlist_result(self._extract_entries(webpage), user_id)
+ entries = []
+ for page_num in itertools.count(1):
+ try:
+ webpage = self._download_webpage(
+ url, user_id, 'Downloading page %d' % page_num,
+ query={'page': page_num})
+ except ExtractorError as e:
+ if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404:
+ break
+ page_entries = self._extract_entries(webpage)
+ if not page_entries:
+ break
+ entries.extend(page_entries)
+
+ return self.playlist_result(entries, user_id)
diff --git a/youtube_dl/extractor/pornovoisines.py b/youtube_dl/extractor/pornovoisines.py
index 1a53fd7..6b51e5c 100644
--- a/youtube_dl/extractor/pornovoisines.py
+++ b/youtube_dl/extractor/pornovoisines.py
@@ -13,7 +13,7 @@ from ..utils import (
class PornoVoisinesIE(InfoExtractor):
- _VALID_URL = r'http://(?:www\.)?pornovoisines\.com/showvideo/(?P<id>\d+)/(?P<display_id>[^/]+)'
+ _VALID_URL = r'https?://(?:www\.)?pornovoisines\.com/showvideo/(?P<id>\d+)/(?P<display_id>[^/]+)'
_VIDEO_URL_TEMPLATE = 'http://stream%d.pornovoisines.com' \
'/static/media/video/transcoded/%s-640x360-1000-trscded.mp4'
diff --git a/youtube_dl/extractor/presstv.py b/youtube_dl/extractor/presstv.py
new file mode 100644
index 0000000..2da93ed
--- /dev/null
+++ b/youtube_dl/extractor/presstv.py
@@ -0,0 +1,74 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import remove_start
+
+
+class PressTVIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?presstv\.ir/[^/]+/(?P<y>\d+)/(?P<m>\d+)/(?P<d>\d+)/(?P<id>\d+)/(?P<display_id>[^/]+)?'
+
+ _TEST = {
+ 'url': 'http://www.presstv.ir/Detail/2016/04/09/459911/Australian-sewerage-treatment-facility-/',
+ 'md5': '5d7e3195a447cb13e9267e931d8dd5a5',
+ 'info_dict': {
+ 'id': '459911',
+ 'display_id': 'Australian-sewerage-treatment-facility-',
+ 'ext': 'mp4',
+ 'title': 'Organic mattresses used to clean waste water',
+ 'upload_date': '20160409',
+ 'thumbnail': 're:^https?://.*\.jpg',
+ 'description': 'md5:20002e654bbafb6908395a5c0cfcd125'
+ }
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+ display_id = mobj.group('display_id') or video_id
+
+ webpage = self._download_webpage(url, display_id)
+
+ # extract video URL from webpage
+ video_url = self._hidden_inputs(webpage)['inpPlayback']
+
+ # build list of available formats
+ # specified in http://www.presstv.ir/Scripts/playback.js
+ base_url = 'http://192.99.219.222:82/presstv'
+ _formats = [
+ (180, '_low200.mp4'),
+ (360, '_low400.mp4'),
+ (720, '_low800.mp4'),
+ (1080, '.mp4')
+ ]
+
+ formats = [{
+ 'url': base_url + video_url[:-4] + extension,
+ 'format_id': '%dp' % height,
+ 'height': height,
+ } for height, extension in _formats]
+
+ # extract video metadata
+ title = remove_start(
+ self._html_search_meta('title', webpage, fatal=True), 'PressTV-')
+
+ thumbnail = self._og_search_thumbnail(webpage)
+ description = self._og_search_description(webpage)
+
+ upload_date = '%04d%02d%02d' % (
+ int(mobj.group('y')),
+ int(mobj.group('m')),
+ int(mobj.group('d')),
+ )
+
+ return {
+ 'id': video_id,
+ 'display_id': display_id,
+ 'title': title,
+ 'formats': formats,
+ 'thumbnail': thumbnail,
+ 'upload_date': upload_date,
+ 'description': description
+ }
diff --git a/youtube_dl/extractor/primesharetv.py b/youtube_dl/extractor/primesharetv.py
index 85aae95..0c10247 100644
--- a/youtube_dl/extractor/primesharetv.py
+++ b/youtube_dl/extractor/primesharetv.py
@@ -1,10 +1,10 @@
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..compat import compat_urllib_parse
from ..utils import (
ExtractorError,
sanitized_Request,
+ urlencode_postdata,
)
@@ -42,7 +42,7 @@ class PrimeShareTVIE(InfoExtractor):
self._sleep(wait_time, video_id)
req = sanitized_Request(
- url, compat_urllib_parse.urlencode(fields), headers)
+ url, urlencode_postdata(fields), headers)
video_page = self._download_webpage(
req, video_id, 'Downloading video page')
diff --git a/youtube_dl/extractor/promptfile.py b/youtube_dl/extractor/promptfile.py
index d535728..f93bd19 100644
--- a/youtube_dl/extractor/promptfile.py
+++ b/youtube_dl/extractor/promptfile.py
@@ -4,11 +4,11 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..compat import compat_urllib_parse
from ..utils import (
determine_ext,
ExtractorError,
sanitized_Request,
+ urlencode_postdata,
)
@@ -34,7 +34,7 @@ class PromptFileIE(InfoExtractor):
expected=True)
fields = self._hidden_inputs(webpage)
- post = compat_urllib_parse.urlencode(fields)
+ post = urlencode_postdata(fields)
req = sanitized_Request(url, post)
req.add_header('Content-type', 'application/x-www-form-urlencoded')
webpage = self._download_webpage(
diff --git a/youtube_dl/extractor/prosiebensat1.py b/youtube_dl/extractor/prosiebensat1.py
index 670e695..07d49d4 100644
--- a/youtube_dl/extractor/prosiebensat1.py
+++ b/youtube_dl/extractor/prosiebensat1.py
@@ -5,9 +5,7 @@ import re
from hashlib import sha1
from .common import InfoExtractor
-from ..compat import (
- compat_urllib_parse,
-)
+from ..compat import compat_urllib_parse_urlencode
from ..utils import (
ExtractorError,
determine_ext,
@@ -235,7 +233,7 @@ class ProSiebenSat1IE(InfoExtractor):
client_name = 'kolibri-2.0.19-splec4'
client_location = url
- videos_api_url = 'http://vas.sim-technik.de/vas/live/v2/videos?%s' % compat_urllib_parse.urlencode({
+ videos_api_url = 'http://vas.sim-technik.de/vas/live/v2/videos?%s' % compat_urllib_parse_urlencode({
'access_token': access_token,
'client_location': client_location,
'client_name': client_name,
@@ -256,7 +254,7 @@ class ProSiebenSat1IE(InfoExtractor):
client_id = g[:2] + sha1(''.join([clip_id, g, access_token, client_location, g, client_name])
.encode('utf-8')).hexdigest()
- sources_api_url = 'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources?%s' % (clip_id, compat_urllib_parse.urlencode({
+ sources_api_url = 'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources?%s' % (clip_id, compat_urllib_parse_urlencode({
'access_token': access_token,
'client_id': client_id,
'client_location': client_location,
@@ -270,7 +268,7 @@ class ProSiebenSat1IE(InfoExtractor):
client_location, source_ids_str, g, client_name])
.encode('utf-8')).hexdigest()
- url_api_url = 'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources/url?%s' % (clip_id, compat_urllib_parse.urlencode({
+ url_api_url = 'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources/url?%s' % (clip_id, compat_urllib_parse_urlencode({
'access_token': access_token,
'client_id': client_id,
'client_location': client_location,
diff --git a/youtube_dl/extractor/puls4.py b/youtube_dl/extractor/puls4.py
index cce84b9..fca30e1 100644
--- a/youtube_dl/extractor/puls4.py
+++ b/youtube_dl/extractor/puls4.py
@@ -40,7 +40,7 @@ class Puls4IE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
error_message = self._html_search_regex(
- r'<div class="message-error">(.+?)</div>',
+ r'<div[^>]+class="message-error"[^>]*>(.+?)</div>',
webpage, 'error message', default=None)
if error_message:
raise ExtractorError(
diff --git a/youtube_dl/extractor/pyvideo.py b/youtube_dl/extractor/pyvideo.py
index 6d5732d..cc0416c 100644
--- a/youtube_dl/extractor/pyvideo.py
+++ b/youtube_dl/extractor/pyvideo.py
@@ -7,19 +7,19 @@ from .common import InfoExtractor
class PyvideoIE(InfoExtractor):
- _VALID_URL = r'http://(?:www\.)?pyvideo\.org/video/(?P<id>\d+)/(.*)'
+ _VALID_URL = r'https?://(?:www\.)?pyvideo\.org/video/(?P<id>\d+)/(.*)'
_TESTS = [
{
'url': 'http://pyvideo.org/video/1737/become-a-logging-expert-in-30-minutes',
- 'md5': 'de317418c8bc76b1fd8633e4f32acbc6',
+ 'md5': '520915673e53a5c5d487c36e0c4d85b5',
'info_dict': {
'id': '24_4WWkSmNo',
- 'ext': 'mp4',
+ 'ext': 'webm',
'title': 'Become a logging expert in 30 minutes',
'description': 'md5:9665350d466c67fb5b1598de379021f7',
'upload_date': '20130320',
- 'uploader': 'NextDayVideo',
+ 'uploader': 'Next Day Video',
'uploader_id': 'NextDayVideo',
},
'add_ie': ['Youtube'],
diff --git a/youtube_dl/extractor/qqmusic.py b/youtube_dl/extractor/qqmusic.py
index 45a3c41..ff0af95 100644
--- a/youtube_dl/extractor/qqmusic.py
+++ b/youtube_dl/extractor/qqmusic.py
@@ -18,7 +18,7 @@ from ..utils import (
class QQMusicIE(InfoExtractor):
IE_NAME = 'qqmusic'
IE_DESC = 'QQ音乐'
- _VALID_URL = r'http://y.qq.com/#type=song&mid=(?P<id>[0-9A-Za-z]+)'
+ _VALID_URL = r'https?://y.qq.com/#type=song&mid=(?P<id>[0-9A-Za-z]+)'
_TESTS = [{
'url': 'http://y.qq.com/#type=song&mid=004295Et37taLD',
'md5': '9ce1c1c8445f561506d2e3cfb0255705',
@@ -172,7 +172,7 @@ class QQPlaylistBaseIE(InfoExtractor):
class QQMusicSingerIE(QQPlaylistBaseIE):
IE_NAME = 'qqmusic:singer'
IE_DESC = 'QQ音乐 - 歌手'
- _VALID_URL = r'http://y.qq.com/#type=singer&mid=(?P<id>[0-9A-Za-z]+)'
+ _VALID_URL = r'https?://y.qq.com/#type=singer&mid=(?P<id>[0-9A-Za-z]+)'
_TEST = {
'url': 'http://y.qq.com/#type=singer&mid=001BLpXF2DyJe2',
'info_dict': {
@@ -217,7 +217,7 @@ class QQMusicSingerIE(QQPlaylistBaseIE):
class QQMusicAlbumIE(QQPlaylistBaseIE):
IE_NAME = 'qqmusic:album'
IE_DESC = 'QQ音乐 - 专辑'
- _VALID_URL = r'http://y.qq.com/#type=album&mid=(?P<id>[0-9A-Za-z]+)'
+ _VALID_URL = r'https?://y.qq.com/#type=album&mid=(?P<id>[0-9A-Za-z]+)'
_TESTS = [{
'url': 'http://y.qq.com/#type=album&mid=000gXCTb2AhRR1',
@@ -260,7 +260,7 @@ class QQMusicAlbumIE(QQPlaylistBaseIE):
class QQMusicToplistIE(QQPlaylistBaseIE):
IE_NAME = 'qqmusic:toplist'
IE_DESC = 'QQ音乐 - 排行榜'
- _VALID_URL = r'http://y\.qq\.com/#type=toplist&p=(?P<id>(top|global)_[0-9]+)'
+ _VALID_URL = r'https?://y\.qq\.com/#type=toplist&p=(?P<id>(top|global)_[0-9]+)'
_TESTS = [{
'url': 'http://y.qq.com/#type=toplist&p=global_123',
@@ -314,7 +314,7 @@ class QQMusicToplistIE(QQPlaylistBaseIE):
class QQMusicPlaylistIE(QQPlaylistBaseIE):
IE_NAME = 'qqmusic:playlist'
IE_DESC = 'QQ音乐 - 歌单'
- _VALID_URL = r'http://y\.qq\.com/#type=taoge&id=(?P<id>[0-9]+)'
+ _VALID_URL = r'https?://y\.qq\.com/#type=taoge&id=(?P<id>[0-9]+)'
_TESTS = [{
'url': 'http://y.qq.com/#type=taoge&id=3462654915',
diff --git a/youtube_dl/extractor/quickvid.py b/youtube_dl/extractor/quickvid.py
deleted file mode 100644
index f414e23..0000000
--- a/youtube_dl/extractor/quickvid.py
+++ /dev/null
@@ -1,54 +0,0 @@
-from __future__ import unicode_literals
-
-import re
-
-from .common import InfoExtractor
-from ..compat import (
- compat_urlparse,
-)
-from ..utils import (
- determine_ext,
- int_or_none,
-)
-
-
-class QuickVidIE(InfoExtractor):
- _VALID_URL = r'https?://(www\.)?quickvid\.org/watch\.php\?v=(?P<id>[a-zA-Z_0-9-]+)'
- _TEST = {
- 'url': 'http://quickvid.org/watch.php?v=sUQT3RCG8dx',
- 'md5': 'c0c72dd473f260c06c808a05d19acdc5',
- 'info_dict': {
- 'id': 'sUQT3RCG8dx',
- 'ext': 'mp4',
- 'title': 'Nick Offerman\'s Summer Reading Recap',
- 'thumbnail': 're:^https?://.*\.(?:png|jpg|gif)$',
- 'view_count': int,
- },
- 'skip': 'Not accessible from Travis CI server',
- }
-
- def _real_extract(self, url):
- video_id = self._match_id(url)
- webpage = self._download_webpage(url, video_id)
-
- title = self._html_search_regex(r'<h2>(.*?)</h2>', webpage, 'title')
- view_count = int_or_none(self._html_search_regex(
- r'(?s)<div id="views">(.*?)</div>',
- webpage, 'view count', fatal=False))
- video_code = self._search_regex(
- r'(?s)<video id="video"[^>]*>(.*?)</video>', webpage, 'video code')
- formats = [
- {
- 'url': compat_urlparse.urljoin(url, src),
- 'format_id': determine_ext(src, None),
- } for src in re.findall('<source\s+src="([^"]+)"', video_code)
- ]
- self._sort_formats(formats)
-
- return {
- 'id': video_id,
- 'title': title,
- 'formats': formats,
- 'thumbnail': self._og_search_thumbnail(webpage),
- 'view_count': view_count,
- }
diff --git a/youtube_dl/extractor/r7.py b/youtube_dl/extractor/r7.py
index 976c8fe..069dbfa 100644
--- a/youtube_dl/extractor/r7.py
+++ b/youtube_dl/extractor/r7.py
@@ -2,22 +2,19 @@
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..utils import (
- js_to_json,
- unescapeHTML,
- int_or_none,
-)
+from ..utils import int_or_none
class R7IE(InfoExtractor):
- _VALID_URL = r'''(?x)https?://
+ _VALID_URL = r'''(?x)
+ https?://
(?:
(?:[a-zA-Z]+)\.r7\.com(?:/[^/]+)+/idmedia/|
noticias\.r7\.com(?:/[^/]+)+/[^/]+-|
player\.r7\.com/video/i/
)
(?P<id>[\da-f]{24})
- '''
+ '''
_TESTS = [{
'url': 'http://videos.r7.com/policiais-humilham-suspeito-a-beira-da-morte-morre-com-dignidade-/idmedia/54e7050b0cf2ff57e0279389.html',
'md5': '403c4e393617e8e8ddc748978ee8efde',
@@ -25,6 +22,7 @@ class R7IE(InfoExtractor):
'id': '54e7050b0cf2ff57e0279389',
'ext': 'mp4',
'title': 'Policiais humilham suspeito à beira da morte: "Morre com dignidade"',
+ 'description': 'md5:01812008664be76a6479aa58ec865b72',
'thumbnail': 're:^https?://.*\.jpg$',
'duration': 98,
'like_count': int,
@@ -44,45 +42,72 @@ class R7IE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
- webpage = self._download_webpage(
- 'http://player.r7.com/video/i/%s' % video_id, video_id)
+ video = self._download_json(
+ 'http://player-api.r7.com/video/i/%s' % video_id, video_id)
- item = self._parse_json(js_to_json(self._search_regex(
- r'(?s)var\s+item\s*=\s*({.+?});', webpage, 'player')), video_id)
-
- title = unescapeHTML(item['title'])
- thumbnail = item.get('init', {}).get('thumbUri')
- duration = None
-
- statistics = item.get('statistics', {})
- like_count = int_or_none(statistics.get('likes'))
- view_count = int_or_none(statistics.get('views'))
+ title = video['title']
formats = []
- for format_key, format_dict in item['playlist'][0].items():
- src = format_dict.get('src')
- if not src:
- continue
- format_id = format_dict.get('format') or format_key
- if duration is None:
- duration = format_dict.get('duration')
- if '.f4m' in src:
- formats.extend(self._extract_f4m_formats(src, video_id, preference=-1))
- elif src.endswith('.m3u8'):
- formats.extend(self._extract_m3u8_formats(src, video_id, 'mp4', preference=-2))
- else:
- formats.append({
- 'url': src,
- 'format_id': format_id,
- })
+ media_url_hls = video.get('media_url_hls')
+ if media_url_hls:
+ formats.extend(self._extract_m3u8_formats(
+ media_url_hls, video_id, 'mp4', entry_protocol='m3u8_native',
+ m3u8_id='hls', fatal=False))
+ media_url = video.get('media_url')
+ if media_url:
+ f = {
+ 'url': media_url,
+ 'format_id': 'http',
+ }
+ # m3u8 format always matches the http format, let's copy metadata from
+ # one to another
+ m3u8_formats = list(filter(
+ lambda f: f.get('vcodec') != 'none' and f.get('resolution') != 'multiple',
+ formats))
+ if len(m3u8_formats) == 1:
+ f_copy = m3u8_formats[0].copy()
+ f_copy.update(f)
+ f_copy['protocol'] = 'http'
+ f = f_copy
+ formats.append(f)
self._sort_formats(formats)
+ description = video.get('description')
+ thumbnail = video.get('thumb')
+ duration = int_or_none(video.get('media_duration'))
+ like_count = int_or_none(video.get('likes'))
+ view_count = int_or_none(video.get('views'))
+
return {
'id': video_id,
'title': title,
+ 'description': description,
'thumbnail': thumbnail,
'duration': duration,
'like_count': like_count,
'view_count': view_count,
'formats': formats,
}
+
+
+class R7ArticleIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:[a-zA-Z]+)\.r7\.com/(?:[^/]+/)+[^/?#&]+-(?P<id>\d+)'
+ _TEST = {
+ 'url': 'http://tv.r7.com/record-play/balanco-geral/videos/policiais-humilham-suspeito-a-beira-da-morte-morre-com-dignidade-16102015',
+ 'only_matching': True,
+ }
+
+ @classmethod
+ def suitable(cls, url):
+ return False if R7IE.suitable(url) else super(R7ArticleIE, cls).suitable(url)
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, display_id)
+
+ video_id = self._search_regex(
+ r'<div[^>]+(?:id=["\']player-|class=["\']embed["\'][^>]+id=["\'])([\da-f]{24})',
+ webpage, 'video id')
+
+ return self.url_result('http://player.r7.com/video/i/%s' % video_id, R7IE.ie_key())
diff --git a/youtube_dl/extractor/radiocanada.py b/youtube_dl/extractor/radiocanada.py
new file mode 100644
index 0000000..4f05bbd
--- /dev/null
+++ b/youtube_dl/extractor/radiocanada.py
@@ -0,0 +1,130 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ xpath_text,
+ find_xpath_attr,
+ determine_ext,
+ int_or_none,
+ unified_strdate,
+ xpath_element,
+ ExtractorError,
+)
+
+
+class RadioCanadaIE(InfoExtractor):
+ IE_NAME = 'radiocanada'
+ _VALID_URL = r'(?:radiocanada:|https?://ici\.radio-canada\.ca/widgets/mediaconsole/)(?P<app_code>[^:/]+)[:/](?P<id>[0-9]+)'
+ _TEST = {
+ 'url': 'http://ici.radio-canada.ca/widgets/mediaconsole/medianet/7184272',
+ 'info_dict': {
+ 'id': '7184272',
+ 'ext': 'flv',
+ 'title': 'Le parcours du tireur capté sur vidéo',
+ 'description': 'Images des caméras de surveillance fournies par la GRC montrant le parcours du tireur d\'Ottawa',
+ 'upload_date': '20141023',
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ }
+
+ def _real_extract(self, url):
+ app_code, video_id = re.match(self._VALID_URL, url).groups()
+
+ formats = []
+ # TODO: extract m3u8 and f4m formats
+ # m3u8 formats can be extracted using ipad device_type return 403 error code when ffmpeg try to download segements
+ # f4m formats can be extracted using flashhd device_type but they produce unplayable file
+ for device_type in ('flash',):
+ v_data = self._download_xml(
+ 'http://api.radio-canada.ca/validationMedia/v1/Validation.ashx',
+ video_id, note='Downloading %s XML' % device_type, query={
+ 'appCode': app_code,
+ 'idMedia': video_id,
+ 'connectionType': 'broadband',
+ 'multibitrate': 'true',
+ 'deviceType': device_type,
+ # paysJ391wsHjbOJwvCs26toz and bypasslock are used to bypass geo-restriction
+ 'paysJ391wsHjbOJwvCs26toz': 'CA',
+ 'bypasslock': 'NZt5K62gRqfc',
+ })
+ v_url = xpath_text(v_data, 'url')
+ if not v_url:
+ continue
+ if v_url == 'null':
+ raise ExtractorError('%s said: %s' % (
+ self.IE_NAME, xpath_text(v_data, 'message')), expected=True)
+ ext = determine_ext(v_url)
+ if ext == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ v_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
+ elif ext == 'f4m':
+ formats.extend(self._extract_f4m_formats(v_url, video_id, f4m_id='hds', fatal=False))
+ else:
+ ext = determine_ext(v_url)
+ bitrates = xpath_element(v_data, 'bitrates')
+ for url_e in bitrates.findall('url'):
+ tbr = int_or_none(url_e.get('bitrate'))
+ if not tbr:
+ continue
+ formats.append({
+ 'format_id': 'rtmp-%d' % tbr,
+ 'url': re.sub(r'\d+\.%s' % ext, '%d.%s' % (tbr, ext), v_url),
+ 'ext': 'flv',
+ 'protocol': 'rtmp',
+ 'width': int_or_none(url_e.get('width')),
+ 'height': int_or_none(url_e.get('height')),
+ 'tbr': tbr,
+ })
+ self._sort_formats(formats)
+
+ metadata = self._download_xml(
+ 'http://api.radio-canada.ca/metaMedia/v1/index.ashx',
+ video_id, note='Downloading metadata XML', query={
+ 'appCode': app_code,
+ 'idMedia': video_id,
+ })
+
+ def get_meta(name):
+ el = find_xpath_attr(metadata, './/Meta', 'name', name)
+ return el.text if el is not None else None
+
+ return {
+ 'id': video_id,
+ 'title': get_meta('Title'),
+ 'description': get_meta('Description') or get_meta('ShortDescription'),
+ 'thumbnail': get_meta('imageHR') or get_meta('imageMR') or get_meta('imageBR'),
+ 'duration': int_or_none(get_meta('length')),
+ 'series': get_meta('Emission'),
+ 'season_number': int_or_none('SrcSaison'),
+ 'episode_number': int_or_none('SrcEpisode'),
+ 'upload_date': unified_strdate(get_meta('Date')),
+ 'formats': formats,
+ }
+
+
+class RadioCanadaAudioVideoIE(InfoExtractor):
+ 'radiocanada:audiovideo'
+ _VALID_URL = r'https?://ici\.radio-canada\.ca/audio-video/media-(?P<id>[0-9]+)'
+ _TEST = {
+ 'url': 'http://ici.radio-canada.ca/audio-video/media-7527184/barack-obama-au-vietnam',
+ 'info_dict': {
+ 'id': '7527184',
+ 'ext': 'flv',
+ 'title': 'Barack Obama au Vietnam',
+ 'description': 'Les États-Unis lèvent l\'embargo sur la vente d\'armes qui datait de la guerre du Vietnam',
+ 'upload_date': '20160523',
+ },
+ 'params': {
+ # rtmp download
+ 'skip_download': True,
+ },
+ }
+
+ def _real_extract(self, url):
+ return self.url_result('radiocanada:medianet:%s' % self._match_id(url))
diff --git a/youtube_dl/extractor/radiojavan.py b/youtube_dl/extractor/radiojavan.py
index 884c284..ec4fa6e 100644
--- a/youtube_dl/extractor/radiojavan.py
+++ b/youtube_dl/extractor/radiojavan.py
@@ -3,7 +3,7 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import(
+from ..utils import (
unified_strdate,
str_to_int,
)
diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py
index a4dc5c3..e36ce1a 100644
--- a/youtube_dl/extractor/rai.py
+++ b/youtube_dl/extractor/rai.py
@@ -18,7 +18,7 @@ from ..utils import (
class RaiTVIE(InfoExtractor):
- _VALID_URL = r'http://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/(?:[^/]+/)+media/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html'
+ _VALID_URL = r'https?://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/(?:[^/]+/)+media/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html'
_TESTS = [
{
'url': 'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-cb27157f-9dd0-4aee-b788-b1f67643a391.html',
@@ -175,7 +175,7 @@ class RaiTVIE(InfoExtractor):
class RaiIE(InfoExtractor):
- _VALID_URL = r'http://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html'
+ _VALID_URL = r'https?://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html'
_TESTS = [
{
'url': 'http://www.report.rai.it/dl/Report/puntata/ContentItem-0c7a664b-d0f4-4b2c-8835-3f82e46f433e.html',
diff --git a/youtube_dl/extractor/redtube.py b/youtube_dl/extractor/redtube.py
index d6054d7..721fc3a 100644
--- a/youtube_dl/extractor/redtube.py
+++ b/youtube_dl/extractor/redtube.py
@@ -1,11 +1,16 @@
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..utils import ExtractorError
+from ..utils import (
+ ExtractorError,
+ int_or_none,
+ str_to_int,
+ unified_strdate,
+)
class RedTubeIE(InfoExtractor):
- _VALID_URL = r'http://(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
+ _VALID_URL = r'https?://(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
_TEST = {
'url': 'http://www.redtube.com/66418',
'md5': '7b8c22b5e7098a3e1c09709df1126d2d',
@@ -13,6 +18,9 @@ class RedTubeIE(InfoExtractor):
'id': '66418',
'ext': 'mp4',
'title': 'Sucked on a toilet',
+ 'upload_date': '20120831',
+ 'duration': 596,
+ 'view_count': int,
'age_limit': 18,
}
}
@@ -24,12 +32,39 @@ class RedTubeIE(InfoExtractor):
if any(s in webpage for s in ['video-deleted-info', '>This video has been removed']):
raise ExtractorError('Video %s has been removed' % video_id, expected=True)
- video_url = self._html_search_regex(
- r'<source src="(.+?)" type="video/mp4">', webpage, 'video URL')
- video_title = self._html_search_regex(
- r'<h1 class="videoTitle[^"]*">(.+?)</h1>',
- webpage, 'title')
- video_thumbnail = self._og_search_thumbnail(webpage)
+ title = self._html_search_regex(
+ (r'<h1 class="videoTitle[^"]*">(?P<title>.+?)</h1>',
+ r'videoTitle\s*:\s*(["\'])(?P<title>)\1'),
+ webpage, 'title', group='title')
+
+ formats = []
+ sources = self._parse_json(
+ self._search_regex(
+ r'sources\s*:\s*({.+?})', webpage, 'source', default='{}'),
+ video_id, fatal=False)
+ if sources and isinstance(sources, dict):
+ for format_id, format_url in sources.items():
+ if format_url:
+ formats.append({
+ 'url': format_url,
+ 'format_id': format_id,
+ 'height': int_or_none(format_id),
+ })
+ else:
+ video_url = self._html_search_regex(
+ r'<source src="(.+?)" type="video/mp4">', webpage, 'video URL')
+ formats.append({'url': video_url})
+ self._sort_formats(formats)
+
+ thumbnail = self._og_search_thumbnail(webpage)
+ upload_date = unified_strdate(self._search_regex(
+ r'<span[^>]+class="added-time"[^>]*>ADDED ([^<]+)<',
+ webpage, 'upload date', fatal=False))
+ duration = int_or_none(self._search_regex(
+ r'videoDuration\s*:\s*(\d+)', webpage, 'duration', fatal=False))
+ view_count = str_to_int(self._search_regex(
+ r'<span[^>]*>VIEWS</span></td>\s*<td>([\d,.]+)',
+ webpage, 'view count', fatal=False))
# No self-labeling, but they describe themselves as
# "Home of Videos Porno"
@@ -37,9 +72,12 @@ class RedTubeIE(InfoExtractor):
return {
'id': video_id,
- 'url': video_url,
'ext': 'mp4',
- 'title': video_title,
- 'thumbnail': video_thumbnail,
+ 'title': title,
+ 'thumbnail': thumbnail,
+ 'upload_date': upload_date,
+ 'duration': duration,
+ 'view_count': view_count,
'age_limit': age_limit,
+ 'formats': formats,
}
diff --git a/youtube_dl/extractor/restudy.py b/youtube_dl/extractor/restudy.py
index b17c2bf..fd50065 100644
--- a/youtube_dl/extractor/restudy.py
+++ b/youtube_dl/extractor/restudy.py
@@ -31,6 +31,7 @@ class RestudyIE(InfoExtractor):
formats = self._extract_smil_formats(
'https://www.restudy.dk/awsmedia/SmilDirectory/video_%s.xml' % video_id,
video_id)
+ self._sort_formats(formats)
return {
'id': video_id,
diff --git a/youtube_dl/extractor/reuters.py b/youtube_dl/extractor/reuters.py
new file mode 100644
index 0000000..961d504
--- /dev/null
+++ b/youtube_dl/extractor/reuters.py
@@ -0,0 +1,69 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ js_to_json,
+ int_or_none,
+ unescapeHTML,
+)
+
+
+class ReutersIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?reuters\.com/.*?\?.*?videoId=(?P<id>[0-9]+)'
+ _TEST = {
+ 'url': 'http://www.reuters.com/video/2016/05/20/san-francisco-police-chief-resigns?videoId=368575562',
+ 'md5': '8015113643a0b12838f160b0b81cc2ee',
+ 'info_dict': {
+ 'id': '368575562',
+ 'ext': 'mp4',
+ 'title': 'San Francisco police chief resigns',
+ }
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(
+ 'http://www.reuters.com/assets/iframe/yovideo?videoId=%s' % video_id, video_id)
+ video_data = js_to_json(self._search_regex(
+ r'(?s)Reuters\.yovideo\.drawPlayer\(({.*?})\);',
+ webpage, 'video data'))
+
+ def get_json_value(key, fatal=False):
+ return self._search_regex('"%s"\s*:\s*"([^"]+)"' % key, video_data, key, fatal=fatal)
+
+ title = unescapeHTML(get_json_value('title', fatal=True))
+ mmid, fid = re.search(r',/(\d+)\?f=(\d+)', get_json_value('flv', fatal=True)).groups()
+
+ mas_data = self._download_json(
+ 'http://mas-e.cds1.yospace.com/mas/%s/%s?trans=json' % (mmid, fid),
+ video_id, transform_source=js_to_json)
+ formats = []
+ for f in mas_data:
+ f_url = f.get('url')
+ if not f_url:
+ continue
+ method = f.get('method')
+ if method == 'hls':
+ formats.extend(self._extract_m3u8_formats(
+ f_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
+ else:
+ container = f.get('container')
+ ext = '3gp' if method == 'mobile' else container
+ formats.append({
+ 'format_id': ext,
+ 'url': f_url,
+ 'ext': ext,
+ 'container': container if method != 'mobile' else None,
+ })
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'thumbnail': get_json_value('thumb'),
+ 'duration': int_or_none(get_json_value('seconds')),
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/revision3.py b/youtube_dl/extractor/revision3.py
index b1b8800..833d8a2 100644
--- a/youtube_dl/extractor/revision3.py
+++ b/youtube_dl/extractor/revision3.py
@@ -13,13 +13,69 @@ from ..utils import (
)
+class Revision3EmbedIE(InfoExtractor):
+ IE_NAME = 'revision3:embed'
+ _VALID_URL = r'(?:revision3:(?:(?P<playlist_type>[^:]+):)?|https?://(?:(?:(?:www|embed)\.)?(?:revision3|animalist)|(?:(?:api|embed)\.)?seekernetwork)\.com/player/embed\?videoId=)(?P<playlist_id>\d+)'
+ _TEST = {
+ 'url': 'http://api.seekernetwork.com/player/embed?videoId=67558',
+ 'md5': '83bcd157cab89ad7318dd7b8c9cf1306',
+ 'info_dict': {
+ 'id': '67558',
+ 'ext': 'mp4',
+ 'title': 'The Pros & Cons Of Zoos',
+ 'description': 'Zoos are often depicted as a terrible place for animals to live, but is there any truth to this?',
+ 'uploader_id': 'dnews',
+ 'uploader': 'DNews',
+ }
+ }
+ _API_KEY = 'ba9c741bce1b9d8e3defcc22193f3651b8867e62'
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ playlist_id = mobj.group('playlist_id')
+ playlist_type = mobj.group('playlist_type') or 'video_id'
+ video_data = self._download_json(
+ 'http://revision3.com/api/getPlaylist.json', playlist_id, query={
+ 'api_key': self._API_KEY,
+ 'codecs': 'h264,vp8,theora',
+ playlist_type: playlist_id,
+ })['items'][0]
+
+ formats = []
+ for vcodec, media in video_data['media'].items():
+ for quality_id, quality in media.items():
+ if quality_id == 'hls':
+ formats.extend(self._extract_m3u8_formats(
+ quality['url'], playlist_id, 'mp4',
+ 'm3u8_native', m3u8_id='hls', fatal=False))
+ else:
+ formats.append({
+ 'url': quality['url'],
+ 'format_id': '%s-%s' % (vcodec, quality_id),
+ 'tbr': int_or_none(quality.get('bitrate')),
+ 'vcodec': vcodec,
+ })
+ self._sort_formats(formats)
+
+ return {
+ 'id': playlist_id,
+ 'title': unescapeHTML(video_data['title']),
+ 'description': unescapeHTML(video_data.get('summary')),
+ 'uploader': video_data.get('show', {}).get('name'),
+ 'uploader_id': video_data.get('show', {}).get('slug'),
+ 'duration': int_or_none(video_data.get('duration')),
+ 'formats': formats,
+ }
+
+
class Revision3IE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?(?P<domain>(?:revision3|testtube|animalist)\.com)/(?P<id>[^/]+(?:/[^/?#]+)?)'
+ IE_NAME = 'revision'
+ _VALID_URL = r'https?://(?:www\.)?(?P<domain>(?:revision3|animalist)\.com)/(?P<id>[^/]+(?:/[^/?#]+)?)'
_TESTS = [{
'url': 'http://www.revision3.com/technobuffalo/5-google-predictions-for-2016',
'md5': 'd94a72d85d0a829766de4deb8daaf7df',
'info_dict': {
- 'id': '73034',
+ 'id': '71089',
'display_id': 'technobuffalo/5-google-predictions-for-2016',
'ext': 'webm',
'title': '5 Google Predictions for 2016',
@@ -31,89 +87,76 @@ class Revision3IE(InfoExtractor):
'uploader_id': 'technobuffalo',
}
}, {
- 'url': 'http://testtube.com/brainstuff',
- 'info_dict': {
- 'id': '251',
- 'title': 'BrainStuff',
- 'description': 'Whether the topic is popcorn or particle physics, you can count on the HowStuffWorks team to explore-and explain-the everyday science in the world around us on BrainStuff.',
- },
- 'playlist_mincount': 93,
+ # Show
+ 'url': 'http://revision3.com/variant',
+ 'only_matching': True,
}, {
- 'url': 'https://testtube.com/dnews/5-weird-ways-plants-can-eat-animals?utm_source=FB&utm_medium=DNews&utm_campaign=DNewsSocial',
- 'info_dict': {
- 'id': '60163',
- 'display_id': 'dnews/5-weird-ways-plants-can-eat-animals',
- 'duration': 275,
- 'ext': 'webm',
- 'title': '5 Weird Ways Plants Can Eat Animals',
- 'description': 'Why have some plants evolved to eat meat?',
- 'upload_date': '20150120',
- 'timestamp': 1421763300,
- 'uploader': 'DNews',
- 'uploader_id': 'dnews',
- },
+ # Tag
+ 'url': 'http://revision3.com/vr',
+ 'only_matching': True,
}]
_PAGE_DATA_TEMPLATE = 'http://www.%s/apiProxy/ddn/%s?domain=%s'
- _API_KEY = 'ba9c741bce1b9d8e3defcc22193f3651b8867e62'
def _real_extract(self, url):
domain, display_id = re.match(self._VALID_URL, url).groups()
+ site = domain.split('.')[0]
page_info = self._download_json(
self._PAGE_DATA_TEMPLATE % (domain, display_id, domain), display_id)
- if page_info['data']['type'] == 'episode':
- episode_data = page_info['data']
- video_id = compat_str(episode_data['video']['data']['id'])
- video_data = self._download_json(
- 'http://revision3.com/api/getPlaylist.json?api_key=%s&codecs=h264,vp8,theora&video_id=%s' % (self._API_KEY, video_id),
- video_id)['items'][0]
-
- formats = []
- for vcodec, media in video_data['media'].items():
- for quality_id, quality in media.items():
- if quality_id == 'hls':
- formats.extend(self._extract_m3u8_formats(
- quality['url'], video_id, 'mp4',
- 'm3u8_native', m3u8_id='hls', fatal=False))
- else:
- formats.append({
- 'url': quality['url'],
- 'format_id': '%s-%s' % (vcodec, quality_id),
- 'tbr': int_or_none(quality.get('bitrate')),
- 'vcodec': vcodec,
- })
- self._sort_formats(formats)
+ page_data = page_info['data']
+ page_type = page_data['type']
+ if page_type in ('episode', 'embed'):
+ show_data = page_data['show']['data']
+ page_id = compat_str(page_data['id'])
+ video_id = compat_str(page_data['video']['data']['id'])
preference = qualities(['mini', 'small', 'medium', 'large'])
thumbnails = [{
'url': image_url,
'id': image_id,
'preference': preference(image_id)
- } for image_id, image_url in video_data.get('images', {}).items()]
+ } for image_id, image_url in page_data.get('images', {}).items()]
- return {
- 'id': video_id,
+ info = {
+ 'id': page_id,
'display_id': display_id,
- 'title': unescapeHTML(video_data['title']),
- 'description': unescapeHTML(video_data.get('summary')),
- 'timestamp': parse_iso8601(episode_data.get('publishTime'), ' '),
- 'author': episode_data.get('author'),
- 'uploader': video_data.get('show', {}).get('name'),
- 'uploader_id': video_data.get('show', {}).get('slug'),
- 'duration': int_or_none(video_data.get('duration')),
+ 'title': unescapeHTML(page_data['name']),
+ 'description': unescapeHTML(page_data.get('summary')),
+ 'timestamp': parse_iso8601(page_data.get('publishTime'), ' '),
+ 'author': page_data.get('author'),
+ 'uploader': show_data.get('name'),
+ 'uploader_id': show_data.get('slug'),
'thumbnails': thumbnails,
- 'formats': formats,
+ 'extractor_key': site,
}
+
+ if page_type == 'embed':
+ info.update({
+ '_type': 'url_transparent',
+ 'url': page_data['video']['data']['embed'],
+ })
+ return info
+
+ info.update({
+ '_type': 'url_transparent',
+ 'url': 'revision3:%s' % video_id,
+ })
+ return info
else:
- show_data = page_info['show']['data']
+ list_data = page_info[page_type]['data']
episodes_data = page_info['episodes']['data']
num_episodes = page_info['meta']['totalEpisodes']
processed_episodes = 0
entries = []
page_num = 1
while True:
- entries.extend([self.url_result(
- 'http://%s/%s/%s' % (domain, display_id, episode['slug'])) for episode in episodes_data])
+ entries.extend([{
+ '_type': 'url',
+ 'url': 'http://%s%s' % (domain, episode['path']),
+ 'id': compat_str(episode['id']),
+ 'ie_key': 'Revision3',
+ 'extractor_key': site,
+ } for episode in episodes_data])
processed_episodes += len(episodes_data)
if processed_episodes == num_episodes:
break
@@ -123,5 +166,5 @@ class Revision3IE(InfoExtractor):
display_id)['episodes']['data']
return self.playlist_result(
- entries, compat_str(show_data['id']),
- show_data.get('name'), show_data.get('summary'))
+ entries, compat_str(list_data['id']),
+ list_data.get('name'), list_data.get('summary'))
diff --git a/youtube_dl/extractor/rice.py b/youtube_dl/extractor/rice.py
new file mode 100644
index 0000000..f855719
--- /dev/null
+++ b/youtube_dl/extractor/rice.py
@@ -0,0 +1,116 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_parse_qs
+from ..utils import (
+ xpath_text,
+ xpath_element,
+ int_or_none,
+ parse_iso8601,
+ ExtractorError,
+)
+
+
+class RICEIE(InfoExtractor):
+ _VALID_URL = r'https?://mediahub\.rice\.edu/app/[Pp]ortal/video\.aspx\?(?P<query>.+)'
+ _TEST = {
+ 'url': 'https://mediahub.rice.edu/app/Portal/video.aspx?PortalID=25ffd62c-3d01-4b29-8c70-7c94270efb3e&DestinationID=66bc9434-03bd-4725-b47e-c659d8d809db&ContentID=YEWIvbhb40aqdjMD1ALSqw',
+ 'md5': '9b83b4a2eead4912dc3b7fac7c449b6a',
+ 'info_dict': {
+ 'id': 'YEWIvbhb40aqdjMD1ALSqw',
+ 'ext': 'mp4',
+ 'title': 'Active Learning in Archeology',
+ 'upload_date': '20140616',
+ 'timestamp': 1402926346,
+ }
+ }
+ _NS = 'http://schemas.datacontract.org/2004/07/ensembleVideo.Data.Service.Contracts.Models.Player.Config'
+
+ def _real_extract(self, url):
+ qs = compat_parse_qs(re.match(self._VALID_URL, url).group('query'))
+ if not qs.get('PortalID') or not qs.get('DestinationID') or not qs.get('ContentID'):
+ raise ExtractorError('Invalid URL', expected=True)
+
+ portal_id = qs['PortalID'][0]
+ playlist_id = qs['DestinationID'][0]
+ content_id = qs['ContentID'][0]
+
+ content_data = self._download_xml('https://mediahub.rice.edu/api/portal/GetContentTitle', content_id, query={
+ 'portalId': portal_id,
+ 'playlistId': playlist_id,
+ 'contentId': content_id
+ })
+ metadata = xpath_element(content_data, './/metaData', fatal=True)
+ title = xpath_text(metadata, 'primaryTitle', fatal=True)
+ encodings = xpath_element(content_data, './/encodings', fatal=True)
+ player_data = self._download_xml('https://mediahub.rice.edu/api/player/GetPlayerConfig', content_id, query={
+ 'temporaryLinkId': xpath_text(encodings, 'temporaryLinkId', fatal=True),
+ 'contentId': content_id,
+ })
+
+ common_fmt = {}
+ dimensions = xpath_text(encodings, 'dimensions')
+ if dimensions:
+ wh = dimensions.split('x')
+ if len(wh) == 2:
+ common_fmt.update({
+ 'width': int_or_none(wh[0]),
+ 'height': int_or_none(wh[1]),
+ })
+
+ formats = []
+ rtsp_path = xpath_text(player_data, self._xpath_ns('RtspPath', self._NS))
+ if rtsp_path:
+ fmt = {
+ 'url': rtsp_path,
+ 'format_id': 'rtsp',
+ }
+ fmt.update(common_fmt)
+ formats.append(fmt)
+ for source in player_data.findall(self._xpath_ns('.//Source', self._NS)):
+ video_url = xpath_text(source, self._xpath_ns('File', self._NS))
+ if not video_url:
+ continue
+ if '.m3u8' in video_url:
+ formats.extend(self._extract_m3u8_formats(video_url, content_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
+ else:
+ fmt = {
+ 'url': video_url,
+ 'format_id': video_url.split(':')[0],
+ }
+ fmt.update(common_fmt)
+ rtmp = re.search(r'^(?P<url>rtmp://[^/]+/(?P<app>.+))/(?P<playpath>mp4:.+)$', video_url)
+ if rtmp:
+ fmt.update({
+ 'url': rtmp.group('url'),
+ 'play_path': rtmp.group('playpath'),
+ 'app': rtmp.group('app'),
+ 'ext': 'flv',
+ })
+ formats.append(fmt)
+ self._sort_formats(formats)
+
+ thumbnails = []
+ for content_asset in content_data.findall('.//contentAssets'):
+ asset_type = xpath_text(content_asset, 'type')
+ if asset_type == 'image':
+ image_url = xpath_text(content_asset, 'httpPath')
+ if not image_url:
+ continue
+ thumbnails.append({
+ 'id': xpath_text(content_asset, 'ID'),
+ 'url': image_url,
+ })
+
+ return {
+ 'id': content_id,
+ 'title': title,
+ 'description': xpath_text(metadata, 'abstract'),
+ 'duration': int_or_none(xpath_text(metadata, 'duration')),
+ 'timestamp': parse_iso8601(xpath_text(metadata, 'dateUpdated')),
+ 'thumbnails': thumbnails,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/ringtv.py b/youtube_dl/extractor/ringtv.py
index 5087580..2c2c707 100644
--- a/youtube_dl/extractor/ringtv.py
+++ b/youtube_dl/extractor/ringtv.py
@@ -6,7 +6,7 @@ from .common import InfoExtractor
class RingTVIE(InfoExtractor):
- _VALID_URL = r'http://(?:www\.)?ringtv\.craveonline\.com/(?P<type>news|videos/video)/(?P<id>[^/?#]+)'
+ _VALID_URL = r'https?://(?:www\.)?ringtv\.craveonline\.com/(?P<type>news|videos/video)/(?P<id>[^/?#]+)'
_TEST = {
'url': 'http://ringtv.craveonline.com/news/310833-luis-collazo-says-victor-ortiz-better-not-quit-on-jan-30',
'md5': 'd25945f5df41cdca2d2587165ac28720',
diff --git a/youtube_dl/extractor/rockstargames.py b/youtube_dl/extractor/rockstargames.py
new file mode 100644
index 0000000..48128e2
--- /dev/null
+++ b/youtube_dl/extractor/rockstargames.py
@@ -0,0 +1,69 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ int_or_none,
+ parse_iso8601,
+)
+
+
+class RockstarGamesIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?rockstargames\.com/videos(?:/video/|#?/?\?.*\bvideo=)(?P<id>\d+)'
+ _TESTS = [{
+ 'url': 'https://www.rockstargames.com/videos/video/11544/',
+ 'md5': '03b5caa6e357a4bd50e3143fc03e5733',
+ 'info_dict': {
+ 'id': '11544',
+ 'ext': 'mp4',
+ 'title': 'Further Adventures in Finance and Felony Trailer',
+ 'description': 'md5:6d31f55f30cb101b5476c4a379e324a3',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'timestamp': 1464876000,
+ 'upload_date': '20160602',
+ }
+ }, {
+ 'url': 'http://www.rockstargames.com/videos#/?video=48',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ video = self._download_json(
+ 'https://www.rockstargames.com/videoplayer/videos/get-video.json',
+ video_id, query={
+ 'id': video_id,
+ 'locale': 'en_us',
+ })['video']
+
+ title = video['title']
+
+ formats = []
+ for video in video['files_processed']['video/mp4']:
+ if not video.get('src'):
+ continue
+ resolution = video.get('resolution')
+ height = int_or_none(self._search_regex(
+ r'^(\d+)[pP]$', resolution or '', 'height', default=None))
+ formats.append({
+ 'url': self._proto_relative_url(video['src']),
+ 'format_id': resolution,
+ 'height': height,
+ })
+
+ if not formats:
+ youtube_id = video.get('youtube_id')
+ if youtube_id:
+ return self.url_result(youtube_id, 'Youtube')
+
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': video.get('description'),
+ 'thumbnail': self._proto_relative_url(video.get('screencap')),
+ 'timestamp': parse_iso8601(video.get('created')),
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/rottentomatoes.py b/youtube_dl/extractor/rottentomatoes.py
index e8bb20a..f9cd487 100644
--- a/youtube_dl/extractor/rottentomatoes.py
+++ b/youtube_dl/extractor/rottentomatoes.py
@@ -1,11 +1,11 @@
from __future__ import unicode_literals
-from .videodetective import VideoDetectiveIE
+from .common import InfoExtractor
+from ..compat import compat_urlparse
+from .internetvideoarchive import InternetVideoArchiveIE
-# It just uses the same method as videodetective.com,
-# the internetvideoarchive.com is extracted from the og:video property
-class RottenTomatoesIE(VideoDetectiveIE):
+class RottenTomatoesIE(InfoExtractor):
_VALID_URL = r'https?://www\.rottentomatoes\.com/m/[^/]+/trailers/(?P<id>\d+)'
_TEST = {
@@ -13,7 +13,19 @@ class RottenTomatoesIE(VideoDetectiveIE):
'info_dict': {
'id': '613340',
'ext': 'mp4',
- 'title': 'TOY STORY 3',
- 'description': 'From the creators of the beloved TOY STORY films, comes a story that will reunite the gang in a whole new way.',
+ 'title': 'Toy Story 3',
},
}
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+ og_video = self._og_search_video_url(webpage)
+ query = compat_urlparse.urlparse(og_video).query
+
+ return {
+ '_type': 'url_transparent',
+ 'url': InternetVideoArchiveIE._build_xml_url(query),
+ 'ie_key': InternetVideoArchiveIE.ie_key(),
+ 'title': self._og_search_title(webpage),
+ }
diff --git a/youtube_dl/extractor/rtbf.py b/youtube_dl/extractor/rtbf.py
index e42b319..28cc552 100644
--- a/youtube_dl/extractor/rtbf.py
+++ b/youtube_dl/extractor/rtbf.py
@@ -4,12 +4,18 @@ from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
int_or_none,
- unescapeHTML,
+ ExtractorError,
)
class RTBFIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?rtbf\.be/(?:video/[^?]+\?.*\bid=|ouftivi/(?:[^/]+/)*[^?]+\?.*\bvideoId=)(?P<id>\d+)'
+ _VALID_URL = r'''(?x)
+ https?://(?:www\.)?rtbf\.be/
+ (?:
+ video/[^?]+\?.*\bid=|
+ ouftivi/(?:[^/]+/)*[^?]+\?.*\bvideoId=|
+ auvio/[^/]+\?.*id=
+ )(?P<id>\d+)'''
_TESTS = [{
'url': 'https://www.rtbf.be/video/detail_les-diables-au-coeur-episode-2?id=1921274',
'md5': '799f334ddf2c0a582ba80c44655be570',
@@ -17,7 +23,11 @@ class RTBFIE(InfoExtractor):
'id': '1921274',
'ext': 'mp4',
'title': 'Les Diables au coeur (épisode 2)',
+ 'description': 'Football - Diables Rouges',
'duration': 3099,
+ 'upload_date': '20140425',
+ 'timestamp': 1398456336,
+ 'uploader': 'rtbfsport',
}
}, {
# geo restricted
@@ -26,45 +36,63 @@ class RTBFIE(InfoExtractor):
}, {
'url': 'http://www.rtbf.be/ouftivi/niouzz?videoId=2055858',
'only_matching': True,
+ }, {
+ 'url': 'http://www.rtbf.be/auvio/detail_jeudi-en-prime-siegfried-bracke?id=2102996',
+ 'only_matching': True,
}]
-
+ _IMAGE_HOST = 'http://ds1.ds.static.rtbf.be'
+ _PROVIDERS = {
+ 'YOUTUBE': 'Youtube',
+ 'DAILYMOTION': 'Dailymotion',
+ 'VIMEO': 'Vimeo',
+ }
_QUALITIES = [
- ('mobile', 'mobile'),
- ('web', 'SD'),
- ('url', 'MD'),
+ ('mobile', 'SD'),
+ ('web', 'MD'),
('high', 'HD'),
]
def _real_extract(self, url):
video_id = self._match_id(url)
+ data = self._download_json(
+ 'http://www.rtbf.be/api/media/video?method=getVideoDetail&args[]=%s' % video_id, video_id)
- webpage = self._download_webpage(
- 'http://www.rtbf.be/video/embed?id=%s' % video_id, video_id)
+ error = data.get('error')
+ if error:
+ raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True)
- data = self._parse_json(
- unescapeHTML(self._search_regex(
- r'data-media="([^"]+)"', webpage, 'data video')),
- video_id)
+ data = data['data']
+
+ provider = data.get('provider')
+ if provider in self._PROVIDERS:
+ return self.url_result(data['url'], self._PROVIDERS[provider])
- if data.get('provider').lower() == 'youtube':
- video_url = data.get('downloadUrl') or data.get('url')
- return self.url_result(video_url, 'Youtube')
formats = []
for key, format_id in self._QUALITIES:
- format_url = data['sources'].get(key)
+ format_url = data.get(key + 'Url')
if format_url:
formats.append({
'format_id': format_id,
'url': format_url,
})
+ thumbnails = []
+ for thumbnail_id, thumbnail_url in data.get('thumbnail', {}).items():
+ if thumbnail_id != 'default':
+ thumbnails.append({
+ 'url': self._IMAGE_HOST + thumbnail_url,
+ 'id': thumbnail_id,
+ })
+
return {
'id': video_id,
'formats': formats,
'title': data['title'],
'description': data.get('description') or data.get('subtitle'),
- 'thumbnail': data.get('thumbnail'),
+ 'thumbnails': thumbnails,
'duration': data.get('duration') or data.get('realDuration'),
'timestamp': int_or_none(data.get('created')),
'view_count': int_or_none(data.get('viewCount')),
+ 'uploader': data.get('channel'),
+ 'tags': data.get('tags'),
}
diff --git a/youtube_dl/extractor/rte.py b/youtube_dl/extractor/rte.py
index 042bc8d..ebe563e 100644
--- a/youtube_dl/extractor/rte.py
+++ b/youtube_dl/extractor/rte.py
@@ -39,9 +39,14 @@ class RteIE(InfoExtractor):
duration = float_or_none(self._html_search_meta(
'duration', webpage, 'duration', fatal=False), 1000)
- thumbnail_id = self._search_regex(
- r'<meta name="thumbnail" content="uri:irus:(.*?)" />', webpage, 'thumbnail')
- thumbnail = 'http://img.rasset.ie/' + thumbnail_id + '.jpg'
+ thumbnail = None
+ thumbnail_meta = self._html_search_meta('thumbnail', webpage)
+ if thumbnail_meta:
+ thumbnail_id = self._search_regex(
+ r'uri:irus:(.+)', thumbnail_meta,
+ 'thumbnail id', fatal=False)
+ if thumbnail_id:
+ thumbnail = 'http://img.rasset.ie/%s.jpg' % thumbnail_id
feeds_url = self._html_search_meta('feeds-prefix', webpage, 'feeds url') + video_id
json_string = self._download_json(feeds_url, video_id)
@@ -49,6 +54,7 @@ class RteIE(InfoExtractor):
# f4m_url = server + relative_url
f4m_url = json_string['shows'][0]['media:group'][0]['rte:server'] + json_string['shows'][0]['media:group'][0]['url']
f4m_formats = self._extract_f4m_formats(f4m_url, video_id)
+ self._sort_formats(f4m_formats)
return {
'id': video_id,
diff --git a/youtube_dl/extractor/rtlnl.py b/youtube_dl/extractor/rtlnl.py
index 543d944..4d612b5 100644
--- a/youtube_dl/extractor/rtlnl.py
+++ b/youtube_dl/extractor/rtlnl.py
@@ -20,18 +20,19 @@ class RtlNlIE(InfoExtractor):
(?P<id>[0-9a-f-]+)'''
_TESTS = [{
- 'url': 'http://www.rtlxl.nl/#!/rtl-nieuws-132237/6e4203a6-0a5e-3596-8424-c599a59e0677',
- 'md5': 'cc16baa36a6c169391f0764fa6b16654',
+ 'url': 'http://www.rtlxl.nl/#!/rtl-nieuws-132237/82b1aad1-4a14-3d7b-b554-b0aed1b2c416',
+ 'md5': '473d1946c1fdd050b2c0161a4b13c373',
'info_dict': {
- 'id': '6e4203a6-0a5e-3596-8424-c599a59e0677',
+ 'id': '82b1aad1-4a14-3d7b-b554-b0aed1b2c416',
'ext': 'mp4',
- 'title': 'RTL Nieuws - Laat',
- 'description': 'md5:6b61f66510c8889923b11f2778c72dc5',
- 'timestamp': 1408051800,
- 'upload_date': '20140814',
- 'duration': 576.880,
+ 'title': 'RTL Nieuws',
+ 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
+ 'timestamp': 1461951000,
+ 'upload_date': '20160429',
+ 'duration': 1167.96,
},
}, {
+ # best format avaialble a3t
'url': 'http://www.rtl.nl/system/videoplayer/derden/rtlnieuws/video_embed.html#uuid=84ae5571-ac25-4225-ae0c-ef8d9efb2aed/autoplay=false',
'md5': 'dea7474214af1271d91ef332fb8be7ea',
'info_dict': {
@@ -39,18 +40,19 @@ class RtlNlIE(InfoExtractor):
'ext': 'mp4',
'timestamp': 1424039400,
'title': 'RTL Nieuws - Nieuwe beelden Kopenhagen: chaos direct na aanslag',
- 'thumbnail': 're:^https?://screenshots\.rtl\.nl/system/thumb/sz=[0-9]+x[0-9]+/uuid=84ae5571-ac25-4225-ae0c-ef8d9efb2aed$',
+ 'thumbnail': 're:^https?://screenshots\.rtl\.nl/(?:[^/]+/)*sz=[0-9]+x[0-9]+/uuid=84ae5571-ac25-4225-ae0c-ef8d9efb2aed$',
'upload_date': '20150215',
'description': 'Er zijn nieuwe beelden vrijgegeven die vlak na de aanslag in Kopenhagen zijn gemaakt. Op de video is goed te zien hoe omstanders zich bekommeren om één van de slachtoffers, terwijl de eerste agenten ter plaatse komen.',
}
}, {
# empty synopsis and missing episodes (see https://github.com/rg3/youtube-dl/issues/6275)
+ # best format available nettv
'url': 'http://www.rtl.nl/system/videoplayer/derden/rtlnieuws/video_embed.html#uuid=f536aac0-1dc3-4314-920e-3bd1c5b3811a/autoplay=false',
'info_dict': {
'id': 'f536aac0-1dc3-4314-920e-3bd1c5b3811a',
'ext': 'mp4',
'title': 'RTL Nieuws - Meer beelden van overval juwelier',
- 'thumbnail': 're:^https?://screenshots\.rtl\.nl/system/thumb/sz=[0-9]+x[0-9]+/uuid=f536aac0-1dc3-4314-920e-3bd1c5b3811a$',
+ 'thumbnail': 're:^https?://screenshots\.rtl\.nl/(?:[^/]+/)*sz=[0-9]+x[0-9]+/uuid=f536aac0-1dc3-4314-920e-3bd1c5b3811a$',
'timestamp': 1437233400,
'upload_date': '20150718',
'duration': 30.474,
@@ -94,22 +96,46 @@ class RtlNlIE(InfoExtractor):
videopath = material['videopath']
m3u8_url = meta.get('videohost', 'http://manifest.us.rtl.nl') + videopath
- formats = self._extract_m3u8_formats(m3u8_url, uuid, ext='mp4')
+ formats = self._extract_m3u8_formats(
+ m3u8_url, uuid, 'mp4', m3u8_id='hls', fatal=False)
video_urlpart = videopath.split('/adaptive/')[1][:-5]
PG_URL_TEMPLATE = 'http://pg.us.rtl.nl/rtlxl/network/%s/progressive/%s.mp4'
- formats.extend([
- {
- 'url': PG_URL_TEMPLATE % ('a2m', video_urlpart),
- 'format_id': 'pg-sd',
- },
- {
- 'url': PG_URL_TEMPLATE % ('a3m', video_urlpart),
- 'format_id': 'pg-hd',
- 'quality': 0,
+ PG_FORMATS = (
+ ('a2t', 512, 288),
+ ('a3t', 704, 400),
+ ('nettv', 1280, 720),
+ )
+
+ def pg_format(format_id, width, height):
+ return {
+ 'url': PG_URL_TEMPLATE % (format_id, video_urlpart),
+ 'format_id': 'pg-%s' % format_id,
+ 'protocol': 'http',
+ 'width': width,
+ 'height': height,
}
- ])
+
+ if not formats:
+ formats = [pg_format(*pg_tuple) for pg_tuple in PG_FORMATS]
+ else:
+ pg_formats = []
+ for format_id, width, height in PG_FORMATS:
+ try:
+ # Find hls format with the same width and height corresponding
+ # to progressive format and copy metadata from it.
+ f = next(f for f in formats if f.get('height') == height)
+ # hls formats may have invalid width
+ f['width'] = width
+ f_copy = f.copy()
+ f_copy.update(pg_format(format_id, width, height))
+ pg_formats.append(f_copy)
+ except StopIteration:
+ # Missing hls format does mean that no progressive format with
+ # such width and height exists either.
+ pass
+ formats.extend(pg_formats)
self._sort_formats(formats)
thumbnails = []
diff --git a/youtube_dl/extractor/rtve.py b/youtube_dl/extractor/rtve.py
index 603d7bd..f11e358 100644
--- a/youtube_dl/extractor/rtve.py
+++ b/youtube_dl/extractor/rtve.py
@@ -6,13 +6,16 @@ import re
import time
from .common import InfoExtractor
+from ..compat import (
+ compat_struct_unpack,
+)
from ..utils import (
ExtractorError,
float_or_none,
remove_end,
+ remove_start,
sanitized_Request,
std_headers,
- struct_unpack,
)
@@ -20,7 +23,7 @@ def _decrypt_url(png):
encrypted_data = base64.b64decode(png.encode('utf-8'))
text_index = encrypted_data.find(b'tEXt')
text_chunk = encrypted_data[text_index - 4:]
- length = struct_unpack('!I', text_chunk[:4])[0]
+ length = compat_struct_unpack('!I', text_chunk[:4])[0]
# Use bytearray to get integers when iterating in both python 2.x and 3.x
data = bytearray(text_chunk[8:8 + length])
data = [chr(b) for b in data if b != 0]
@@ -61,7 +64,7 @@ def _decrypt_url(png):
class RTVEALaCartaIE(InfoExtractor):
IE_NAME = 'rtve.es:alacarta'
IE_DESC = 'RTVE a la carta'
- _VALID_URL = r'http://www\.rtve\.es/(m/)?alacarta/videos/[^/]+/[^/]+/(?P<id>\d+)'
+ _VALID_URL = r'https?://www\.rtve\.es/(m/)?(alacarta/videos|filmoteca)/[^/]+/[^/]+/(?P<id>\d+)'
_TESTS = [{
'url': 'http://www.rtve.es/alacarta/videos/balonmano/o-swiss-cup-masculina-final-espana-suecia/2491869/',
@@ -84,6 +87,9 @@ class RTVEALaCartaIE(InfoExtractor):
}, {
'url': 'http://www.rtve.es/m/alacarta/videos/cuentame-como-paso/cuentame-como-paso-t16-ultimo-minuto-nuestra-vida-capitulo-276/2969138/?media=tve',
'only_matching': True,
+ }, {
+ 'url': 'http://www.rtve.es/filmoteca/no-do/not-1-introduccion-primer-noticiario-espanol/1465256/',
+ 'only_matching': True,
}]
def _real_initialize(self):
@@ -178,14 +184,14 @@ class RTVEInfantilIE(InfoExtractor):
class RTVELiveIE(InfoExtractor):
IE_NAME = 'rtve.es:live'
IE_DESC = 'RTVE.es live streams'
- _VALID_URL = r'http://www\.rtve\.es/(?:deportes/directo|noticias|television)/(?P<id>[a-zA-Z0-9-]+)'
+ _VALID_URL = r'https?://www\.rtve\.es/directo/(?P<id>[a-zA-Z0-9-]+)'
_TESTS = [{
- 'url': 'http://www.rtve.es/noticias/directo-la-1/',
+ 'url': 'http://www.rtve.es/directo/la-1/',
'info_dict': {
- 'id': 'directo-la-1',
- 'ext': 'flv',
- 'title': 're:^La 1 de TVE [0-9]{4}-[0-9]{2}-[0-9]{2}Z[0-9]{6}$',
+ 'id': 'la-1',
+ 'ext': 'mp4',
+ 'title': 're:^La 1 [0-9]{4}-[0-9]{2}-[0-9]{2}Z[0-9]{6}$',
},
'params': {
'skip_download': 'live stream',
@@ -198,23 +204,21 @@ class RTVELiveIE(InfoExtractor):
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
- player_url = self._search_regex(
- r'<param name="movie" value="([^"]+)"/>', webpage, 'player URL')
- title = remove_end(self._og_search_title(webpage), ' en directo')
+ title = remove_end(self._og_search_title(webpage), ' en directo en RTVE.es')
+ title = remove_start(title, 'Estoy viendo ')
title += ' ' + time.strftime('%Y-%m-%dZ%H%M%S', start_time)
vidplayer_id = self._search_regex(
- r' id="vidplayer([0-9]+)"', webpage, 'internal video ID')
- png_url = 'http://www.rtve.es/ztnr/movil/thumbnail/default/videos/%s.png' % vidplayer_id
+ r'playerId=player([0-9]+)', webpage, 'internal video ID')
+ png_url = 'http://www.rtve.es/ztnr/movil/thumbnail/amonet/videos/%s.png' % vidplayer_id
png = self._download_webpage(png_url, video_id, 'Downloading url information')
- video_url = _decrypt_url(png)
+ m3u8_url = _decrypt_url(png)
+ formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4')
+ self._sort_formats(formats)
return {
'id': video_id,
- 'ext': 'flv',
'title': title,
- 'url': video_url,
- 'app': 'rtve-live-live?ovpfv=2.1.2',
- 'player_url': player_url,
- 'rtmp_live': True,
+ 'formats': formats,
+ 'is_live': True,
}
diff --git a/youtube_dl/extractor/rtvnh.py b/youtube_dl/extractor/rtvnh.py
index 7c9d4b0..4896d09 100644
--- a/youtube_dl/extractor/rtvnh.py
+++ b/youtube_dl/extractor/rtvnh.py
@@ -38,6 +38,7 @@ class RTVNHIE(InfoExtractor):
item['file'], video_id, ext='mp4', entry_protocol='m3u8_native'))
elif item.get('type') == '':
formats.append({'url': item['file']})
+ self._sort_formats(formats)
return {
'id': video_id,
diff --git a/youtube_dl/extractor/ruhd.py b/youtube_dl/extractor/ruhd.py
index 0e470e7..1f7c262 100644
--- a/youtube_dl/extractor/ruhd.py
+++ b/youtube_dl/extractor/ruhd.py
@@ -5,7 +5,7 @@ from .common import InfoExtractor
class RUHDIE(InfoExtractor):
- _VALID_URL = r'http://(?:www\.)?ruhd\.ru/play\.php\?vid=(?P<id>\d+)'
+ _VALID_URL = r'https?://(?:www\.)?ruhd\.ru/play\.php\?vid=(?P<id>\d+)'
_TEST = {
'url': 'http://www.ruhd.ru/play.php?vid=207',
'md5': 'd1a9ec4edf8598e3fbd92bb16072ba83',
diff --git a/youtube_dl/extractor/rutube.py b/youtube_dl/extractor/rutube.py
index c5c47d0..9ca4ae1 100644
--- a/youtube_dl/extractor/rutube.py
+++ b/youtube_dl/extractor/rutube.py
@@ -122,7 +122,7 @@ class RutubeEmbedIE(InfoExtractor):
class RutubeChannelIE(InfoExtractor):
IE_NAME = 'rutube:channel'
IE_DESC = 'Rutube channels'
- _VALID_URL = r'http://rutube\.ru/tags/video/(?P<id>\d+)'
+ _VALID_URL = r'https?://rutube\.ru/tags/video/(?P<id>\d+)'
_TESTS = [{
'url': 'http://rutube.ru/tags/video/1800/',
'info_dict': {
@@ -156,7 +156,7 @@ class RutubeChannelIE(InfoExtractor):
class RutubeMovieIE(RutubeChannelIE):
IE_NAME = 'rutube:movie'
IE_DESC = 'Rutube movies'
- _VALID_URL = r'http://rutube\.ru/metainfo/tv/(?P<id>\d+)'
+ _VALID_URL = r'https?://rutube\.ru/metainfo/tv/(?P<id>\d+)'
_TESTS = []
_MOVIE_TEMPLATE = 'http://rutube.ru/api/metainfo/tv/%s/?format=json'
@@ -174,7 +174,7 @@ class RutubeMovieIE(RutubeChannelIE):
class RutubePersonIE(RutubeChannelIE):
IE_NAME = 'rutube:person'
IE_DESC = 'Rutube person videos'
- _VALID_URL = r'http://rutube\.ru/video/person/(?P<id>\d+)'
+ _VALID_URL = r'https?://rutube\.ru/video/person/(?P<id>\d+)'
_TESTS = [{
'url': 'http://rutube.ru/video/person/313878/',
'info_dict': {
diff --git a/youtube_dl/extractor/rutv.py b/youtube_dl/extractor/rutv.py
index f7fe1fe..a2379eb 100644
--- a/youtube_dl/extractor/rutv.py
+++ b/youtube_dl/extractor/rutv.py
@@ -14,7 +14,7 @@ class RUTVIE(InfoExtractor):
IE_DESC = 'RUTV.RU'
_VALID_URL = r'''(?x)
https?://player\.(?:rutv\.ru|vgtrk\.com)/
- (?P<path>flash2v/container\.swf\?id=
+ (?P<path>flash\d+v/container\.swf\?id=
|iframe/(?P<type>swf|video|live)/id/
|index/iframe/cast_id/)
(?P<id>\d+)'''
@@ -109,7 +109,7 @@ class RUTVIE(InfoExtractor):
return mobj.group('url')
mobj = re.search(
- r'<meta[^>]+?property=(["\'])og:video\1[^>]+?content=(["\'])(?P<url>https?://player\.(?:rutv\.ru|vgtrk\.com)/flash2v/container\.swf\?id=.+?\2)',
+ r'<meta[^>]+?property=(["\'])og:video\1[^>]+?content=(["\'])(?P<url>https?://player\.(?:rutv\.ru|vgtrk\.com)/flash\d+v/container\.swf\?id=.+?\2)',
webpage)
if mobj:
return mobj.group('url')
@@ -119,7 +119,7 @@ class RUTVIE(InfoExtractor):
video_id = mobj.group('id')
video_path = mobj.group('path')
- if video_path.startswith('flash2v'):
+ if re.match(r'flash\d+v', video_path):
video_type = 'video'
elif video_path.startswith('iframe'):
video_type = mobj.group('type')
@@ -168,7 +168,7 @@ class RUTVIE(InfoExtractor):
'play_path': mobj.group('playpath'),
'app': mobj.group('app'),
'page_url': 'http://player.rutv.ru',
- 'player_url': 'http://player.rutv.ru/flash2v/osmf.swf?i=22',
+ 'player_url': 'http://player.rutv.ru/flash3v/osmf.swf?i=22',
'rtmp_live': True,
'ext': 'flv',
'vbr': int(quality),
diff --git a/youtube_dl/extractor/safari.py b/youtube_dl/extractor/safari.py
index 7de7b72..6ba91f2 100644
--- a/youtube_dl/extractor/safari.py
+++ b/youtube_dl/extractor/safari.py
@@ -4,14 +4,13 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from .brightcove import BrightcoveLegacyIE
from ..utils import (
ExtractorError,
sanitized_Request,
- smuggle_url,
std_headers,
urlencode_postdata,
+ update_url_query,
)
@@ -20,28 +19,30 @@ class SafariBaseIE(InfoExtractor):
_SUCCESSFUL_LOGIN_REGEX = r'<a href="/accounts/logout/"[^>]*>Sign Out</a>'
_NETRC_MACHINE = 'safari'
- _API_BASE = 'https://www.safaribooksonline.com/api/v1/book'
+ _API_BASE = 'https://www.safaribooksonline.com/api/v1'
_API_FORMAT = 'json'
LOGGED_IN = False
def _real_initialize(self):
- # We only need to log in once for courses or individual videos
- if not self.LOGGED_IN:
- self._login()
- SafariBaseIE.LOGGED_IN = True
+ self._login()
def _login(self):
+ # We only need to log in once for courses or individual videos
+ if self.LOGGED_IN:
+ return
+
(username, password) = self._get_login_info()
if username is None:
- self.raise_login_required('safaribooksonline.com account is required')
+ return
- headers = std_headers
+ headers = std_headers.copy()
if 'Referer' not in headers:
headers['Referer'] = self._LOGIN_URL
+ login_page_request = sanitized_Request(self._LOGIN_URL, headers=headers)
login_page = self._download_webpage(
- self._LOGIN_URL, None,
+ login_page_request, None,
'Downloading login form')
csrf = self._html_search_regex(
@@ -66,35 +67,27 @@ class SafariBaseIE(InfoExtractor):
'Login failed; make sure your credentials are correct and try again.',
expected=True)
+ SafariBaseIE.LOGGED_IN = True
+
self.to_screen('Login successful')
class SafariIE(SafariBaseIE):
IE_NAME = 'safari'
IE_DESC = 'safaribooksonline.com online video'
- _VALID_URL = r'''(?x)https?://
- (?:www\.)?safaribooksonline\.com/
- (?:
- library/view/[^/]+|
- api/v1/book
- )/
- (?P<course_id>[^/]+)/
- (?:chapter(?:-content)?/)?
- (?P<part>part\d+)\.html
- '''
+ _VALID_URL = r'https?://(?:www\.)?safaribooksonline\.com/library/view/[^/]+/(?P<course_id>[^/]+)/(?P<part>part\d+)\.html'
_TESTS = [{
'url': 'https://www.safaribooksonline.com/library/view/hadoop-fundamentals-livelessons/9780133392838/part00.html',
- 'md5': '5b0c4cc1b3c1ba15dda7344085aa5592',
+ 'md5': 'dcc5a425e79f2564148652616af1f2a3',
'info_dict': {
- 'id': '2842601850001',
+ 'id': '0_qbqx90ic',
'ext': 'mp4',
- 'title': 'Introduction',
+ 'title': 'Introduction to Hadoop Fundamentals LiveLessons',
+ 'timestamp': 1437758058,
+ 'upload_date': '20150724',
+ 'uploader_id': 'stork',
},
- 'skip': 'Requires safaribooksonline account credentials',
- }, {
- 'url': 'https://www.safaribooksonline.com/api/v1/book/9780133392838/chapter/part00.html',
- 'only_matching': True,
}, {
# non-digits in course id
'url': 'https://www.safaribooksonline.com/library/view/create-a-nodejs/100000006A0210/part00.html',
@@ -103,18 +96,55 @@ class SafariIE(SafariBaseIE):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- course_id = mobj.group('course_id')
- part = mobj.group('part')
+ video_id = '%s/%s' % (mobj.group('course_id'), mobj.group('part'))
+
+ webpage = self._download_webpage(url, video_id)
+ reference_id = self._search_regex(
+ r'data-reference-id=(["\'])(?P<id>.+?)\1',
+ webpage, 'kaltura reference id', group='id')
+ partner_id = self._search_regex(
+ r'data-partner-id=(["\'])(?P<id>.+?)\1',
+ webpage, 'kaltura widget id', group='id')
+ ui_id = self._search_regex(
+ r'data-ui-id=(["\'])(?P<id>.+?)\1',
+ webpage, 'kaltura uiconf id', group='id')
+
+ query = {
+ 'wid': '_%s' % partner_id,
+ 'uiconf_id': ui_id,
+ 'flashvars[referenceId]': reference_id,
+ }
+
+ if self.LOGGED_IN:
+ kaltura_session = self._download_json(
+ '%s/player/kaltura_session/?reference_id=%s' % (self._API_BASE, reference_id),
+ video_id, 'Downloading kaltura session JSON',
+ 'Unable to download kaltura session JSON', fatal=False)
+ if kaltura_session:
+ session = kaltura_session.get('session')
+ if session:
+ query['flashvars[ks]'] = session
+
+ return self.url_result(update_url_query(
+ 'https://cdnapisec.kaltura.com/html5/html5lib/v2.37.1/mwEmbedFrame.php', query),
+ 'Kaltura')
+
- webpage = self._download_webpage(
- '%s/%s/chapter-content/%s.html' % (self._API_BASE, course_id, part),
- part)
+class SafariApiIE(SafariBaseIE):
+ IE_NAME = 'safari:api'
+ _VALID_URL = r'https?://(?:www\.)?safaribooksonline\.com/api/v1/book/(?P<course_id>[^/]+)/chapter(?:-content)?/(?P<part>part\d+)\.html'
- bc_url = BrightcoveLegacyIE._extract_brightcove_url(webpage)
- if not bc_url:
- raise ExtractorError('Could not extract Brightcove URL from %s' % url, expected=True)
+ _TEST = {
+ 'url': 'https://www.safaribooksonline.com/api/v1/book/9780133392838/chapter/part00.html',
+ 'only_matching': True,
+ }
- return self.url_result(smuggle_url(bc_url, {'Referer': url}), 'BrightcoveLegacy')
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ part = self._download_json(
+ url, '%s/%s' % (mobj.group('course_id'), mobj.group('part')),
+ 'Downloading part JSON')
+ return self.url_result(part['web_url'], SafariIE.ie_key())
class SafariCourseIE(SafariBaseIE):
@@ -140,7 +170,7 @@ class SafariCourseIE(SafariBaseIE):
course_id = self._match_id(url)
course_json = self._download_json(
- '%s/%s/?override_format=%s' % (self._API_BASE, course_id, self._API_FORMAT),
+ '%s/book/%s/?override_format=%s' % (self._API_BASE, course_id, self._API_FORMAT),
course_id, 'Downloading course JSON')
if 'chapters' not in course_json:
@@ -148,7 +178,7 @@ class SafariCourseIE(SafariBaseIE):
'No chapters found for course %s' % course_id, expected=True)
entries = [
- self.url_result(chapter, 'Safari')
+ self.url_result(chapter, SafariApiIE.ie_key())
for chapter in course_json['chapters']]
course_title = course_json['title']
diff --git a/youtube_dl/extractor/sbs.py b/youtube_dl/extractor/sbs.py
index d6ee2d9..96472fb 100644
--- a/youtube_dl/extractor/sbs.py
+++ b/youtube_dl/extractor/sbs.py
@@ -2,6 +2,10 @@
from __future__ import unicode_literals
from .common import InfoExtractor
+from ..utils import (
+ smuggle_url,
+ ExtractorError,
+)
class SBSIE(InfoExtractor):
@@ -20,6 +24,9 @@ class SBSIE(InfoExtractor):
'description': 'md5:f250a9856fca50d22dec0b5b8015f8a5',
'thumbnail': 're:http://.*\.jpg',
'duration': 308,
+ 'timestamp': 1408613220,
+ 'upload_date': '20140821',
+ 'uploader': 'SBSC',
},
}, {
'url': 'http://www.sbs.com.au/ondemand/video/320403011771/Dingo-Conservation-The-Feed',
@@ -31,21 +38,29 @@ class SBSIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
+ player_params = self._download_json(
+ 'http://www.sbs.com.au/api/video_pdkvars/id/%s?form=json' % video_id, video_id)
- webpage = self._download_webpage(
- 'http://www.sbs.com.au/ondemand/video/single/%s?context=web' % video_id, video_id)
-
- player_params = self._parse_json(
- self._search_regex(
- r'(?s)var\s+playerParams\s*=\s*({.+?});', webpage, 'playerParams'),
- video_id)
+ error = player_params.get('error')
+ if error:
+ error_message = 'Sorry, The video you are looking for does not exist.'
+ video_data = error.get('results') or {}
+ error_code = error.get('errorCode')
+ if error_code == 'ComingSoon':
+ error_message = '%s is not yet available.' % video_data.get('title', '')
+ elif error_code in ('Forbidden', 'intranetAccessOnly'):
+ error_message = 'Sorry, This video cannot be accessed via this website'
+ elif error_code == 'Expired':
+ error_message = 'Sorry, %s is no longer available.' % video_data.get('title', '')
+ raise ExtractorError('%s said: %s' % (self.IE_NAME, error_message), expected=True)
urls = player_params['releaseUrls']
- theplatform_url = (urls.get('progressive') or urls.get('standard') or
- urls.get('html') or player_params['relatedItemsURL'])
+ theplatform_url = (urls.get('progressive') or urls.get('html') or
+ urls.get('standard') or player_params['relatedItemsURL'])
return {
'_type': 'url_transparent',
+ 'ie_key': 'ThePlatform',
'id': video_id,
- 'url': theplatform_url,
+ 'url': smuggle_url(self._proto_relative_url(theplatform_url), {'force_smil_url': True}),
}
diff --git a/youtube_dl/extractor/scivee.py b/youtube_dl/extractor/scivee.py
index 3bf93c8..b1ca12f 100644
--- a/youtube_dl/extractor/scivee.py
+++ b/youtube_dl/extractor/scivee.py
@@ -18,6 +18,7 @@ class SciVeeIE(InfoExtractor):
'title': 'Adam Arkin at the 2014 DOE JGI Genomics of Energy & Environment Meeting',
'description': 'md5:81f1710638e11a481358fab1b11059d7',
},
+ 'skip': 'Not accessible from Travis CI server',
}
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/screencast.py b/youtube_dl/extractor/screencast.py
index dfd897b..3566317 100644
--- a/youtube_dl/extractor/screencast.py
+++ b/youtube_dl/extractor/screencast.py
@@ -12,7 +12,7 @@ from ..utils import (
class ScreencastIE(InfoExtractor):
- _VALID_URL = r'https?://www\.screencast\.com/t/(?P<id>[a-zA-Z0-9]+)'
+ _VALID_URL = r'https?://(?:www\.)?screencast\.com/t/(?P<id>[a-zA-Z0-9]+)'
_TESTS = [{
'url': 'http://www.screencast.com/t/3ZEjQXlT',
'md5': '917df1c13798a3e96211dd1561fded83',
@@ -53,8 +53,10 @@ class ScreencastIE(InfoExtractor):
'description': 'md5:7b9f393bc92af02326a5c5889639eab0',
'thumbnail': 're:^https?://.*\.(?:gif|jpg)$',
}
- },
- ]
+ }, {
+ 'url': 'http://screencast.com/t/aAB3iowa',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
video_id = self._match_id(url)
@@ -94,8 +96,9 @@ class ScreencastIE(InfoExtractor):
title = self._og_search_title(webpage, default=None)
if title is None:
title = self._html_search_regex(
- [r'<b>Title:</b> ([^<]*)</div>',
- r'class="tabSeperator">></span><span class="tabText">(.*?)<'],
+ [r'<b>Title:</b> ([^<]+)</div>',
+ r'class="tabSeperator">></span><span class="tabText">(.+?)<',
+ r'<title>([^<]+)</title>'],
webpage, 'title')
thumbnail = self._og_search_thumbnail(webpage)
description = self._og_search_description(webpage, default=None)
diff --git a/youtube_dl/extractor/screencastomatic.py b/youtube_dl/extractor/screencastomatic.py
index 0533742..7a88a42 100644
--- a/youtube_dl/extractor/screencastomatic.py
+++ b/youtube_dl/extractor/screencastomatic.py
@@ -1,15 +1,11 @@
# coding: utf-8
from __future__ import unicode_literals
-from .common import InfoExtractor
-from ..compat import compat_urlparse
-from ..utils import (
- ExtractorError,
- js_to_json,
-)
+from .jwplatform import JWPlatformBaseIE
+from ..utils import js_to_json
-class ScreencastOMaticIE(InfoExtractor):
+class ScreencastOMaticIE(JWPlatformBaseIE):
_VALID_URL = r'https?://screencast-o-matic\.com/watch/(?P<id>[0-9a-zA-Z]+)'
_TEST = {
'url': 'http://screencast-o-matic.com/watch/c2lD3BeOPl',
@@ -20,6 +16,7 @@ class ScreencastOMaticIE(InfoExtractor):
'title': 'Welcome to 3-4 Philosophy @ DECV!',
'thumbnail': 're:^https?://.*\.jpg$',
'description': 'as the title says! also: some general info re 1) VCE philosophy and 2) distance learning.',
+ 'duration': 369.163,
}
}
@@ -27,23 +24,14 @@ class ScreencastOMaticIE(InfoExtractor):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- setup_js = self._search_regex(
- r"(?s)jwplayer\('mp4Player'\).setup\((\{.*?\})\);",
- webpage, 'setup code')
- data = self._parse_json(setup_js, video_id, transform_source=js_to_json)
- try:
- video_data = next(
- m for m in data['modes'] if m.get('type') == 'html5')
- except StopIteration:
- raise ExtractorError('Could not find any video entries!')
- video_url = compat_urlparse.urljoin(url, video_data['config']['file'])
- thumbnail = data.get('image')
+ jwplayer_data = self._parse_json(
+ self._search_regex(
+ r"(?s)jwplayer\('mp4Player'\).setup\((\{.*?\})\);", webpage, 'setup code'),
+ video_id, transform_source=js_to_json)
- return {
- 'id': video_id,
+ info_dict = self._parse_jwplayer_data(jwplayer_data, video_id, require_title=False)
+ info_dict.update({
'title': self._og_search_title(webpage),
'description': self._og_search_description(webpage),
- 'url': video_url,
- 'ext': 'mp4',
- 'thumbnail': thumbnail,
- }
+ })
+ return info_dict
diff --git a/youtube_dl/extractor/screenjunkies.py b/youtube_dl/extractor/screenjunkies.py
index f2af15f..dd0a6ba 100644
--- a/youtube_dl/extractor/screenjunkies.py
+++ b/youtube_dl/extractor/screenjunkies.py
@@ -11,7 +11,7 @@ from ..utils import (
class ScreenJunkiesIE(InfoExtractor):
- _VALID_URL = r'http://www.screenjunkies.com/video/(?P<display_id>[^/]+?)(?:-(?P<id>\d+))?(?:[/?#&]|$)'
+ _VALID_URL = r'https?://www.screenjunkies.com/video/(?P<display_id>[^/]+?)(?:-(?P<id>\d+))?(?:[/?#&]|$)'
_TESTS = [{
'url': 'http://www.screenjunkies.com/video/best-quentin-tarantino-movie-2841915',
'md5': '5c2b686bec3d43de42bde9ec047536b0',
diff --git a/youtube_dl/extractor/screenwavemedia.py b/youtube_dl/extractor/screenwavemedia.py
index 2cf210e..40333c8 100644
--- a/youtube_dl/extractor/screenwavemedia.py
+++ b/youtube_dl/extractor/screenwavemedia.py
@@ -12,7 +12,7 @@ from ..utils import (
class ScreenwaveMediaIE(InfoExtractor):
- _VALID_URL = r'https?://player\d?\.screenwavemedia\.com/(?:play/)?[a-zA-Z]+\.php\?.*\bid=(?P<id>[A-Za-z0-9-]+)'
+ _VALID_URL = r'(?:https?:)?//player\d?\.screenwavemedia\.com/(?:play/)?[a-zA-Z]+\.php\?.*\bid=(?P<id>[A-Za-z0-9-]+)'
EMBED_PATTERN = r'src=(["\'])(?P<url>(?:https?:)?//player\d?\.screenwavemedia\.com/(?:play/)?[a-zA-Z]+\.php\?.*\bid=.+?)\1'
_TESTS = [{
'url': 'http://player.screenwavemedia.com/play/play.php?playerdiv=videoarea&companiondiv=squareAd&id=Cinemassacre-19911',
@@ -70,25 +70,27 @@ class ScreenwaveMediaIE(InfoExtractor):
formats = []
for source in sources:
- if source['type'] == 'hls':
- formats.extend(self._extract_m3u8_formats(source['file'], video_id, ext='mp4'))
+ file_ = source.get('file')
+ if not file_:
+ continue
+ if source.get('type') == 'hls':
+ formats.extend(self._extract_m3u8_formats(file_, video_id, ext='mp4'))
else:
- file_ = source.get('file')
- if not file_:
- continue
- format_label = source.get('label')
format_id = self._search_regex(
r'_(.+?)\.[^.]+$', file_, 'format id', default=None)
+ if not self._is_valid_url(file_, video_id, format_id or 'video'):
+ continue
+ format_label = source.get('label')
height = int_or_none(self._search_regex(
r'^(\d+)[pP]', format_label, 'height', default=None))
formats.append({
- 'url': source['file'],
+ 'url': file_,
'format_id': format_id,
'format': format_label,
'ext': source.get('type'),
'height': height,
})
- self._sort_formats(formats)
+ self._sort_formats(formats, field_preference=('height', 'width', 'tbr', 'format_id'))
return {
'id': video_id,
diff --git a/youtube_dl/extractor/seeker.py b/youtube_dl/extractor/seeker.py
new file mode 100644
index 0000000..3b9c65e
--- /dev/null
+++ b/youtube_dl/extractor/seeker.py
@@ -0,0 +1,57 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class SeekerIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?seeker\.com/(?P<display_id>.*)-(?P<article_id>\d+)\.html'
+ _TESTS = [{
+ # player.loadRevision3Item
+ 'url': 'http://www.seeker.com/should-trump-be-required-to-release-his-tax-returns-1833805621.html',
+ 'md5': '30c1dc4030cc715cf05b423d0947ac18',
+ 'info_dict': {
+ 'id': '76243',
+ 'ext': 'webm',
+ 'title': 'Should Trump Be Required To Release His Tax Returns?',
+ 'description': 'Donald Trump has been secretive about his "big," "beautiful" tax returns. So what can we learn if he decides to release them?',
+ 'uploader': 'Seeker Daily',
+ 'uploader_id': 'seekerdaily',
+ }
+ }, {
+ 'url': 'http://www.seeker.com/changes-expected-at-zoos-following-recent-gorilla-lion-shootings-1834116536.html',
+ 'playlist': [
+ {
+ 'md5': '83bcd157cab89ad7318dd7b8c9cf1306',
+ 'info_dict': {
+ 'id': '67558',
+ 'ext': 'mp4',
+ 'title': 'The Pros & Cons Of Zoos',
+ 'description': 'Zoos are often depicted as a terrible place for animals to live, but is there any truth to this?',
+ 'uploader': 'DNews',
+ 'uploader_id': 'dnews',
+ },
+ }
+ ],
+ 'info_dict': {
+ 'id': '1834116536',
+ 'title': 'After Gorilla Killing, Changes Ahead for Zoos',
+ 'description': 'The largest association of zoos and others are hoping to learn from recent incidents that led to the shooting deaths of a gorilla and two lions.',
+ },
+ }]
+
+ def _real_extract(self, url):
+ display_id, article_id = re.match(self._VALID_URL, url).groups()
+ webpage = self._download_webpage(url, display_id)
+ mobj = re.search(r"player\.loadRevision3Item\('([^']+)'\s*,\s*(\d+)\);", webpage)
+ if mobj:
+ playlist_type, playlist_id = mobj.groups()
+ return self.url_result(
+ 'revision3:%s:%s' % (playlist_type, playlist_id), 'Revision3Embed', playlist_id)
+ else:
+ entries = [self.url_result('revision3:video_id:%s' % video_id, 'Revision3Embed', video_id) for video_id in re.findall(
+ r'<iframe[^>]+src=[\'"](?:https?:)?//api\.seekernetwork\.com/player/embed\?videoId=(\d+)', webpage)]
+ return self.playlist_result(
+ entries, article_id, self._og_search_title(webpage), self._og_search_description(webpage))
diff --git a/youtube_dl/extractor/senateisvp.py b/youtube_dl/extractor/senateisvp.py
index 4d3b585..c5f474d 100644
--- a/youtube_dl/extractor/senateisvp.py
+++ b/youtube_dl/extractor/senateisvp.py
@@ -48,7 +48,7 @@ class SenateISVPIE(InfoExtractor):
['arch', '', 'http://ussenate-f.akamaihd.net/']
]
_IE_NAME = 'senate.gov'
- _VALID_URL = r'http://www\.senate\.gov/isvp/?\?(?P<qs>.+)'
+ _VALID_URL = r'https?://www\.senate\.gov/isvp/?\?(?P<qs>.+)'
_TESTS = [{
'url': 'http://www.senate.gov/isvp/?comm=judiciary&type=live&stt=&filename=judiciary031715&auto_play=false&wmode=transparent&poster=http%3A%2F%2Fwww.judiciary.senate.gov%2Fthemes%2Fjudiciary%2Fimages%2Fvideo-poster-flash-fit.png',
'info_dict': {
diff --git a/youtube_dl/extractor/sendtonews.py b/youtube_dl/extractor/sendtonews.py
new file mode 100644
index 0000000..1c636f6
--- /dev/null
+++ b/youtube_dl/extractor/sendtonews.py
@@ -0,0 +1,86 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .jwplatform import JWPlatformBaseIE
+from ..compat import compat_parse_qs
+from ..utils import (
+ ExtractorError,
+ parse_duration,
+)
+
+
+class SendtoNewsIE(JWPlatformBaseIE):
+ _VALID_URL = r'https?://embed\.sendtonews\.com/player/embed\.php\?(?P<query>[^#]+)'
+
+ _TEST = {
+ # From http://cleveland.cbslocal.com/2016/05/16/indians-score-season-high-15-runs-in-blowout-win-over-reds-rapid-reaction/
+ 'url': 'http://embed.sendtonews.com/player/embed.php?SK=GxfCe0Zo7D&MK=175909&PK=5588&autoplay=on&sound=yes',
+ 'info_dict': {
+ 'id': 'GxfCe0Zo7D-175909-5588',
+ 'ext': 'mp4',
+ 'title': 'Recap: CLE 15, CIN 6',
+ 'description': '5/16/16: Indians\' bats explode for 15 runs in a win',
+ 'duration': 49,
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }
+
+ _URL_TEMPLATE = '//embed.sendtonews.com/player/embed.php?SK=%s&MK=%s&PK=%s'
+
+ @classmethod
+ def _extract_url(cls, webpage):
+ mobj = re.search(r'''(?x)<script[^>]+src=([\'"])
+ (?:https?:)?//embed\.sendtonews\.com/player/responsiveembed\.php\?
+ .*\bSC=(?P<SC>[0-9a-zA-Z-]+).*
+ \1>''', webpage)
+ if mobj:
+ sk, mk, pk = mobj.group('SC').split('-')
+ return cls._URL_TEMPLATE % (sk, mk, pk)
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ params = compat_parse_qs(mobj.group('query'))
+
+ if 'SK' not in params or 'MK' not in params or 'PK' not in params:
+ raise ExtractorError('Invalid URL', expected=True)
+
+ video_id = '-'.join([params['SK'][0], params['MK'][0], params['PK'][0]])
+
+ webpage = self._download_webpage(url, video_id)
+
+ jwplayer_data_str = self._search_regex(
+ r'jwplayer\("[^"]+"\)\.setup\((.+?)\);', webpage, 'JWPlayer data')
+ js_vars = {
+ 'w': 1024,
+ 'h': 768,
+ 'modeVar': 'html5',
+ }
+ for name, val in js_vars.items():
+ js_val = '%d' % val if isinstance(val, int) else '"%s"' % val
+ jwplayer_data_str = jwplayer_data_str.replace(':%s,' % name, ':%s,' % js_val)
+
+ info_dict = self._parse_jwplayer_data(
+ self._parse_json(jwplayer_data_str, video_id),
+ video_id, require_title=False, rtmp_params={'no_resume': True})
+
+ title = self._html_search_regex(
+ r'<div[^>]+class="embedTitle">([^<]+)</div>', webpage, 'title')
+ description = self._html_search_regex(
+ r'<div[^>]+class="embedSubTitle">([^<]+)</div>', webpage,
+ 'description', fatal=False)
+ duration = parse_duration(self._html_search_regex(
+ r'<div[^>]+class="embedDetails">([0-9:]+)', webpage,
+ 'duration', fatal=False))
+
+ info_dict.update({
+ 'title': title,
+ 'description': description,
+ 'duration': duration,
+ })
+
+ return info_dict
diff --git a/youtube_dl/extractor/sexu.py b/youtube_dl/extractor/sexu.py
index 6365a87..a99b2a8 100644
--- a/youtube_dl/extractor/sexu.py
+++ b/youtube_dl/extractor/sexu.py
@@ -1,7 +1,5 @@
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
@@ -14,7 +12,7 @@ class SexuIE(InfoExtractor):
'id': '961791',
'ext': 'mp4',
'title': 'md5:4d05a19a5fc049a63dbbaf05fb71d91b',
- 'description': 'md5:c5ed8625eb386855d5a7967bd7b77a54',
+ 'description': 'md5:2b75327061310a3afb3fbd7d09e2e403',
'categories': list, # NSFW
'thumbnail': 're:https?://.*\.jpg$',
'age_limit': 18,
@@ -25,13 +23,18 @@ class SexuIE(InfoExtractor):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- quality_arr = self._search_regex(
- r'sources:\s*\[([^\]]+)\]', webpage, 'forrmat string')
+ jwvideo = self._parse_json(
+ self._search_regex(r'\.setup\(\s*({.+?})\s*\);', webpage, 'jwvideo'),
+ video_id)
+
+ sources = jwvideo['sources']
+
formats = [{
- 'url': fmt[0].replace('\\', ''),
- 'format_id': fmt[1],
- 'height': int(fmt[1][:3]),
- } for fmt in re.findall(r'"file":"([^"]+)","label":"([^"]+)"', quality_arr)]
+ 'url': source['file'].replace('\\', ''),
+ 'format_id': source.get('label'),
+ 'height': self._search_regex(
+ r'^(\d+)[pP]', source.get('label', ''), 'height', default=None),
+ } for source in sources if source.get('file')]
self._sort_formats(formats)
title = self._html_search_regex(
@@ -40,9 +43,7 @@ class SexuIE(InfoExtractor):
description = self._html_search_meta(
'description', webpage, 'description')
- thumbnail = self._html_search_regex(
- r'image:\s*"([^"]+)"',
- webpage, 'thumbnail', fatal=False)
+ thumbnail = jwvideo.get('image')
categories_str = self._html_search_meta(
'keywords', webpage, 'categories')
diff --git a/youtube_dl/extractor/shahid.py b/youtube_dl/extractor/shahid.py
index 1178b7a..d95ea06 100644
--- a/youtube_dl/extractor/shahid.py
+++ b/youtube_dl/extractor/shahid.py
@@ -2,7 +2,7 @@
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..compat import compat_urllib_parse
+from ..compat import compat_urllib_parse_urlencode
from ..utils import (
ExtractorError,
int_or_none,
@@ -77,11 +77,12 @@ class ShahidIE(InfoExtractor):
raise ExtractorError('This video is DRM protected.', expected=True)
formats = self._extract_m3u8_formats(player['url'], video_id, 'mp4')
+ self._sort_formats(formats)
video = self._download_json(
'%s/%s/%s?%s' % (
api_vars['url'], api_vars['playerType'], api_vars['id'],
- compat_urllib_parse.urlencode({
+ compat_urllib_parse_urlencode({
'apiKey': 'sh@hid0nlin3',
'hash': 'b2wMCTHpSmyxGqQjJFOycRmLSex+BpTK/ooxy6vHaqs=',
})),
diff --git a/youtube_dl/extractor/shared.py b/youtube_dl/extractor/shared.py
index 8eda3c8..e7e5f65 100644
--- a/youtube_dl/extractor/shared.py
+++ b/youtube_dl/extractor/shared.py
@@ -3,17 +3,17 @@ from __future__ import unicode_literals
import base64
from .common import InfoExtractor
-from ..compat import compat_urllib_parse
from ..utils import (
ExtractorError,
int_or_none,
sanitized_Request,
+ urlencode_postdata,
)
class SharedIE(InfoExtractor):
IE_DESC = 'shared.sx and vivo.sx'
- _VALID_URL = r'http://(?:shared|vivo)\.sx/(?P<id>[\da-z]{10})'
+ _VALID_URL = r'https?://(?:shared|vivo)\.sx/(?P<id>[\da-z]{10})'
_TESTS = [{
'url': 'http://shared.sx/0060718775',
@@ -45,7 +45,7 @@ class SharedIE(InfoExtractor):
download_form = self._hidden_inputs(webpage)
request = sanitized_Request(
- url, compat_urllib_parse.urlencode(download_form))
+ url, urlencode_postdata(download_form))
request.add_header('Content-Type', 'application/x-www-form-urlencoded')
video_page = self._download_webpage(
diff --git a/youtube_dl/extractor/sharesix.py b/youtube_dl/extractor/sharesix.py
index f1ea9bd..9cce5ce 100644
--- a/youtube_dl/extractor/sharesix.py
+++ b/youtube_dl/extractor/sharesix.py
@@ -4,10 +4,10 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..compat import compat_urllib_parse
from ..utils import (
parse_duration,
sanitized_Request,
+ urlencode_postdata,
)
@@ -47,7 +47,7 @@ class ShareSixIE(InfoExtractor):
fields = {
'method_free': 'Free'
}
- post = compat_urllib_parse.urlencode(fields)
+ post = urlencode_postdata(fields)
req = sanitized_Request(url, post)
req.add_header('Content-type', 'application/x-www-form-urlencoded')
diff --git a/youtube_dl/extractor/sina.py b/youtube_dl/extractor/sina.py
index b2258a0..8fc6673 100644
--- a/youtube_dl/extractor/sina.py
+++ b/youtube_dl/extractor/sina.py
@@ -4,28 +4,35 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..compat import compat_urllib_parse
-from ..utils import sanitized_Request
+from ..utils import (
+ HEADRequest,
+ ExtractorError,
+ int_or_none,
+ update_url_query,
+ qualities,
+ get_element_by_attribute,
+ clean_html,
+)
class SinaIE(InfoExtractor):
- _VALID_URL = r'''(?x)https?://(.*?\.)?video\.sina\.com\.cn/
- (
- (.+?/(((?P<pseudo_id>\d+).html)|(.*?(\#|(vid=)|b/)(?P<id>\d+?)($|&|\-))))
- |
+ _VALID_URL = r'''(?x)https?://(?:.*?\.)?video\.sina\.com\.cn/
+ (?:
+ (?:view/|.*\#)(?P<video_id>\d+)|
+ .+?/(?P<pseudo_id>[^/?#]+)(?:\.s?html)|
# This is used by external sites like Weibo
- (api/sinawebApi/outplay.php/(?P<token>.+?)\.swf)
+ api/sinawebApi/outplay.php/(?P<token>.+?)\.swf
)
'''
_TESTS = [
{
- 'url': 'http://video.sina.com.cn/news/vlist/zt/chczlj2013/?opsubject_id=top12#110028898',
- 'md5': 'd65dd22ddcf44e38ce2bf58a10c3e71f',
+ 'url': 'http://video.sina.com.cn/news/spj/topvideoes20160504/?opsubject_id=top1#250576622',
+ 'md5': 'd38433e2fc886007729735650ae4b3e9',
'info_dict': {
- 'id': '110028898',
- 'ext': 'flv',
- 'title': '《中国新闻》 朝鲜要求巴拿马立即释放被扣船员',
+ 'id': '250576622',
+ 'ext': 'mp4',
+ 'title': '现场:克鲁兹宣布退选 特朗普将稳获提名',
}
},
{
@@ -35,37 +42,74 @@ class SinaIE(InfoExtractor):
'ext': 'flv',
'title': '军方提高对朝情报监视级别',
},
+ 'skip': 'the page does not exist or has been deleted',
+ },
+ {
+ 'url': 'http://video.sina.com.cn/view/250587748.html',
+ 'md5': '3d1807a25c775092aab3bc157fff49b4',
+ 'info_dict': {
+ 'id': '250587748',
+ 'ext': 'mp4',
+ 'title': '瞬间泪目:8年前汶川地震珍贵视频首曝光',
+ },
},
]
- def _extract_video(self, video_id):
- data = compat_urllib_parse.urlencode({'vid': video_id})
- url_doc = self._download_xml('http://v.iask.com/v_play.php?%s' % data,
- video_id, 'Downloading video url')
- image_page = self._download_webpage(
- 'http://interface.video.sina.com.cn/interface/common/getVideoImage.php?%s' % data,
- video_id, 'Downloading thumbnail info')
-
- return {'id': video_id,
- 'url': url_doc.find('./durl/url').text,
- 'ext': 'flv',
- 'title': url_doc.find('./vname').text,
- 'thumbnail': image_page.split('=')[1],
- }
-
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
- if mobj.group('token') is not None:
- # The video id is in the redirected url
- self.to_screen('Getting video id')
- request = sanitized_Request(url)
- request.get_method = lambda: 'HEAD'
- (_, urlh) = self._download_webpage_handle(request, 'NA', False)
- return self._real_extract(urlh.geturl())
- elif video_id is None:
- pseudo_id = mobj.group('pseudo_id')
- webpage = self._download_webpage(url, pseudo_id)
- video_id = self._search_regex(r'vid:\'(\d+?)\'', webpage, 'video id')
- return self._extract_video(video_id)
+ video_id = mobj.group('video_id')
+ if not video_id:
+ if mobj.group('token') is not None:
+ # The video id is in the redirected url
+ self.to_screen('Getting video id')
+ request = HEADRequest(url)
+ (_, urlh) = self._download_webpage_handle(request, 'NA', False)
+ return self._real_extract(urlh.geturl())
+ else:
+ pseudo_id = mobj.group('pseudo_id')
+ webpage = self._download_webpage(url, pseudo_id)
+ error = get_element_by_attribute('class', 'errtitle', webpage)
+ if error:
+ raise ExtractorError('%s said: %s' % (
+ self.IE_NAME, clean_html(error)), expected=True)
+ video_id = self._search_regex(
+ r"video_id\s*:\s*'(\d+)'", webpage, 'video id')
+
+ video_data = self._download_json(
+ 'http://s.video.sina.com.cn/video/h5play',
+ video_id, query={'video_id': video_id})
+ if video_data['code'] != 1:
+ raise ExtractorError('%s said: %s' % (
+ self.IE_NAME, video_data['message']), expected=True)
+ else:
+ video_data = video_data['data']
+ title = video_data['title']
+ description = video_data.get('description')
+ if description:
+ description = description.strip()
+
+ preference = qualities(['cif', 'sd', 'hd', 'fhd', 'ffd'])
+ formats = []
+ for quality_id, quality in video_data.get('videos', {}).get('mp4', {}).items():
+ file_api = quality.get('file_api')
+ file_id = quality.get('file_id')
+ if not file_api or not file_id:
+ continue
+ formats.append({
+ 'format_id': quality_id,
+ 'url': update_url_query(file_api, {'vid': file_id}),
+ 'preference': preference(quality_id),
+ 'ext': 'mp4',
+ })
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': video_data.get('image'),
+ 'duration': int_or_none(video_data.get('length')),
+ 'timestamp': int_or_none(video_data.get('create_time')),
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/smotri.py b/youtube_dl/extractor/smotri.py
index 015ef75..5c3fd0f 100644
--- a/youtube_dl/extractor/smotri.py
+++ b/youtube_dl/extractor/smotri.py
@@ -7,12 +7,12 @@ import hashlib
import uuid
from .common import InfoExtractor
-from ..compat import compat_urllib_parse
from ..utils import (
ExtractorError,
int_or_none,
sanitized_Request,
unified_strdate,
+ urlencode_postdata,
)
@@ -175,7 +175,7 @@ class SmotriIE(InfoExtractor):
video_form['pass'] = hashlib.md5(video_password.encode('utf-8')).hexdigest()
request = sanitized_Request(
- 'http://smotri.com/video/view/url/bot/', compat_urllib_parse.urlencode(video_form))
+ 'http://smotri.com/video/view/url/bot/', urlencode_postdata(video_form))
request.add_header('Content-Type', 'application/x-www-form-urlencoded')
video = self._download_json(request, video_id, 'Downloading video JSON')
@@ -338,7 +338,7 @@ class SmotriBroadcastIE(InfoExtractor):
}
request = sanitized_Request(
- broadcast_url + '/?no_redirect=1', compat_urllib_parse.urlencode(login_form))
+ broadcast_url + '/?no_redirect=1', urlencode_postdata(login_form))
request.add_header('Content-Type', 'application/x-www-form-urlencoded')
broadcast_page = self._download_webpage(
request, broadcast_id, 'Logging in and confirming age')
diff --git a/youtube_dl/extractor/sohu.py b/youtube_dl/extractor/sohu.py
index ea8fc25..49e5d09 100644
--- a/youtube_dl/extractor/sohu.py
+++ b/youtube_dl/extractor/sohu.py
@@ -6,7 +6,7 @@ import re
from .common import InfoExtractor
from ..compat import (
compat_str,
- compat_urllib_parse,
+ compat_urllib_parse_urlencode,
)
from ..utils import (
ExtractorError,
@@ -170,7 +170,7 @@ class SohuIE(InfoExtractor):
if retries > 0:
download_note += ' (retry #%d)' % retries
part_info = self._parse_json(self._download_webpage(
- 'http://%s/?%s' % (allot, compat_urllib_parse.urlencode(params)),
+ 'http://%s/?%s' % (allot, compat_urllib_parse_urlencode(params)),
video_id, download_note), video_id)
video_url = part_info['url']
diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py
index 1efb2b9..194dabc 100644
--- a/youtube_dl/extractor/soundcloud.py
+++ b/youtube_dl/extractor/soundcloud.py
@@ -11,10 +11,9 @@ from .common import (
from ..compat import (
compat_str,
compat_urlparse,
- compat_urllib_parse,
+ compat_urllib_parse_urlencode,
)
from ..utils import (
- encode_dict,
ExtractorError,
int_or_none,
unified_strdate,
@@ -393,7 +392,7 @@ class SoundcloudUserIE(SoundcloudIE):
query = COMMON_QUERY.copy()
query['offset'] = 0
- next_href = base_url + '?' + compat_urllib_parse.urlencode(query)
+ next_href = base_url + '?' + compat_urllib_parse_urlencode(query)
entries = []
for i in itertools.count():
@@ -424,7 +423,7 @@ class SoundcloudUserIE(SoundcloudIE):
qs = compat_urlparse.parse_qs(parsed_next_href.query)
qs.update(COMMON_QUERY)
next_href = compat_urlparse.urlunparse(
- parsed_next_href._replace(query=compat_urllib_parse.urlencode(qs, True)))
+ parsed_next_href._replace(query=compat_urllib_parse_urlencode(qs, True)))
return {
'_type': 'playlist',
@@ -460,7 +459,7 @@ class SoundcloudPlaylistIE(SoundcloudIE):
if token:
data_dict['secret_token'] = token
- data = compat_urllib_parse.urlencode(data_dict)
+ data = compat_urllib_parse_urlencode(data_dict)
data = self._download_json(
base_url + data, playlist_id, 'Downloading playlist')
@@ -500,7 +499,7 @@ class SoundcloudSearchIE(SearchInfoExtractor, SoundcloudIE):
query['client_id'] = self._CLIENT_ID
query['linked_partitioning'] = '1'
query['offset'] = 0
- data = compat_urllib_parse.urlencode(encode_dict(query))
+ data = compat_urllib_parse_urlencode(query)
next_url = '{0}{1}?{2}'.format(self._API_V2_BASE, endpoint, data)
collected_results = 0
diff --git a/youtube_dl/extractor/space.py b/youtube_dl/extractor/space.py
deleted file mode 100644
index ebb5d6e..0000000
--- a/youtube_dl/extractor/space.py
+++ /dev/null
@@ -1,38 +0,0 @@
-from __future__ import unicode_literals
-
-import re
-
-from .common import InfoExtractor
-from .brightcove import BrightcoveLegacyIE
-from ..utils import RegexNotFoundError, ExtractorError
-
-
-class SpaceIE(InfoExtractor):
- _VALID_URL = r'https?://(?:(?:www|m)\.)?space\.com/\d+-(?P<title>[^/\.\?]*?)-video\.html'
- _TEST = {
- 'add_ie': ['BrightcoveLegacy'],
- 'url': 'http://www.space.com/23373-huge-martian-landforms-detail-revealed-by-european-probe-video.html',
- 'info_dict': {
- 'id': '2780937028001',
- 'ext': 'mp4',
- 'title': 'Huge Martian Landforms\' Detail Revealed By European Probe | Video',
- 'description': 'md5:db81cf7f3122f95ed234b631a6ea1e61',
- 'uploader': 'TechMedia Networks',
- },
- }
-
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- title = mobj.group('title')
- webpage = self._download_webpage(url, title)
- try:
- # Some videos require the playerKey field, which isn't define in
- # the BrightcoveExperience object
- brightcove_url = self._og_search_video_url(webpage)
- except RegexNotFoundError:
- # Other videos works fine with the info from the object
- brightcove_url = BrightcoveLegacyIE._extract_brightcove_url(webpage)
- if brightcove_url is None:
- raise ExtractorError(
- 'The webpage does not contain a video', expected=True)
- return self.url_result(brightcove_url, BrightcoveLegacyIE.ie_key())
diff --git a/youtube_dl/extractor/spankwire.py b/youtube_dl/extractor/spankwire.py
index 692fd78..92a7120 100644
--- a/youtube_dl/extractor/spankwire.py
+++ b/youtube_dl/extractor/spankwire.py
@@ -96,20 +96,18 @@ class SpankwireIE(InfoExtractor):
formats = []
for height, video_url in zip(heights, video_urls):
path = compat_urllib_parse_urlparse(video_url).path
- _, quality = path.split('/')[4].split('_')[:2]
- f = {
+ m = re.search(r'/(?P<height>\d+)[pP]_(?P<tbr>\d+)[kK]', path)
+ if m:
+ tbr = int(m.group('tbr'))
+ height = int(m.group('height'))
+ else:
+ tbr = None
+ formats.append({
'url': video_url,
+ 'format_id': '%dp' % height,
'height': height,
- }
- tbr = self._search_regex(r'^(\d+)[Kk]$', quality, 'tbr', default=None)
- if tbr:
- f.update({
- 'tbr': int(tbr),
- 'format_id': '%dp' % height,
- })
- else:
- f['format_id'] = quality
- formats.append(f)
+ 'tbr': tbr,
+ })
self._sort_formats(formats)
age_limit = self._rta_search(webpage)
diff --git a/youtube_dl/extractor/sport5.py b/youtube_dl/extractor/sport5.py
index dfe50ed..7e67833 100644
--- a/youtube_dl/extractor/sport5.py
+++ b/youtube_dl/extractor/sport5.py
@@ -8,7 +8,7 @@ from ..utils import ExtractorError
class Sport5IE(InfoExtractor):
- _VALID_URL = r'http://(?:www|vod)?\.sport5\.co\.il/.*\b(?:Vi|docID)=(?P<id>\d+)'
+ _VALID_URL = r'https?://(?:www|vod)?\.sport5\.co\.il/.*\b(?:Vi|docID)=(?P<id>\d+)'
_TESTS = [
{
'url': 'http://vod.sport5.co.il/?Vc=147&Vi=176331&Page=1',
diff --git a/youtube_dl/extractor/sportbox.py b/youtube_dl/extractor/sportbox.py
index 86d509a..e5c28ae 100644
--- a/youtube_dl/extractor/sportbox.py
+++ b/youtube_dl/extractor/sportbox.py
@@ -6,6 +6,7 @@ import re
from .common import InfoExtractor
from ..compat import compat_urlparse
from ..utils import (
+ js_to_json,
unified_strdate,
)
@@ -94,18 +95,32 @@ class SportBoxEmbedIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
- hls = self._search_regex(
- r"sportboxPlayer\.jwplayer_common_params\.file\s*=\s*['\"]([^'\"]+)['\"]",
- webpage, 'hls file')
+ formats = []
- formats = self._extract_m3u8_formats(hls, video_id, 'mp4')
+ def cleanup_js(code):
+ # desktop_advert_config contains complex Javascripts and we don't need it
+ return js_to_json(re.sub(r'desktop_advert_config.*', '', code))
- title = self._search_regex(
- r'sportboxPlayer\.node_title\s*=\s*"([^"]+)"', webpage, 'title')
+ jwplayer_data = self._parse_json(self._search_regex(
+ r'(?s)player\.setup\(({.+?})\);', webpage, 'jwplayer settings'), video_id,
+ transform_source=cleanup_js)
- thumbnail = self._search_regex(
- r'sportboxPlayer\.jwplayer_common_params\.image\s*=\s*"([^"]+)"',
- webpage, 'thumbnail', default=None)
+ hls_url = jwplayer_data.get('hls_url')
+ if hls_url:
+ formats.extend(self._extract_m3u8_formats(
+ hls_url, video_id, ext='mp4', m3u8_id='hls'))
+
+ rtsp_url = jwplayer_data.get('rtsp_url')
+ if rtsp_url:
+ formats.append({
+ 'url': rtsp_url,
+ 'format_id': 'rtsp',
+ })
+
+ self._sort_formats(formats)
+
+ title = jwplayer_data['node_title']
+ thumbnail = jwplayer_data.get('image_url')
return {
'id': video_id,
diff --git a/youtube_dl/extractor/sportschau.py b/youtube_dl/extractor/sportschau.py
new file mode 100644
index 0000000..0d7925a
--- /dev/null
+++ b/youtube_dl/extractor/sportschau.py
@@ -0,0 +1,38 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .wdr import WDRBaseIE
+from ..utils import get_element_by_attribute
+
+
+class SportschauIE(WDRBaseIE):
+ IE_NAME = 'Sportschau'
+ _VALID_URL = r'https?://(?:www\.)?sportschau\.de/(?:[^/]+/)+video-?(?P<id>[^/#?]+)\.html'
+ _TEST = {
+ 'url': 'http://www.sportschau.de/uefaeuro2016/videos/video-dfb-team-geht-gut-gelaunt-ins-spiel-gegen-polen-100.html',
+ 'info_dict': {
+ 'id': 'mdb-1140188',
+ 'display_id': 'dfb-team-geht-gut-gelaunt-ins-spiel-gegen-polen-100',
+ 'ext': 'mp4',
+ 'title': 'DFB-Team geht gut gelaunt ins Spiel gegen Polen',
+ 'description': 'Vor dem zweiten Gruppenspiel gegen Polen herrscht gute Stimmung im deutschen Team. Insbesondere Bastian Schweinsteiger strotzt vor Optimismus nach seinem Tor gegen die Ukraine.',
+ 'upload_date': '20160615',
+ },
+ 'skip': 'Geo-restricted to Germany',
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+ title = get_element_by_attribute('class', 'headline', webpage)
+ description = self._html_search_meta('description', webpage, 'description')
+
+ info = self._extract_wdr_video(webpage, video_id)
+
+ info.update({
+ 'title': title,
+ 'description': description,
+ })
+
+ return info
diff --git a/youtube_dl/extractor/ssa.py b/youtube_dl/extractor/ssa.py
index 13101c7..54d1843 100644
--- a/youtube_dl/extractor/ssa.py
+++ b/youtube_dl/extractor/ssa.py
@@ -8,7 +8,7 @@ from ..utils import (
class SSAIE(InfoExtractor):
- _VALID_URL = r'http://ssa\.nls\.uk/film/(?P<id>\d+)'
+ _VALID_URL = r'https?://ssa\.nls\.uk/film/(?P<id>\d+)'
_TEST = {
'url': 'http://ssa.nls.uk/film/3561',
'info_dict': {
diff --git a/youtube_dl/extractor/streamcloud.py b/youtube_dl/extractor/streamcloud.py
index 77841b9..6a6bb90 100644
--- a/youtube_dl/extractor/streamcloud.py
+++ b/youtube_dl/extractor/streamcloud.py
@@ -4,15 +4,17 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..compat import compat_urllib_parse
-from ..utils import sanitized_Request
+from ..utils import (
+ ExtractorError,
+ urlencode_postdata,
+)
class StreamcloudIE(InfoExtractor):
IE_NAME = 'streamcloud.eu'
_VALID_URL = r'https?://streamcloud\.eu/(?P<id>[a-zA-Z0-9_-]+)(?:/(?P<fname>[^#?]*)\.html)?'
- _TEST = {
+ _TESTS = [{
'url': 'http://streamcloud.eu/skp9j99s4bpz/youtube-dl_test_video_____________-BaW_jenozKc.mp4.html',
'md5': '6bea4c7fa5daaacc2a946b7146286686',
'info_dict': {
@@ -21,7 +23,10 @@ class StreamcloudIE(InfoExtractor):
'title': 'youtube-dl test video \'/\\ ä ↭',
},
'skip': 'Only available from the EU'
- }
+ }, {
+ 'url': 'http://streamcloud.eu/ua8cmfh1nbe6/NSHIP-148--KUC-NG--H264-.mp4.html',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
video_id = self._match_id(url)
@@ -29,26 +34,36 @@ class StreamcloudIE(InfoExtractor):
orig_webpage = self._download_webpage(url, video_id)
+ if '>File Not Found<' in orig_webpage:
+ raise ExtractorError(
+ 'Video %s does not exist' % video_id, expected=True)
+
fields = re.findall(r'''(?x)<input\s+
type="(?:hidden|submit)"\s+
name="([^"]+)"\s+
(?:id="[^"]+"\s+)?
value="([^"]*)"
''', orig_webpage)
- post = compat_urllib_parse.urlencode(fields)
self._sleep(12, video_id)
- headers = {
- b'Content-Type': b'application/x-www-form-urlencoded',
- }
- req = sanitized_Request(url, post, headers)
webpage = self._download_webpage(
- req, video_id, note='Downloading video page ...')
- title = self._html_search_regex(
- r'<h1[^>]*>([^<]+)<', webpage, 'title')
- video_url = self._search_regex(
- r'file:\s*"([^"]+)"', webpage, 'video URL')
+ url, video_id, data=urlencode_postdata(fields), headers={
+ b'Content-Type': b'application/x-www-form-urlencoded',
+ })
+
+ try:
+ title = self._html_search_regex(
+ r'<h1[^>]*>([^<]+)<', webpage, 'title')
+ video_url = self._search_regex(
+ r'file:\s*"([^"]+)"', webpage, 'video URL')
+ except ExtractorError:
+ message = self._html_search_regex(
+ r'(?s)<div[^>]+class=(["\']).*?msgboxinfo.*?\1[^>]*>(?P<message>.+?)</div>',
+ webpage, 'message', default=None, group='message')
+ if message:
+ raise ExtractorError('%s said: %s' % (self.IE_NAME, message), expected=True)
+ raise
thumbnail = self._search_regex(
r'image:\s*"([^"]+)"', webpage, 'thumbnail URL', fatal=False)
diff --git a/youtube_dl/extractor/streetvoice.py b/youtube_dl/extractor/streetvoice.py
index 6a57fa6..e529051 100644
--- a/youtube_dl/extractor/streetvoice.py
+++ b/youtube_dl/extractor/streetvoice.py
@@ -14,7 +14,6 @@ class StreetVoiceIE(InfoExtractor):
'info_dict': {
'id': '94440',
'ext': 'mp3',
- 'filesize': 4167053,
'title': '輸',
'description': 'Crispy脆樂團 - 輸',
'thumbnail': 're:^https?://.*\.jpg$',
@@ -32,20 +31,19 @@ class StreetVoiceIE(InfoExtractor):
song_id = self._match_id(url)
song = self._download_json(
- 'http://streetvoice.com/music/api/song/%s' % song_id, song_id)
+ 'https://streetvoice.com/api/v1/public/song/%s/' % song_id, song_id, data=b'')
title = song['name']
- author = song['musician']['name']
+ author = song['user']['nickname']
return {
'id': song_id,
'url': song['file'],
- 'filesize': song.get('size'),
'title': title,
'description': '%s - %s' % (author, title),
'thumbnail': self._proto_relative_url(song.get('image'), 'http:'),
'duration': song.get('length'),
'upload_date': unified_strdate(song.get('created_at')),
'uploader': author,
- 'uploader_id': compat_str(song['musician']['id']),
+ 'uploader_id': compat_str(song['user']['id']),
}
diff --git a/youtube_dl/extractor/svt.py b/youtube_dl/extractor/svt.py
index 399c3b8..67f56fa 100644
--- a/youtube_dl/extractor/svt.py
+++ b/youtube_dl/extractor/svt.py
@@ -6,56 +6,80 @@ import re
from .common import InfoExtractor
from ..utils import (
determine_ext,
+ dict_get,
+ int_or_none,
+ try_get,
)
class SVTBaseIE(InfoExtractor):
- def _extract_video(self, url, video_id):
- info = self._download_json(url, video_id)
-
- title = info['context']['title']
- thumbnail = info['context'].get('thumbnailImage')
-
- video_info = info['video']
+ def _extract_video(self, video_info, video_id):
formats = []
for vr in video_info['videoReferences']:
+ player_type = vr.get('playerType')
vurl = vr['url']
ext = determine_ext(vurl)
if ext == 'm3u8':
formats.extend(self._extract_m3u8_formats(
vurl, video_id,
ext='mp4', entry_protocol='m3u8_native',
- m3u8_id=vr.get('playerType')))
+ m3u8_id=player_type, fatal=False))
elif ext == 'f4m':
formats.extend(self._extract_f4m_formats(
vurl + '?hdcore=3.3.0', video_id,
- f4m_id=vr.get('playerType')))
+ f4m_id=player_type, fatal=False))
+ elif ext == 'mpd':
+ if player_type == 'dashhbbtv':
+ formats.extend(self._extract_mpd_formats(
+ vurl, video_id, mpd_id=player_type, fatal=False))
else:
formats.append({
- 'format_id': vr.get('playerType'),
+ 'format_id': player_type,
'url': vurl,
})
+ if not formats and video_info.get('rights', {}).get('geoBlockedSweden'):
+ self.raise_geo_restricted('This video is only available in Sweden')
self._sort_formats(formats)
subtitles = {}
- subtitle_references = video_info.get('subtitleReferences')
+ subtitle_references = dict_get(video_info, ('subtitles', 'subtitleReferences'))
if isinstance(subtitle_references, list):
for sr in subtitle_references:
subtitle_url = sr.get('url')
+ subtitle_lang = sr.get('language', 'sv')
if subtitle_url:
- subtitles.setdefault('sv', []).append({'url': subtitle_url})
+ if determine_ext(subtitle_url) == 'm3u8':
+ # TODO(yan12125): handle WebVTT in m3u8 manifests
+ continue
+
+ subtitles.setdefault(subtitle_lang, []).append({'url': subtitle_url})
- duration = video_info.get('materialLength')
- age_limit = 18 if video_info.get('inappropriateForChildren') else 0
+ title = video_info.get('title')
+
+ series = video_info.get('programTitle')
+ season_number = int_or_none(video_info.get('season'))
+ episode = video_info.get('episodeTitle')
+ episode_number = int_or_none(video_info.get('episodeNumber'))
+
+ duration = int_or_none(dict_get(video_info, ('materialLength', 'contentDuration')))
+ age_limit = None
+ adult = dict_get(
+ video_info, ('inappropriateForChildren', 'blockedForChildren'),
+ skip_false_values=False)
+ if adult is not None:
+ age_limit = 18 if adult else 0
return {
'id': video_id,
'title': title,
'formats': formats,
'subtitles': subtitles,
- 'thumbnail': thumbnail,
'duration': duration,
'age_limit': age_limit,
+ 'series': series,
+ 'season_number': season_number,
+ 'episode': episode,
+ 'episode_number': episode_number,
}
@@ -63,11 +87,11 @@ class SVTIE(SVTBaseIE):
_VALID_URL = r'https?://(?:www\.)?svt\.se/wd\?(?:.*?&)?widgetId=(?P<widget_id>\d+)&.*?\barticleId=(?P<id>\d+)'
_TEST = {
'url': 'http://www.svt.se/wd?widgetId=23991&sectionId=541&articleId=2900353&type=embed&contextSectionId=123&autostart=false',
- 'md5': '9648197555fc1b49e3dc22db4af51d46',
+ 'md5': '33e9a5d8f646523ce0868ecfb0eed77d',
'info_dict': {
'id': '2900353',
- 'ext': 'flv',
- 'title': 'Här trycker Jagr till Giroux (under SVT-intervjun)',
+ 'ext': 'mp4',
+ 'title': 'Stjärnorna skojar till det - under SVT-intervjun',
'duration': 27,
'age_limit': 0,
},
@@ -84,15 +108,20 @@ class SVTIE(SVTBaseIE):
mobj = re.match(self._VALID_URL, url)
widget_id = mobj.group('widget_id')
article_id = mobj.group('id')
- return self._extract_video(
+
+ info = self._download_json(
'http://www.svt.se/wd?widgetId=%s&articleId=%s&format=json&type=embed&output=json' % (widget_id, article_id),
article_id)
+ info_dict = self._extract_video(info['video'], article_id)
+ info_dict['title'] = info['context']['title']
+ return info_dict
+
class SVTPlayIE(SVTBaseIE):
IE_DESC = 'SVT Play and Öppet arkiv'
- _VALID_URL = r'https?://(?:www\.)?(?P<host>svtplay|oppetarkiv)\.se/video/(?P<id>[0-9]+)'
- _TEST = {
+ _VALID_URL = r'https?://(?:www\.)?(?:svtplay|oppetarkiv)\.se/video/(?P<id>[0-9]+)'
+ _TESTS = [{
'url': 'http://www.svtplay.se/video/5996901/flygplan-till-haile-selassie/flygplan-till-haile-selassie-2',
'md5': '2b6704fe4a28801e1a098bbf3c5ac611',
'info_dict': {
@@ -108,12 +137,47 @@ class SVTPlayIE(SVTBaseIE):
}]
},
},
- }
+ }, {
+ # geo restricted to Sweden
+ 'url': 'http://www.oppetarkiv.se/video/5219710/trollflojten',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
- host = mobj.group('host')
- return self._extract_video(
- 'http://www.%s.se/video/%s?output=json' % (host, video_id),
- video_id)
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ data = self._parse_json(
+ self._search_regex(
+ r'root\["__svtplay"\]\s*=\s*([^;]+);',
+ webpage, 'embedded data', default='{}'),
+ video_id, fatal=False)
+
+ thumbnail = self._og_search_thumbnail(webpage)
+
+ if data:
+ video_info = try_get(
+ data, lambda x: x['context']['dispatcher']['stores']['VideoTitlePageStore']['data']['video'],
+ dict)
+ if video_info:
+ info_dict = self._extract_video(video_info, video_id)
+ info_dict.update({
+ 'title': data['context']['dispatcher']['stores']['MetaStore']['title'],
+ 'thumbnail': thumbnail,
+ })
+ return info_dict
+
+ video_id = self._search_regex(
+ r'<video[^>]+data-video-id=["\']([\da-zA-Z-]+)',
+ webpage, 'video id', default=None)
+
+ if video_id:
+ data = self._download_json(
+ 'http://www.svt.se/videoplayer-api/video/%s' % video_id, video_id)
+ info_dict = self._extract_video(data, video_id)
+ if not info_dict.get('title'):
+ info_dict['title'] = re.sub(
+ r'\s*\|\s*.+?$', '',
+ info_dict.get('episode') or self._og_search_title(webpage))
+ return info_dict
diff --git a/youtube_dl/extractor/sztvhu.py b/youtube_dl/extractor/sztvhu.py
index aa5964a..f562aa6 100644
--- a/youtube_dl/extractor/sztvhu.py
+++ b/youtube_dl/extractor/sztvhu.py
@@ -5,7 +5,7 @@ from .common import InfoExtractor
class SztvHuIE(InfoExtractor):
- _VALID_URL = r'http://(?:(?:www\.)?sztv\.hu|www\.tvszombathely\.hu)/(?:[^/]+)/.+-(?P<id>[0-9]+)'
+ _VALID_URL = r'https?://(?:(?:www\.)?sztv\.hu|www\.tvszombathely\.hu)/(?:[^/]+)/.+-(?P<id>[0-9]+)'
_TEST = {
'url': 'http://sztv.hu/hirek/cserkeszek-nepszerusitettek-a-kornyezettudatos-eletmodot-a-savaria-teren-20130909',
'md5': 'a6df607b11fb07d0e9f2ad94613375cb',
diff --git a/youtube_dl/extractor/tagesschau.py b/youtube_dl/extractor/tagesschau.py
index 73e7657..136e18f 100644
--- a/youtube_dl/extractor/tagesschau.py
+++ b/youtube_dl/extractor/tagesschau.py
@@ -4,43 +4,179 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import parse_filesize
+from ..utils import (
+ determine_ext,
+ js_to_json,
+ parse_iso8601,
+ parse_filesize,
+)
+
+
+class TagesschauPlayerIE(InfoExtractor):
+ IE_NAME = 'tagesschau:player'
+ _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/multimedia/(?P<kind>audio|video)/(?P=kind)-(?P<id>\d+)~player(?:_[^/?#&]+)?\.html'
+
+ _TESTS = [{
+ 'url': 'http://www.tagesschau.de/multimedia/video/video-179517~player.html',
+ 'md5': '8d09548d5c15debad38bee3a4d15ca21',
+ 'info_dict': {
+ 'id': '179517',
+ 'ext': 'mp4',
+ 'title': 'Marie Kristin Boese, ARD Berlin, über den zukünftigen Kurs der AfD',
+ 'thumbnail': 're:^https?:.*\.jpg$',
+ 'formats': 'mincount:6',
+ },
+ }, {
+ 'url': 'https://www.tagesschau.de/multimedia/audio/audio-29417~player.html',
+ 'md5': '76e6eec6ebd40740671cf0a2c88617e5',
+ 'info_dict': {
+ 'id': '29417',
+ 'ext': 'mp3',
+ 'title': 'Trabi - Bye, bye Rennpappe',
+ 'thumbnail': 're:^https?:.*\.jpg$',
+ 'formats': 'mincount:2',
+ },
+ }, {
+ 'url': 'http://www.tagesschau.de/multimedia/audio/audio-29417~player_autoplay-true.html',
+ 'only_matching': True,
+ }]
+
+ _FORMATS = {
+ 'xs': {'quality': 0},
+ 's': {'width': 320, 'height': 180, 'quality': 1},
+ 'm': {'width': 512, 'height': 288, 'quality': 2},
+ 'l': {'width': 960, 'height': 540, 'quality': 3},
+ 'xl': {'width': 1280, 'height': 720, 'quality': 4},
+ 'xxl': {'quality': 5},
+ }
+
+ def _extract_via_api(self, kind, video_id):
+ info = self._download_json(
+ 'https://www.tagesschau.de/api/multimedia/{0}/{0}-{1}.json'.format(kind, video_id),
+ video_id)
+ title = info['headline']
+ formats = []
+ for media in info['mediadata']:
+ for format_id, format_url in media.items():
+ if determine_ext(format_url) == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ format_url, video_id, 'mp4',
+ entry_protocol='m3u8_native', m3u8_id='hls'))
+ else:
+ formats.append({
+ 'url': format_url,
+ 'format_id': format_id,
+ 'vcodec': 'none' if kind == 'audio' else None,
+ })
+ self._sort_formats(formats)
+ timestamp = parse_iso8601(info.get('date'))
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'timestamp': timestamp,
+ 'formats': formats,
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+
+ # kind = mobj.group('kind').lower()
+ # if kind == 'video':
+ # return self._extract_via_api(kind, video_id)
+
+ # JSON api does not provide some audio formats (e.g. ogg) thus
+ # extractiong audio via webpage
+
+ webpage = self._download_webpage(url, video_id)
+
+ title = self._og_search_title(webpage).strip()
+ formats = []
+
+ for media_json in re.findall(r'({src\s*:\s*["\']http[^}]+type\s*:[^}]+})', webpage):
+ media = self._parse_json(js_to_json(media_json), video_id, fatal=False)
+ if not media:
+ continue
+ src = media.get('src')
+ if not src:
+ return
+ quality = media.get('quality')
+ kind = media.get('type', '').split('/')[0]
+ ext = determine_ext(src)
+ f = {
+ 'url': src,
+ 'format_id': '%s_%s' % (quality, ext) if quality else ext,
+ 'ext': ext,
+ 'vcodec': 'none' if kind == 'audio' else None,
+ }
+ f.update(self._FORMATS.get(quality, {}))
+ formats.append(f)
+
+ self._sort_formats(formats)
+
+ thumbnail = self._og_search_thumbnail(webpage)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'thumbnail': thumbnail,
+ 'formats': formats,
+ }
class TagesschauIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/multimedia/(?:[^/]+/)*?[^/#?]+?(?P<id>-?[0-9]+)(?:~_[^/#?]+?)?\.html'
+ _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/(?P<path>[^/]+/(?:[^/]+/)*?(?P<id>[^/#?]+?(?:-?[0-9]+)?))(?:~_?[^/#?]+?)?\.html'
_TESTS = [{
'url': 'http://www.tagesschau.de/multimedia/video/video-102143.html',
- 'md5': '917a228bc7df7850783bc47979673a09',
+ 'md5': 'f7c27a0eff3bfe8c7727e65f8fe1b1e6',
'info_dict': {
- 'id': '102143',
+ 'id': 'video-102143',
'ext': 'mp4',
'title': 'Regierungsumbildung in Athen: Neue Minister in Griechenland vereidigt',
- 'description': 'md5:171feccd9d9b3dd54d05d501568f6359',
+ 'description': '18.07.2015 20:10 Uhr',
'thumbnail': 're:^https?:.*\.jpg$',
},
}, {
'url': 'http://www.tagesschau.de/multimedia/sendung/ts-5727.html',
'md5': '3c54c1f6243d279b706bde660ceec633',
'info_dict': {
- 'id': '5727',
+ 'id': 'ts-5727',
'ext': 'mp4',
- 'description': 'md5:695c01bfd98b7e313c501386327aea59',
'title': 'Sendung: tagesschau \t04.12.2014 20:00 Uhr',
+ 'description': 'md5:695c01bfd98b7e313c501386327aea59',
+ 'thumbnail': 're:^https?:.*\.jpg$',
+ },
+ }, {
+ # exclusive audio
+ 'url': 'http://www.tagesschau.de/multimedia/audio/audio-29417.html',
+ 'md5': '76e6eec6ebd40740671cf0a2c88617e5',
+ 'info_dict': {
+ 'id': 'audio-29417',
+ 'ext': 'mp3',
+ 'title': 'Trabi - Bye, bye Rennpappe',
+ 'description': 'md5:8687dda862cbbe2cfb2df09b56341317',
'thumbnail': 're:^https?:.*\.jpg$',
},
}, {
- 'url': 'http://www.tagesschau.de/multimedia/politikimradio/audio-18407.html',
- 'md5': 'aef45de271c4bf0a5db834aa40bf774c',
+ # audio in article
+ 'url': 'http://www.tagesschau.de/inland/bnd-303.html',
+ 'md5': 'e0916c623e85fc1d2b26b78f299d3958',
'info_dict': {
- 'id': '18407',
+ 'id': 'bnd-303',
'ext': 'mp3',
- 'title': 'Flüchtlingsdebatte: Hitzig, aber wenig hilfreich',
- 'description': 'Flüchtlingsdebatte: Hitzig, aber wenig hilfreich',
+ 'title': 'Viele Baustellen für neuen BND-Chef',
+ 'description': 'md5:1e69a54be3e1255b2b07cdbce5bcd8b4',
'thumbnail': 're:^https?:.*\.jpg$',
},
}, {
+ 'url': 'http://www.tagesschau.de/inland/afd-parteitag-135.html',
+ 'info_dict': {
+ 'id': 'afd-parteitag-135',
+ 'title': 'Möchtegern-Underdog mit Machtanspruch',
+ },
+ 'playlist_count': 2,
+ }, {
'url': 'http://www.tagesschau.de/multimedia/sendung/tsg-3771.html',
'only_matching': True,
}, {
@@ -61,88 +197,108 @@ class TagesschauIE(InfoExtractor):
}, {
'url': 'http://www.tagesschau.de/multimedia/video/video-102303~_bab-sendung-211.html',
'only_matching': True,
+ }, {
+ 'url': 'http://www.tagesschau.de/100sekunden/index.html',
+ 'only_matching': True,
+ }, {
+ # playlist article with collapsing sections
+ 'url': 'http://www.tagesschau.de/wirtschaft/faq-freihandelszone-eu-usa-101.html',
+ 'only_matching': True,
}]
- _FORMATS = {
- 's': {'width': 256, 'height': 144, 'quality': 1},
- 'm': {'width': 512, 'height': 288, 'quality': 2},
- 'l': {'width': 960, 'height': 544, 'quality': 3},
- }
+ @classmethod
+ def suitable(cls, url):
+ return False if TagesschauPlayerIE.suitable(url) else super(TagesschauIE, cls).suitable(url)
+
+ def _extract_formats(self, download_text, media_kind):
+ links = re.finditer(
+ r'<div class="button" title="(?P<title>[^"]*)"><a href="(?P<url>[^"]+)">(?P<name>.+?)</a></div>',
+ download_text)
+ formats = []
+ for l in links:
+ link_url = l.group('url')
+ if not link_url:
+ continue
+ format_id = self._search_regex(
+ r'.*/[^/.]+\.([^/]+)\.[^/.]+$', link_url, 'format ID',
+ default=determine_ext(link_url))
+ format = {
+ 'format_id': format_id,
+ 'url': l.group('url'),
+ 'format_name': l.group('name'),
+ }
+ title = l.group('title')
+ if title:
+ if media_kind.lower() == 'video':
+ m = re.match(
+ r'''(?x)
+ Video:\s*(?P<vcodec>[a-zA-Z0-9/._-]+)\s*&\#10;
+ (?P<width>[0-9]+)x(?P<height>[0-9]+)px&\#10;
+ (?P<vbr>[0-9]+)kbps&\#10;
+ Audio:\s*(?P<abr>[0-9]+)kbps,\s*(?P<audio_desc>[A-Za-z\.0-9]+)&\#10;
+ Gr&ouml;&szlig;e:\s*(?P<filesize_approx>[0-9.,]+\s+[a-zA-Z]*B)''',
+ title)
+ if m:
+ format.update({
+ 'format_note': m.group('audio_desc'),
+ 'vcodec': m.group('vcodec'),
+ 'width': int(m.group('width')),
+ 'height': int(m.group('height')),
+ 'abr': int(m.group('abr')),
+ 'vbr': int(m.group('vbr')),
+ 'filesize_approx': parse_filesize(m.group('filesize_approx')),
+ })
+ else:
+ m = re.match(
+ r'(?P<format>.+?)-Format\s*:\s*(?P<abr>\d+)kbps\s*,\s*(?P<note>.+)',
+ title)
+ if m:
+ format.update({
+ 'format_note': '%s, %s' % (m.group('format'), m.group('note')),
+ 'vcodec': 'none',
+ 'abr': int(m.group('abr')),
+ })
+ formats.append(format)
+ self._sort_formats(formats)
+ return formats
def _real_extract(self, url):
- video_id = self._match_id(url)
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id') or mobj.group('path')
display_id = video_id.lstrip('-')
+
webpage = self._download_webpage(url, display_id)
- player_url = self._html_search_meta(
- 'twitter:player', webpage, 'player URL', default=None)
- if player_url:
- playerpage = self._download_webpage(
- player_url, display_id, 'Downloading player page')
-
- formats = []
- for media in re.finditer(
- r'''(?x)
- (?P<q_url>["\'])(?P<url>http://media.+?)(?P=q_url)
- ,\s*type:(?P<q_type>["\'])(?P<type>video|audio)/(?P<ext>.+?)(?P=q_type)
- (?:,\s*quality:(?P<q_quality>["\'])(?P<quality>.+?)(?P=q_quality))?
- ''', playerpage):
- url = media.group('url')
- type_ = media.group('type')
- ext = media.group('ext')
- res = media.group('quality')
- f = {
- 'format_id': '%s_%s' % (res, ext) if res else ext,
- 'url': url,
- 'ext': ext,
- 'vcodec': 'none' if type_ == 'audio' else None,
- }
- f.update(self._FORMATS.get(res, {}))
- formats.append(f)
- thumbnail = self._og_search_thumbnail(playerpage)
- title = self._og_search_title(webpage).strip()
- description = self._og_search_description(webpage).strip()
- else:
+ title = self._html_search_regex(
+ r'<span[^>]*class="headline"[^>]*>(.+?)</span>',
+ webpage, 'title', default=None) or self._og_search_title(webpage)
+
+ DOWNLOAD_REGEX = r'(?s)<p>Wir bieten dieses (?P<kind>Video|Audio) in folgenden Formaten zum Download an:</p>\s*<div class="controls">(?P<links>.*?)</div>\s*<p>'
+
+ webpage_type = self._og_search_property('type', webpage, default=None)
+ if webpage_type == 'website': # Article
+ entries = []
+ for num, (entry_title, media_kind, download_text) in enumerate(re.findall(
+ r'(?s)<p[^>]+class="infotext"[^>]*>\s*(?:<a[^>]+>)?\s*<strong>(.+?)</strong>.*?</p>.*?%s' % DOWNLOAD_REGEX,
+ webpage), 1):
+ entries.append({
+ 'id': '%s-%d' % (display_id, num),
+ 'title': '%s' % entry_title,
+ 'formats': self._extract_formats(download_text, media_kind),
+ })
+ if len(entries) > 1:
+ return self.playlist_result(entries, display_id, title)
+ formats = entries[0]['formats']
+ else: # Assume single video
download_text = self._search_regex(
- r'(?s)<p>Wir bieten dieses Video in folgenden Formaten zum Download an:</p>\s*<div class="controls">(.*?)</div>\s*<p>',
- webpage, 'download links')
- links = re.finditer(
- r'<div class="button" title="(?P<title>[^"]*)"><a href="(?P<url>[^"]+)">(?P<name>.+?)</a></div>',
- download_text)
- formats = []
- for l in links:
- format_id = self._search_regex(
- r'.*/[^/.]+\.([^/]+)\.[^/.]+', l.group('url'), 'format ID')
- format = {
- 'format_id': format_id,
- 'url': l.group('url'),
- 'format_name': l.group('name'),
- }
- m = re.match(
- r'''(?x)
- Video:\s*(?P<vcodec>[a-zA-Z0-9/._-]+)\s*&\#10;
- (?P<width>[0-9]+)x(?P<height>[0-9]+)px&\#10;
- (?P<vbr>[0-9]+)kbps&\#10;
- Audio:\s*(?P<abr>[0-9]+)kbps,\s*(?P<audio_desc>[A-Za-z\.0-9]+)&\#10;
- Gr&ouml;&szlig;e:\s*(?P<filesize_approx>[0-9.,]+\s+[a-zA-Z]*B)''',
- l.group('title'))
- if m:
- format.update({
- 'format_note': m.group('audio_desc'),
- 'vcodec': m.group('vcodec'),
- 'width': int(m.group('width')),
- 'height': int(m.group('height')),
- 'abr': int(m.group('abr')),
- 'vbr': int(m.group('vbr')),
- 'filesize_approx': parse_filesize(m.group('filesize_approx')),
- })
- formats.append(format)
- thumbnail = self._og_search_thumbnail(webpage)
- description = self._html_search_regex(
- r'(?s)<p class="teasertext">(.*?)</p>',
- webpage, 'description', default=None)
- title = self._html_search_regex(
- r'<span class="headline".*?>(.*?)</span>', webpage, 'title')
+ DOWNLOAD_REGEX, webpage, 'download links', group='links')
+ media_kind = self._search_regex(
+ DOWNLOAD_REGEX, webpage, 'media kind', default='Video', group='kind')
+ formats = self._extract_formats(download_text, media_kind)
+ thumbnail = self._og_search_thumbnail(webpage)
+ description = self._html_search_regex(
+ r'(?s)<p class="teasertext">(.*?)</p>',
+ webpage, 'description', default=None)
self._sort_formats(formats)
diff --git a/youtube_dl/extractor/tdslifeway.py b/youtube_dl/extractor/tdslifeway.py
new file mode 100644
index 0000000..4d1f5c8
--- /dev/null
+++ b/youtube_dl/extractor/tdslifeway.py
@@ -0,0 +1,33 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class TDSLifewayIE(InfoExtractor):
+ _VALID_URL = r'https?://tds\.lifeway\.com/v1/trainingdeliverysystem/courses/(?P<id>\d+)/index\.html'
+
+ _TEST = {
+ # From http://www.ministrygrid.com/training-viewer/-/training/t4g-2014-conference/the-gospel-by-numbers-4/the-gospel-by-numbers
+ 'url': 'http://tds.lifeway.com/v1/trainingdeliverysystem/courses/3453494717001/index.html?externalRegistration=AssetId%7C34F466F1-78F3-4619-B2AB-A8EFFA55E9E9%21InstanceId%7C0%21UserId%7Caaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa&grouping=http%3A%2F%2Flifeway.com%2Fvideo%2F3453494717001&activity_id=http%3A%2F%2Flifeway.com%2Fvideo%2F3453494717001&content_endpoint=http%3A%2F%2Ftds.lifeway.com%2Fv1%2Ftrainingdeliverysystem%2FScormEngineInterface%2FTCAPI%2Fcontent%2F&actor=%7B%22name%22%3A%5B%22Guest%20Guest%22%5D%2C%22account%22%3A%5B%7B%22accountServiceHomePage%22%3A%22http%3A%2F%2Fscorm.lifeway.com%2F%22%2C%22accountName%22%3A%22aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa%22%7D%5D%2C%22objectType%22%3A%22Agent%22%7D&content_token=462a50b2-b6f9-4970-99b1-930882c499fb&registration=93d6ec8e-7f7b-4ed3-bbc8-a857913c0b2a&externalConfiguration=access%7CFREE%21adLength%7C-1%21assignOrgId%7C4AE36F78-299A-425D-91EF-E14A899B725F%21assignOrgParentId%7C%21courseId%7C%21isAnonymous%7Cfalse%21previewAsset%7Cfalse%21previewLength%7C-1%21previewMode%7Cfalse%21royalty%7CFREE%21sessionId%7C671422F9-8E79-48D4-9C2C-4EE6111EA1CD%21trackId%7C&auth=Basic%20OjhmZjk5MDBmLTBlYTMtNDJhYS04YjFlLWE4MWQ3NGNkOGRjYw%3D%3D&endpoint=http%3A%2F%2Ftds.lifeway.com%2Fv1%2Ftrainingdeliverysystem%2FScormEngineInterface%2FTCAPI%2F',
+ 'info_dict': {
+ 'id': '3453494717001',
+ 'ext': 'mp4',
+ 'title': 'The Gospel by Numbers',
+ 'thumbnail': 're:^https?://.*\.jpg',
+ 'upload_date': '20140410',
+ 'description': 'Coming soon from T4G 2014!',
+ 'uploader_id': '2034960640001',
+ 'timestamp': 1397145591,
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ 'add_ie': ['BrightcoveNew'],
+ }
+
+ BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/2034960640001/default_default/index.html?videoId=%s'
+
+ def _real_extract(self, url):
+ brightcove_id = self._match_id(url)
+ return self.url_result(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', brightcove_id)
diff --git a/youtube_dl/extractor/teachingchannel.py b/youtube_dl/extractor/teachingchannel.py
index e047738..d14d93e 100644
--- a/youtube_dl/extractor/teachingchannel.py
+++ b/youtube_dl/extractor/teachingchannel.py
@@ -11,6 +11,7 @@ class TeachingChannelIE(InfoExtractor):
_TEST = {
'url': 'https://www.teachingchannel.org/videos/teacher-teaming-evolution',
+ 'md5': '3d6361864d7cac20b57c8784da17166f',
'info_dict': {
'id': 'F3bnlzbToeI6pLEfRyrlfooIILUjz4nM',
'ext': 'mp4',
@@ -19,9 +20,9 @@ class TeachingChannelIE(InfoExtractor):
'duration': 422.255,
},
'params': {
- # m3u8 download
'skip_download': True,
},
+ 'add_ie': ['Ooyala'],
}
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py
index d1b7264..79a7789 100644
--- a/youtube_dl/extractor/teamcoco.py
+++ b/youtube_dl/extractor/teamcoco.py
@@ -16,7 +16,7 @@ from ..compat import compat_ord
class TeamcocoIE(InfoExtractor):
- _VALID_URL = r'http://teamcoco\.com/video/(?P<video_id>[0-9]+)?/?(?P<display_id>.*)'
+ _VALID_URL = r'https?://teamcoco\.com/video/(?P<video_id>[0-9]+)?/?(?P<display_id>.*)'
_TESTS = [
{
'url': 'http://teamcoco.com/video/80187/conan-becomes-a-mary-kay-beauty-consultant',
@@ -88,7 +88,7 @@ class TeamcocoIE(InfoExtractor):
preload_codes = self._html_search_regex(
r'(function.+)setTimeout\(function\(\)\{playlist',
webpage, 'preload codes')
- base64_fragments = re.findall(r'"([a-zA-z0-9+/=]+)"', preload_codes)
+ base64_fragments = re.findall(r'"([a-zA-Z0-9+/=]+)"', preload_codes)
base64_fragments.remove('init')
def _check_sequence(cur_fragments):
diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py
index a48d77c..451cde7 100644
--- a/youtube_dl/extractor/ted.py
+++ b/youtube_dl/extractor/ted.py
@@ -27,7 +27,7 @@ class TEDIE(InfoExtractor):
'''
_TESTS = [{
'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
- 'md5': 'fc94ac279feebbce69f21c0c6ee82810',
+ 'md5': '0de43ac406aa3e4ea74b66c9c7789b13',
'info_dict': {
'id': '102',
'ext': 'mp4',
@@ -37,21 +37,26 @@ class TEDIE(InfoExtractor):
'consciousness, but that half the time our brains are '
'actively fooling us.'),
'uploader': 'Dan Dennett',
- 'width': 854,
+ 'width': 853,
'duration': 1308,
}
}, {
'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms',
- 'md5': '226f4fb9c62380d11b7995efa4c87994',
+ 'md5': 'b899ac15e345fb39534d913f7606082b',
'info_dict': {
- 'id': 'vishal-sikka-the-beauty-and-power-of-algorithms',
+ 'id': 'tSVI8ta_P4w',
'ext': 'mp4',
'title': 'Vishal Sikka: The beauty and power of algorithms',
'thumbnail': 're:^https?://.+\.jpg',
- 'description': 'Adaptive, intelligent, and consistent, algorithms are emerging as the ultimate app for everything from matching consumers to products to assessing medical diagnoses. Vishal Sikka shares his appreciation for the algorithm, charting both its inherent beauty and its growing power.',
- }
+ 'description': 'md5:6261fdfe3e02f4f579cbbfc00aff73f4',
+ 'upload_date': '20140122',
+ 'uploader_id': 'TEDInstitute',
+ 'uploader': 'TED Institute',
+ },
+ 'add_ie': ['Youtube'],
}, {
'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best',
+ 'md5': '71b3ab2f4233012dce09d515c9c39ce2',
'info_dict': {
'id': '1972',
'ext': 'mp4',
@@ -73,7 +78,7 @@ class TEDIE(InfoExtractor):
'add_ie': ['Youtube'],
'info_dict': {
'id': '_ZG8HBuDjgc',
- 'ext': 'mp4',
+ 'ext': 'webm',
'title': 'Douglas Adams: Parrots the Universe and Everything',
'description': 'md5:01ad1e199c49ac640cb1196c0e9016af',
'uploader': 'University of California Television (UCTV)',
@@ -102,9 +107,9 @@ class TEDIE(InfoExtractor):
}]
_NATIVE_FORMATS = {
- 'low': {'preference': 1, 'width': 320, 'height': 180},
- 'medium': {'preference': 2, 'width': 512, 'height': 288},
- 'high': {'preference': 3, 'width': 854, 'height': 480},
+ 'low': {'width': 320, 'height': 180},
+ 'medium': {'width': 512, 'height': 288},
+ 'high': {'width': 854, 'height': 480},
}
def _extract_info(self, webpage):
@@ -171,15 +176,21 @@ class TEDIE(InfoExtractor):
if finfo:
f.update(finfo)
+ http_url = None
for format_id, resources in talk_info['resources'].items():
if format_id == 'h264':
for resource in resources:
+ h264_url = resource.get('file')
+ if not h264_url:
+ continue
bitrate = int_or_none(resource.get('bitrate'))
formats.append({
- 'url': resource['file'],
+ 'url': h264_url,
'format_id': '%s-%sk' % (format_id, bitrate),
'tbr': bitrate,
})
+ if re.search('\d+k', h264_url):
+ http_url = h264_url
elif format_id == 'rtmp':
streamer = talk_info.get('streamer')
if not streamer:
@@ -195,16 +206,24 @@ class TEDIE(InfoExtractor):
'tbr': int_or_none(resource.get('bitrate')),
})
elif format_id == 'hls':
- hls_formats = self._extract_m3u8_formats(
- resources.get('stream'), video_name, 'mp4', m3u8_id=format_id)
- for f in hls_formats:
- if f.get('format_id') == 'hls-meta':
- continue
- if not f.get('height'):
- f['vcodec'] = 'none'
- else:
- f['acodec'] = 'none'
- formats.extend(hls_formats)
+ formats.extend(self._extract_m3u8_formats(
+ resources.get('stream'), video_name, 'mp4', m3u8_id=format_id, fatal=False))
+
+ m3u8_formats = list(filter(
+ lambda f: f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none' and f.get('resolution') != 'multiple',
+ formats))
+ if http_url:
+ for m3u8_format in m3u8_formats:
+ bitrate = self._search_regex(r'(\d+k)', m3u8_format['url'], 'bitrate', default=None)
+ if not bitrate:
+ continue
+ f = m3u8_format.copy()
+ f.update({
+ 'url': re.sub(r'\d+k', bitrate, http_url),
+ 'format_id': m3u8_format['format_id'].replace('hls', 'http'),
+ 'protocol': 'http',
+ })
+ formats.append(f)
audio_download = talk_info.get('audioDownload')
if audio_download:
@@ -212,7 +231,6 @@ class TEDIE(InfoExtractor):
'url': audio_download,
'format_id': 'audio',
'vcodec': 'none',
- 'preference': -0.5,
})
self._sort_formats(formats)
@@ -254,7 +272,11 @@ class TEDIE(InfoExtractor):
config_json = self._html_search_regex(
r'"pages\.jwplayer"\s*,\s*({.+?})\s*\)\s*</script>',
- webpage, 'config')
+ webpage, 'config', default=None)
+ if not config_json:
+ embed_url = self._search_regex(
+ r"<iframe[^>]+class='pages-video-embed__video__object'[^>]+src='([^']+)'", webpage, 'embed url')
+ return self.url_result(self._proto_relative_url(embed_url))
config = json.loads(config_json)['config']
video_url = config['video']['url']
thumbnail = config.get('image', {}).get('url')
diff --git a/youtube_dl/extractor/tele13.py b/youtube_dl/extractor/tele13.py
index 4e860db..a29a64b 100644
--- a/youtube_dl/extractor/tele13.py
+++ b/youtube_dl/extractor/tele13.py
@@ -11,7 +11,7 @@ from ..utils import (
class Tele13IE(InfoExtractor):
- _VALID_URL = r'^http://(?:www\.)?t13\.cl/videos(?:/[^/]+)+/(?P<id>[\w-]+)'
+ _VALID_URL = r'^https?://(?:www\.)?t13\.cl/videos(?:/[^/]+)+/(?P<id>[\w-]+)'
_TESTS = [
{
'url': 'http://www.t13.cl/videos/actualidad/el-circulo-de-hierro-de-michelle-bachelet-en-su-regreso-a-la-moneda',
diff --git a/youtube_dl/extractor/telebruxelles.py b/youtube_dl/extractor/telebruxelles.py
index a3d05f9..eefecc4 100644
--- a/youtube_dl/extractor/telebruxelles.py
+++ b/youtube_dl/extractor/telebruxelles.py
@@ -1,11 +1,13 @@
# coding: utf-8
from __future__ import unicode_literals
+import re
+
from .common import InfoExtractor
class TeleBruxellesIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?telebruxelles\.be/(news|sport|dernier-jt)/?(?P<id>[^/#?]+)'
+ _VALID_URL = r'https?://(?:www\.)?(?:telebruxelles|bx1)\.be/(news|sport|dernier-jt)/?(?P<id>[^/#?]+)'
_TESTS = [{
'url': 'http://www.telebruxelles.be/news/auditions-devant-parlement-francken-galant-tres-attendus/',
'md5': '59439e568c9ee42fb77588b2096b214f',
@@ -39,18 +41,18 @@ class TeleBruxellesIE(InfoExtractor):
webpage = self._download_webpage(url, display_id)
article_id = self._html_search_regex(
- r"<article id=\"post-(\d+)\"", webpage, 'article ID')
+ r"<article id=\"post-(\d+)\"", webpage, 'article ID', default=None)
title = self._html_search_regex(
r'<h1 class=\"entry-title\">(.*?)</h1>', webpage, 'title')
- description = self._og_search_description(webpage)
+ description = self._og_search_description(webpage, default=None)
rtmp_url = self._html_search_regex(
- r"file: \"(rtmp://\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}/vod/mp4:\" \+ \"\w+\" \+ \".mp4)\"",
+ r'file\s*:\s*"(rtmp://[^/]+/vod/mp4:"\s*\+\s*"[^"]+"\s*\+\s*".mp4)"',
webpage, 'RTMP url')
- rtmp_url = rtmp_url.replace("\" + \"", "")
+ rtmp_url = re.sub(r'"\s*\+\s*"', '', rtmp_url)
return {
- 'id': article_id,
+ 'id': article_id or display_id,
'display_id': display_id,
'title': title,
'description': description,
diff --git a/youtube_dl/extractor/telecinco.py b/youtube_dl/extractor/telecinco.py
index 2c8e9b9..4b4b740 100644
--- a/youtube_dl/extractor/telecinco.py
+++ b/youtube_dl/extractor/telecinco.py
@@ -5,8 +5,8 @@ import json
from .common import InfoExtractor
from ..compat import (
- compat_urllib_parse,
compat_urllib_parse_unquote,
+ compat_urllib_parse_urlencode,
compat_urlparse,
)
from ..utils import (
@@ -74,7 +74,7 @@ class TelecincoIE(InfoExtractor):
info_el = self._download_xml(info_url, episode).find('./video/info')
video_link = info_el.find('videoUrl/link').text
- token_query = compat_urllib_parse.urlencode({'id': video_link})
+ token_query = compat_urllib_parse_urlencode({'id': video_link})
token_info = self._download_json(
embed_data['flashvars']['ov_tk'] + '?' + token_query,
episode,
@@ -82,6 +82,7 @@ class TelecincoIE(InfoExtractor):
)
formats = self._extract_m3u8_formats(
token_info['tokenizedUrl'], episode, ext='mp4', entry_protocol='m3u8_native')
+ self._sort_formats(formats)
return {
'id': embed_data['videoId'],
diff --git a/youtube_dl/extractor/telegraaf.py b/youtube_dl/extractor/telegraaf.py
index 6f8333c..9092e9b 100644
--- a/youtube_dl/extractor/telegraaf.py
+++ b/youtube_dl/extractor/telegraaf.py
@@ -2,14 +2,16 @@
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..utils import remove_end
+from ..utils import (
+ determine_ext,
+ remove_end,
+)
class TelegraafIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?telegraaf\.nl/tv/(?:[^/]+/)+(?P<id>\d+)/[^/]+\.html'
_TEST = {
'url': 'http://www.telegraaf.nl/tv/nieuws/binnenland/24353229/__Tikibad_ontruimd_wegens_brand__.html',
- 'md5': '83245a9779bcc4a24454bfd53c65b6dc',
'info_dict': {
'id': '24353229',
'ext': 'mp4',
@@ -18,18 +20,60 @@ class TelegraafIE(InfoExtractor):
'thumbnail': 're:^https?://.*\.jpg$',
'duration': 33,
},
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
}
def _real_extract(self, url):
- playlist_id = self._match_id(url)
+ video_id = self._match_id(url)
- webpage = self._download_webpage(url, playlist_id)
+ webpage = self._download_webpage(url, video_id)
+ player_url = self._html_search_regex(
+ r'<iframe[^>]+src="([^"]+")', webpage, 'player URL')
+ player_page = self._download_webpage(
+ player_url, video_id, note='Download player webpage')
playlist_url = self._search_regex(
- r"iframe\.loadPlayer\('([^']+)'", webpage, 'player')
+ r'playlist\s*:\s*"([^"]+)"', player_page, 'playlist URL')
+ playlist_data = self._download_json(playlist_url, video_id)
+
+ item = playlist_data['items'][0]
+ formats = []
+ locations = item['locations']
+ for location in locations.get('adaptive', []):
+ manifest_url = location['src']
+ ext = determine_ext(manifest_url)
+ if ext == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ manifest_url, video_id, ext='mp4', m3u8_id='hls'))
+ elif ext == 'mpd':
+ # TODO: Current DASH formats are broken - $Time$ pattern in
+ # <SegmentTemplate> not implemented yet
+ continue
+ else:
+ self.report_warning('Unknown adaptive format %s' % ext)
+ for location in locations.get('progressive', []):
+ formats.append({
+ 'url': location['sources'][0]['src'],
+ 'width': location.get('width'),
+ 'height': location.get('height'),
+ 'format_id': 'http-%s' % location['label'],
+ })
+
+ self._sort_formats(formats)
- entries = self._extract_xspf_playlist(playlist_url, playlist_id)
title = remove_end(self._og_search_title(webpage), ' - VIDEO')
description = self._og_search_description(webpage)
+ duration = item.get('duration')
+ thumbnail = item.get('poster')
- return self.playlist_result(entries, playlist_id, title, description)
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'formats': formats,
+ 'duration': duration,
+ 'thumbnail': thumbnail,
+ }
diff --git a/youtube_dl/extractor/telewebion.py b/youtube_dl/extractor/telewebion.py
new file mode 100644
index 0000000..77916c6
--- /dev/null
+++ b/youtube_dl/extractor/telewebion.py
@@ -0,0 +1,55 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class TelewebionIE(InfoExtractor):
+ _VALID_URL = r'https?://www\.telewebion\.com/#!/episode/(?P<id>\d+)'
+
+ _TEST = {
+ 'url': 'http://www.telewebion.com/#!/episode/1263668/',
+ 'info_dict': {
+ 'id': '1263668',
+ 'ext': 'mp4',
+ 'title': 'قرعه\u200cکشی لیگ قهرمانان اروپا',
+ 'thumbnail': 're:^https?://.*\.jpg',
+ 'view_count': int,
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ secure_token = self._download_webpage(
+ 'http://m.s2.telewebion.com/op/op?action=getSecurityToken', video_id)
+ episode_details = self._download_json(
+ 'http://m.s2.telewebion.com/op/op', video_id,
+ query={'action': 'getEpisodeDetails', 'episode_id': video_id})
+
+ m3u8_url = 'http://m.s1.telewebion.com/smil/%s.m3u8?filepath=%s&m3u8=1&secure_token=%s' % (
+ video_id, episode_details['file_path'], secure_token)
+ formats = self._extract_m3u8_formats(
+ m3u8_url, video_id, ext='mp4', m3u8_id='hls')
+
+ picture_paths = [
+ episode_details.get('picture_path'),
+ episode_details.get('large_picture_path'),
+ ]
+
+ thumbnails = [{
+ 'url': picture_path,
+ 'preference': idx,
+ } for idx, picture_path in enumerate(picture_paths) if picture_path is not None]
+
+ return {
+ 'id': video_id,
+ 'title': episode_details['title'],
+ 'formats': formats,
+ 'thumbnails': thumbnails,
+ 'view_count': episode_details.get('view_count'),
+ }
diff --git a/youtube_dl/extractor/tenplay.py b/youtube_dl/extractor/tenplay.py
deleted file mode 100644
index 02a31a6..0000000
--- a/youtube_dl/extractor/tenplay.py
+++ /dev/null
@@ -1,90 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-from .common import InfoExtractor
-from ..utils import (
- int_or_none,
- float_or_none,
-)
-
-
-class TenPlayIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?ten(play)?\.com\.au/.+'
- _TEST = {
- 'url': 'http://tenplay.com.au/ten-insider/extra/season-2013/tenplay-tv-your-way',
- 'info_dict': {
- 'id': '2695695426001',
- 'ext': 'flv',
- 'title': 'TENplay: TV your way',
- 'description': 'Welcome to a new TV experience. Enjoy a taste of the TENplay benefits.',
- 'timestamp': 1380150606.889,
- 'upload_date': '20130925',
- 'uploader': 'TENplay',
- },
- 'params': {
- 'skip_download': True, # Requires rtmpdump
- }
- }
-
- _video_fields = [
- 'id', 'name', 'shortDescription', 'longDescription', 'creationDate',
- 'publishedDate', 'lastModifiedDate', 'customFields', 'videoStillURL',
- 'thumbnailURL', 'referenceId', 'length', 'playsTotal',
- 'playsTrailingWeek', 'renditions', 'captioning', 'startDate', 'endDate']
-
- def _real_extract(self, url):
- webpage = self._download_webpage(url, url)
- video_id = self._html_search_regex(
- r'videoID: "(\d+?)"', webpage, 'video_id')
- api_token = self._html_search_regex(
- r'apiToken: "([a-zA-Z0-9-_\.]+?)"', webpage, 'api_token')
- title = self._html_search_regex(
- r'<meta property="og:title" content="\s*(.*?)\s*"\s*/?\s*>',
- webpage, 'title')
-
- json = self._download_json('https://api.brightcove.com/services/library?command=find_video_by_id&video_id=%s&token=%s&video_fields=%s' % (video_id, api_token, ','.join(self._video_fields)), title)
-
- formats = []
- for rendition in json['renditions']:
- url = rendition['remoteUrl'] or rendition['url']
- protocol = 'rtmp' if url.startswith('rtmp') else 'http'
- ext = 'flv' if protocol == 'rtmp' else rendition['videoContainer'].lower()
-
- if protocol == 'rtmp':
- url = url.replace('&mp4:', '')
-
- tbr = int_or_none(rendition.get('encodingRate'), 1000)
-
- formats.append({
- 'format_id': '_'.join(
- ['rtmp', rendition['videoContainer'].lower(),
- rendition['videoCodec'].lower(), '%sk' % tbr]),
- 'width': int_or_none(rendition['frameWidth']),
- 'height': int_or_none(rendition['frameHeight']),
- 'tbr': tbr,
- 'filesize': int_or_none(rendition['size']),
- 'protocol': protocol,
- 'ext': ext,
- 'vcodec': rendition['videoCodec'].lower(),
- 'container': rendition['videoContainer'].lower(),
- 'url': url,
- })
- self._sort_formats(formats)
-
- return {
- 'id': video_id,
- 'display_id': json['referenceId'],
- 'title': json['name'],
- 'description': json['shortDescription'] or json['longDescription'],
- 'formats': formats,
- 'thumbnails': [{
- 'url': json['videoStillURL']
- }, {
- 'url': json['thumbnailURL']
- }],
- 'thumbnail': json['videoStillURL'],
- 'duration': float_or_none(json.get('length'), 1000),
- 'timestamp': float_or_none(json.get('creationDate'), 1000),
- 'uploader': json.get('customFields', {}).get('production_company_distributor') or 'TENplay',
- 'view_count': int_or_none(json.get('playsTotal')),
- }
diff --git a/youtube_dl/extractor/tf1.py b/youtube_dl/extractor/tf1.py
index e1a64e2..e595c4a 100644
--- a/youtube_dl/extractor/tf1.py
+++ b/youtube_dl/extractor/tf1.py
@@ -6,7 +6,7 @@ from .common import InfoExtractor
class TF1IE(InfoExtractor):
"""TF1 uses the wat.tv player."""
- _VALID_URL = r'http://(?:(?:videos|www|lci)\.tf1|www\.tfou)\.fr/(?:[^/]+/)*(?P<id>.+?)\.html'
+ _VALID_URL = r'https?://(?:(?:videos|www|lci)\.tf1|(?:www\.)?(?:tfou|ushuaiatv|histoire|tvbreizh))\.fr/(?:[^/]+/)*(?P<id>[^/?#.]+)'
_TESTS = [{
'url': 'http://videos.tf1.fr/auto-moto/citroen-grand-c4-picasso-2013-presentation-officielle-8062060.html',
'info_dict': {
diff --git a/youtube_dl/extractor/theonion.py b/youtube_dl/extractor/theonion.py
deleted file mode 100644
index 10239c9..0000000
--- a/youtube_dl/extractor/theonion.py
+++ /dev/null
@@ -1,63 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import re
-
-from .common import InfoExtractor
-
-
-class TheOnionIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?theonion\.com/video/[^,]+,(?P<id>[0-9]+)/?'
- _TEST = {
- 'url': 'http://www.theonion.com/video/man-wearing-mm-jacket-gods-image,36918/',
- 'md5': '19eaa9a39cf9b9804d982e654dc791ee',
- 'info_dict': {
- 'id': '2133',
- 'ext': 'mp4',
- 'title': 'Man Wearing M&M Jacket Apparently Made In God\'s Image',
- 'description': 'md5:cc12448686b5600baae9261d3e180910',
- 'thumbnail': 're:^https?://.*\.jpg\?\d+$',
- }
- }
-
- def _real_extract(self, url):
- display_id = self._match_id(url)
- webpage = self._download_webpage(url, display_id)
-
- video_id = self._search_regex(
- r'"videoId":\s(\d+),', webpage, 'video ID')
- title = self._og_search_title(webpage)
- description = self._og_search_description(webpage)
- thumbnail = self._og_search_thumbnail(webpage)
-
- sources = re.findall(r'<source src="([^"]+)" type="([^"]+)"', webpage)
- formats = []
- for src, type_ in sources:
- if type_ == 'video/mp4':
- formats.append({
- 'format_id': 'mp4_sd',
- 'preference': 1,
- 'url': src,
- })
- elif type_ == 'video/webm':
- formats.append({
- 'format_id': 'webm_sd',
- 'preference': 0,
- 'url': src,
- })
- elif type_ == 'application/x-mpegURL':
- formats.extend(
- self._extract_m3u8_formats(src, display_id, preference=-1))
- else:
- self.report_warning(
- 'Encountered unexpected format: %s' % type_)
- self._sort_formats(formats)
-
- return {
- 'id': video_id,
- 'display_id': display_id,
- 'title': title,
- 'formats': formats,
- 'thumbnail': thumbnail,
- 'description': description,
- }
diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py
index 93d8715..07d222a 100644
--- a/youtube_dl/extractor/theplatform.py
+++ b/youtube_dl/extractor/theplatform.py
@@ -8,7 +8,7 @@ import binascii
import hashlib
-from .common import InfoExtractor
+from .once import OnceIE
from ..compat import (
compat_parse_qs,
compat_urllib_parse_urlparse,
@@ -20,40 +20,43 @@ from ..utils import (
int_or_none,
sanitized_Request,
unsmuggle_url,
+ update_url_query,
xpath_with_ns,
mimetype2ext,
+ find_xpath_attr,
)
default_ns = 'http://www.w3.org/2005/SMIL21/Language'
_x = lambda p: xpath_with_ns(p, {'smil': default_ns})
-class ThePlatformBaseIE(InfoExtractor):
+class ThePlatformBaseIE(OnceIE):
def _extract_theplatform_smil(self, smil_url, video_id, note='Downloading SMIL data'):
- meta = self._download_xml(smil_url, video_id, note=note)
- try:
- error_msg = next(
- n.attrib['abstract']
- for n in meta.findall(_x('.//smil:ref'))
- if n.attrib.get('title') == 'Geographic Restriction' or n.attrib.get('title') == 'Expired')
- except StopIteration:
- pass
- else:
- raise ExtractorError(error_msg, expected=True)
+ meta = self._download_xml(smil_url, video_id, note=note, query={'format': 'SMIL'})
+ error_element = find_xpath_attr(meta, _x('.//smil:ref'), 'src')
+ if error_element is not None and error_element.attrib['src'].startswith(
+ 'http://link.theplatform.com/s/errorFiles/Unavailable.'):
+ raise ExtractorError(error_element.attrib['abstract'], expected=True)
- formats = self._parse_smil_formats(
+ smil_formats = self._parse_smil_formats(
meta, smil_url, video_id, namespace=default_ns,
# the parameters are from syfy.com, other sites may use others,
# they also work for nbc.com
f4m_params={'g': 'UXWGVKRWHFSP', 'hdcore': '3.0.3'},
transform_rtmp_url=lambda streamer, src: (streamer, 'mp4:' + src))
- for _format in formats:
- ext = determine_ext(_format['url'])
- if ext == 'once':
- _format['ext'] = 'mp4'
+ formats = []
+ for _format in smil_formats:
+ if OnceIE.suitable(_format['url']):
+ formats.extend(self._extract_once_formats(_format['url']))
+ else:
+ media_url = _format['url']
+ if determine_ext(media_url) == 'm3u8':
+ hdnea2 = self._get_cookies(media_url).get('hdnea2')
+ if hdnea2:
+ _format['url'] = update_url_query(media_url, {'hdnea3': hdnea2.value})
- self._sort_formats(formats)
+ formats.append(_format)
subtitles = self._parse_smil_subtitles(meta, default_ns)
@@ -79,13 +82,15 @@ class ThePlatformBaseIE(InfoExtractor):
'description': info['description'],
'thumbnail': info['defaultThumbnailUrl'],
'duration': int_or_none(info.get('duration'), 1000),
+ 'timestamp': int_or_none(info.get('pubDate'), 1000) or None,
+ 'uploader': info.get('billingCode'),
}
class ThePlatformIE(ThePlatformBaseIE):
_VALID_URL = r'''(?x)
(?:https?://(?:link|player)\.theplatform\.com/[sp]/(?P<provider_id>[^/]+)/
- (?:(?P<media>(?:(?:[^/]+/)+select/)?media/)|(?P<config>(?:[^/\?]+/(?:swf|config)|onsite)/select/))?
+ (?:(?:(?:[^/]+/)+select/)?(?P<media>media/(?:guid/\d+/)?)|(?P<config>(?:[^/\?]+/(?:swf|config)|onsite)/select/))?
|theplatform:)(?P<id>[^/\?&]+)'''
_TESTS = [{
@@ -97,6 +102,9 @@ class ThePlatformIE(ThePlatformBaseIE):
'title': 'Blackberry\'s big, bold Z30',
'description': 'The Z30 is Blackberry\'s biggest, baddest mobile messaging device yet.',
'duration': 247,
+ 'timestamp': 1383239700,
+ 'upload_date': '20131031',
+ 'uploader': 'CBSI-NEW',
},
'params': {
# rtmp download
@@ -110,6 +118,9 @@ class ThePlatformIE(ThePlatformBaseIE):
'ext': 'flv',
'description': 'md5:ac330c9258c04f9d7512cf26b9595409',
'title': 'Tesla Model S: A second step towards a cleaner motoring future',
+ 'timestamp': 1426176191,
+ 'upload_date': '20150312',
+ 'uploader': 'CBSI-NEW',
},
'params': {
# rtmp download
@@ -122,13 +133,14 @@ class ThePlatformIE(ThePlatformBaseIE):
'ext': 'mp4',
'description': 'md5:644ad9188d655b742f942bf2e06b002d',
'title': 'HIGHLIGHTS: USA bag first ever series Cup win',
+ 'uploader': 'EGSM',
}
}, {
'url': 'http://player.theplatform.com/p/NnzsPC/widget/select/media/4Y0TlYUr_ZT7',
'only_matching': True,
}, {
'url': 'http://player.theplatform.com/p/2E2eJC/nbcNewsOffsite?guid=tdy_or_siri_150701',
- 'md5': '734f3790fb5fc4903da391beeebc4836',
+ 'md5': 'fb96bb3d85118930a5b055783a3bd992',
'info_dict': {
'id': 'tdy_or_siri_150701',
'ext': 'mp4',
@@ -138,7 +150,7 @@ class ThePlatformIE(ThePlatformBaseIE):
'thumbnail': 're:^https?://.*\.jpg$',
'timestamp': 1435752600,
'upload_date': '20150701',
- 'categories': ['Today/Shows/Orange Room', 'Today/Sections/Money', 'Today/Topics/Tech', "Today/Topics/Editor's picks"],
+ 'uploader': 'NBCU-NEWS',
},
}, {
# From http://www.nbc.com/the-blacklist/video/sir-crispin-crandall/2928790?onid=137781#vc137781=1
@@ -147,6 +159,22 @@ class ThePlatformIE(ThePlatformBaseIE):
'only_matching': True,
}]
+ @classmethod
+ def _extract_urls(cls, webpage):
+ m = re.search(
+ r'''(?x)
+ <meta\s+
+ property=(["'])(?:og:video(?::(?:secure_)?url)?|twitter:player)\1\s+
+ content=(["'])(?P<url>https?://player\.theplatform\.com/p/.+?)\2
+ ''', webpage)
+ if m:
+ return [m.group('url')]
+
+ matches = re.findall(
+ r'<(?:iframe|script)[^>]+src=(["\'])((?:https?:)?//player\.theplatform\.com/p/.+?)\1', webpage)
+ if matches:
+ return list(zip(*matches))[1]
+
@staticmethod
def _sign_url(url, sig_key, sig_secret, life=600, include_qs=False):
flags = '10' if include_qs else '00'
@@ -155,11 +183,11 @@ class ThePlatformIE(ThePlatformBaseIE):
def str_to_hex(str):
return binascii.b2a_hex(str.encode('ascii')).decode('ascii')
- def hex_to_str(hex):
- return binascii.a2b_hex(hex)
+ def hex_to_bytes(hex):
+ return binascii.a2b_hex(hex.encode('ascii'))
- relative_path = url.split('http://link.theplatform.com/s/')[1].split('?')[0]
- clear_text = hex_to_str(flags + expiration_date + str_to_hex(relative_path))
+ relative_path = re.match(r'https?://link.theplatform.com/s/([^?]+)', url).group(1)
+ clear_text = hex_to_bytes(flags + expiration_date + str_to_hex(relative_path))
checksum = hmac.new(sig_key.encode('ascii'), clear_text, hashlib.sha1).hexdigest()
sig = flags + expiration_date + checksum + str_to_hex(sig_secret)
return '%s&sig=%s' % (url, sig)
@@ -174,10 +202,10 @@ class ThePlatformIE(ThePlatformBaseIE):
if not provider_id:
provider_id = 'dJ5BDC'
- path = provider_id
+ path = provider_id + '/'
if mobj.group('media'):
- path += '/media'
- path += '/' + video_id
+ path += mobj.group('media')
+ path += video_id
qs_dict = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
if 'guid' in qs_dict:
@@ -216,7 +244,7 @@ class ThePlatformIE(ThePlatformBaseIE):
webpage, 'smil url', group='url')
path = self._search_regex(
r'link\.theplatform\.com/s/((?:[^/?#&]+/)+[^/?#&]+)', smil_url, 'path')
- smil_url += '?' if '?' not in smil_url else '&' + 'formats=m3u,mpeg4&format=SMIL'
+ smil_url += '?' if '?' not in smil_url else '&' + 'formats=m3u,mpeg4'
elif mobj.group('config'):
config_url = url + '&form=json'
config_url = config_url.replace('swf/', 'config/')
@@ -226,15 +254,16 @@ class ThePlatformIE(ThePlatformBaseIE):
release_url = config['releaseUrl']
else:
release_url = 'http://link.theplatform.com/s/%s?mbr=true' % path
- smil_url = release_url + '&format=SMIL&formats=MPEG4&manifest=f4m'
+ smil_url = release_url + '&formats=MPEG4&manifest=f4m'
else:
- smil_url = 'http://link.theplatform.com/s/%s/meta.smil?format=smil&mbr=true' % path
+ smil_url = 'http://link.theplatform.com/s/%s?mbr=true' % path
sig = smuggled_data.get('sig')
if sig:
smil_url = self._sign_url(smil_url, sig['key'], sig['secret'])
formats, subtitles = self._extract_theplatform_smil(smil_url, video_id)
+ self._sort_formats(formats)
ret = self.get_metadata(path, video_id)
combined_subtitles = self._merge_subtitles(ret.get('subtitles', {}), subtitles)
@@ -248,12 +277,12 @@ class ThePlatformIE(ThePlatformBaseIE):
class ThePlatformFeedIE(ThePlatformBaseIE):
- _URL_TEMPLATE = '%s//feed.theplatform.com/f/%s/%s?form=json&byGuid=%s'
- _VALID_URL = r'https?://feed\.theplatform\.com/f/(?P<provider_id>[^/]+)/(?P<feed_id>[^?/]+)\?(?:[^&]+&)*byGuid=(?P<id>[a-zA-Z0-9_]+)'
- _TEST = {
+ _URL_TEMPLATE = '%s//feed.theplatform.com/f/%s/%s?form=json&%s'
+ _VALID_URL = r'https?://feed\.theplatform\.com/f/(?P<provider_id>[^/]+)/(?P<feed_id>[^?/]+)\?(?:[^&]+&)*(?P<filter>by(?:Gui|I)d=(?P<id>[\w-]+))'
+ _TESTS = [{
# From http://player.theplatform.com/p/7wvmTC/MSNBCEmbeddedOffSite?guid=n_hardball_5biden_140207
'url': 'http://feed.theplatform.com/f/7wvmTC/msnbc_video-p-test?form=json&pretty=true&range=-40&byGuid=n_hardball_5biden_140207',
- 'md5': '22d2b84f058d3586efcd99e57d59d314',
+ 'md5': '6e32495b5073ab414471b615c5ded394',
'info_dict': {
'id': 'n_hardball_5biden_140207',
'ext': 'mp4',
@@ -264,33 +293,40 @@ class ThePlatformFeedIE(ThePlatformBaseIE):
'timestamp': 1391824260,
'duration': 467.0,
'categories': ['MSNBC/Issues/Democrats', 'MSNBC/Issues/Elections/Election 2016'],
+ 'uploader': 'NBCU-NEWS',
},
- }
-
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
-
- video_id = mobj.group('id')
- provider_id = mobj.group('provider_id')
- feed_id = mobj.group('feed_id')
+ }]
- real_url = self._URL_TEMPLATE % (self.http_scheme(), provider_id, feed_id, video_id)
- feed = self._download_json(real_url, video_id)
- entry = feed['entries'][0]
+ def _extract_feed_info(self, provider_id, feed_id, filter_query, video_id, custom_fields=None, asset_types_query={}):
+ real_url = self._URL_TEMPLATE % (self.http_scheme(), provider_id, feed_id, filter_query)
+ entry = self._download_json(real_url, video_id)['entries'][0]
formats = []
subtitles = {}
first_video_id = None
duration = None
+ asset_types = []
for item in entry['media$content']:
- smil_url = item['plfile$url'] + '&format=SMIL&mbr=true'
+ smil_url = item['plfile$url']
cur_video_id = ThePlatformIE._match_id(smil_url)
if first_video_id is None:
first_video_id = cur_video_id
duration = float_or_none(item.get('plfile$duration'))
- cur_formats, cur_subtitles = self._extract_theplatform_smil(smil_url, video_id, 'Downloading SMIL data for %s' % cur_video_id)
- formats.extend(cur_formats)
- subtitles = self._merge_subtitles(subtitles, cur_subtitles)
+ for asset_type in item['plfile$assetTypes']:
+ if asset_type in asset_types:
+ continue
+ asset_types.append(asset_type)
+ query = {
+ 'mbr': 'true',
+ 'formats': item['plfile$format'],
+ 'assetTypes': asset_type,
+ }
+ if asset_type in asset_types_query:
+ query.update(asset_types_query[asset_type])
+ cur_formats, cur_subtitles = self._extract_theplatform_smil(update_url_query(
+ smil_url, query), video_id, 'Downloading SMIL data for %s' % asset_type)
+ formats.extend(cur_formats)
+ subtitles = self._merge_subtitles(subtitles, cur_subtitles)
self._sort_formats(formats)
@@ -314,5 +350,17 @@ class ThePlatformFeedIE(ThePlatformBaseIE):
'timestamp': timestamp,
'categories': categories,
})
+ if custom_fields:
+ ret.update(custom_fields(entry))
return ret
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+
+ video_id = mobj.group('id')
+ provider_id = mobj.group('provider_id')
+ feed_id = mobj.group('feed_id')
+ filter_query = mobj.group('filter')
+
+ return self._extract_feed_info(provider_id, feed_id, filter_query, video_id)
diff --git a/youtube_dl/extractor/thescene.py b/youtube_dl/extractor/thescene.py
new file mode 100644
index 0000000..3e4e140
--- /dev/null
+++ b/youtube_dl/extractor/thescene.py
@@ -0,0 +1,52 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+from ..compat import compat_urlparse
+from ..utils import qualities
+
+
+class TheSceneIE(InfoExtractor):
+ _VALID_URL = r'https://thescene\.com/watch/[^/]+/(?P<id>[^/#?]+)'
+
+ _TEST = {
+ 'url': 'https://thescene.com/watch/vogue/narciso-rodriguez-spring-2013-ready-to-wear',
+ 'info_dict': {
+ 'id': '520e8faac2b4c00e3c6e5f43',
+ 'ext': 'mp4',
+ 'title': 'Narciso Rodriguez: Spring 2013 Ready-to-Wear',
+ 'display_id': 'narciso-rodriguez-spring-2013-ready-to-wear',
+ },
+ }
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, display_id)
+
+ player_url = compat_urlparse.urljoin(
+ url,
+ self._html_search_regex(
+ r'id=\'js-player-script\'[^>]+src=\'(.+?)\'', webpage, 'player url'))
+
+ player = self._download_webpage(player_url, display_id)
+ info = self._parse_json(
+ self._search_regex(
+ r'(?m)var\s+video\s+=\s+({.+?});$', player, 'info json'),
+ display_id)
+
+ qualities_order = qualities(('low', 'high'))
+ formats = [{
+ 'format_id': '{0}-{1}'.format(f['type'].split('/')[0], f['quality']),
+ 'url': f['src'],
+ 'quality': qualities_order(f['quality']),
+ } for f in info['sources'][0]]
+ self._sort_formats(formats)
+
+ return {
+ 'id': info['id'],
+ 'display_id': display_id,
+ 'title': info['title'],
+ 'formats': formats,
+ 'thumbnail': info.get('poster_frame'),
+ }
diff --git a/youtube_dl/extractor/thesixtyone.py b/youtube_dl/extractor/thesixtyone.py
index d8b1fd2..d63aef5 100644
--- a/youtube_dl/extractor/thesixtyone.py
+++ b/youtube_dl/extractor/thesixtyone.py
@@ -12,7 +12,7 @@ class TheSixtyOneIE(InfoExtractor):
s|
song/comments/list|
song
- )/(?P<id>[A-Za-z0-9]+)/?$'''
+ )/(?:[^/]+/)?(?P<id>[A-Za-z0-9]+)/?$'''
_SONG_URL_TEMPLATE = 'http://thesixtyone.com/s/{0:}'
_SONG_FILE_URL_TEMPLATE = 'http://{audio_server:}/thesixtyone_production/audio/{0:}_stream'
_THUMBNAIL_URL_TEMPLATE = '{photo_base_url:}_desktop'
@@ -45,6 +45,10 @@ class TheSixtyOneIE(InfoExtractor):
'url': 'http://www.thesixtyone.com/song/SrE3zD7s1jt/',
'only_matching': True,
},
+ {
+ 'url': 'http://www.thesixtyone.com/maryatmidnight/song/StrawberriesandCream/yvWtLp0c4GQ/',
+ 'only_matching': True,
+ },
]
_DECODE_MAP = {
diff --git a/youtube_dl/extractor/thestar.py b/youtube_dl/extractor/thestar.py
new file mode 100644
index 0000000..ba1380a
--- /dev/null
+++ b/youtube_dl/extractor/thestar.py
@@ -0,0 +1,35 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from .brightcove import BrightcoveLegacyIE
+from ..compat import compat_parse_qs
+
+
+class TheStarIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?thestar\.com/(?:[^/]+/)*(?P<id>.+)\.html'
+ _TEST = {
+ 'url': 'http://www.thestar.com/life/2016/02/01/mankind-why-this-woman-started-a-men-s-skincare-line.html',
+ 'md5': '2c62dd4db2027e35579fefb97a8b6554',
+ 'info_dict': {
+ 'id': '4732393888001',
+ 'ext': 'mp4',
+ 'title': 'Mankind: Why this woman started a men\'s skin care line',
+ 'description': 'Robert Cribb talks to Young Lee, the founder of Uncle Peter\'s MAN.',
+ 'uploader_id': '794267642001',
+ 'timestamp': 1454353482,
+ 'upload_date': '20160201',
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ }
+ }
+ BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/794267642001/default_default/index.html?videoId=%s'
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
+ brightcove_legacy_url = BrightcoveLegacyIE._extract_brightcove_url(webpage)
+ brightcove_id = compat_parse_qs(brightcove_legacy_url)['@videoPlayer'][0]
+ return self.url_result(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', brightcove_id)
diff --git a/youtube_dl/extractor/threeqsdn.py b/youtube_dl/extractor/threeqsdn.py
new file mode 100644
index 0000000..c77a079
--- /dev/null
+++ b/youtube_dl/extractor/threeqsdn.py
@@ -0,0 +1,139 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ determine_ext,
+ js_to_json,
+ mimetype2ext,
+)
+
+
+class ThreeQSDNIE(InfoExtractor):
+ IE_NAME = '3qsdn'
+ IE_DESC = '3Q SDN'
+ _VALID_URL = r'https?://playout\.3qsdn\.com/(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'
+ _TESTS = [{
+ # ondemand from http://www.philharmonie.tv/veranstaltung/26/
+ 'url': 'http://playout.3qsdn.com/0280d6b9-1215-11e6-b427-0cc47a188158?protocol=http',
+ 'md5': 'ab040e37bcfa2e0c079f92cb1dd7f6cd',
+ 'info_dict': {
+ 'id': '0280d6b9-1215-11e6-b427-0cc47a188158',
+ 'ext': 'mp4',
+ 'title': '0280d6b9-1215-11e6-b427-0cc47a188158',
+ 'is_live': False,
+ },
+ 'expected_warnings': ['Failed to download MPD manifest'],
+ }, {
+ # live video stream
+ 'url': 'https://playout.3qsdn.com/d755d94b-4ab9-11e3-9162-0025907ad44f?js=true',
+ 'info_dict': {
+ 'id': 'd755d94b-4ab9-11e3-9162-0025907ad44f',
+ 'ext': 'mp4',
+ 'title': 'd755d94b-4ab9-11e3-9162-0025907ad44f',
+ 'is_live': False,
+ },
+ }, {
+ # live audio stream
+ 'url': 'http://playout.3qsdn.com/9edf36e0-6bf2-11e2-a16a-9acf09e2db48',
+ 'only_matching': True,
+ }, {
+ # live audio stream with some 404 URLs
+ 'url': 'http://playout.3qsdn.com/ac5c3186-777a-11e2-9c30-9acf09e2db48',
+ 'only_matching': True,
+ }, {
+ # geo restricted with 'This content is not available in your country'
+ 'url': 'http://playout.3qsdn.com/d63a3ffe-75e8-11e2-9c30-9acf09e2db48',
+ 'only_matching': True,
+ }, {
+ # geo restricted with 'playout.3qsdn.com/forbidden'
+ 'url': 'http://playout.3qsdn.com/8e330f26-6ae2-11e2-a16a-9acf09e2db48',
+ 'only_matching': True,
+ }, {
+ # live video with rtmp link
+ 'url': 'https://playout.3qsdn.com/6092bb9e-8f72-11e4-a173-002590c750be',
+ 'only_matching': True,
+ }]
+
+ @staticmethod
+ def _extract_url(webpage):
+ mobj = re.search(
+ r'<iframe[^>]+\b(?:data-)?src=(["\'])(?P<url>%s.*?)\1' % ThreeQSDNIE._VALID_URL, webpage)
+ if mobj:
+ return mobj.group('url')
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ js = self._download_webpage(
+ 'http://playout.3qsdn.com/%s' % video_id, video_id,
+ query={'js': 'true'})
+
+ if any(p in js for p in (
+ '>This content is not available in your country',
+ 'playout.3qsdn.com/forbidden')):
+ self.raise_geo_restricted()
+
+ stream_content = self._search_regex(
+ r'streamContent\s*:\s*(["\'])(?P<content>.+?)\1', js,
+ 'stream content', default='demand', group='content')
+
+ live = stream_content == 'live'
+
+ stream_type = self._search_regex(
+ r'streamType\s*:\s*(["\'])(?P<type>audio|video)\1', js,
+ 'stream type', default='video', group='type')
+
+ formats = []
+ urls = set()
+
+ def extract_formats(item_url, item={}):
+ if not item_url or item_url in urls:
+ return
+ urls.add(item_url)
+ type_ = item.get('type')
+ ext = determine_ext(item_url, default_ext=None)
+ if type_ == 'application/dash+xml' or ext == 'mpd':
+ formats.extend(self._extract_mpd_formats(
+ item_url, video_id, mpd_id='mpd', fatal=False))
+ elif type_ in ('application/vnd.apple.mpegURL', 'application/x-mpegurl') or ext == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ item_url, video_id, 'mp4',
+ entry_protocol='m3u8' if live else 'm3u8_native',
+ m3u8_id='hls', fatal=False))
+ elif ext == 'f4m':
+ formats.extend(self._extract_f4m_formats(
+ item_url, video_id, f4m_id='hds', fatal=False))
+ else:
+ if not self._is_valid_url(item_url, video_id):
+ return
+ formats.append({
+ 'url': item_url,
+ 'format_id': item.get('quality'),
+ 'ext': 'mp4' if item_url.startswith('rtsp') else mimetype2ext(type_) or ext,
+ 'vcodec': 'none' if stream_type == 'audio' else None,
+ })
+
+ for item_js in re.findall(r'({.*?\b(?:src|source)\s*:\s*["\'].+?})', js):
+ f = self._parse_json(
+ item_js, video_id, transform_source=js_to_json, fatal=False)
+ if not f:
+ continue
+ extract_formats(f.get('src'), f)
+
+ # More relaxed version to collect additional URLs and acting
+ # as a future-proof fallback
+ for _, src in re.findall(r'\b(?:src|source)\s*:\s*(["\'])((?:https?|rtsp)://.+?)\1', js):
+ extract_formats(src)
+
+ self._sort_formats(formats)
+
+ title = self._live_title(video_id) if live else video_id
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'is_live': live,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/thvideo.py b/youtube_dl/extractor/thvideo.py
index 496f15d..406f4a8 100644
--- a/youtube_dl/extractor/thvideo.py
+++ b/youtube_dl/extractor/thvideo.py
@@ -10,7 +10,7 @@ from ..utils import (
class THVideoIE(InfoExtractor):
- _VALID_URL = r'http://(?:www\.)?thvideo\.tv/(?:v/th|mobile\.php\?cid=)(?P<id>[0-9]+)'
+ _VALID_URL = r'https?://(?:www\.)?thvideo\.tv/(?:v/th|mobile\.php\?cid=)(?P<id>[0-9]+)'
_TEST = {
'url': 'http://thvideo.tv/v/th1987/',
'md5': 'fa107b1f73817e325e9433505a70db50',
diff --git a/youtube_dl/extractor/tinypic.py b/youtube_dl/extractor/tinypic.py
index e036b8c..c43cace 100644
--- a/youtube_dl/extractor/tinypic.py
+++ b/youtube_dl/extractor/tinypic.py
@@ -9,7 +9,7 @@ from ..utils import ExtractorError
class TinyPicIE(InfoExtractor):
IE_NAME = 'tinypic'
IE_DESC = 'tinypic.com videos'
- _VALID_URL = r'http://(?:.+?\.)?tinypic\.com/player\.php\?v=(?P<id>[^&]+)&s=\d+'
+ _VALID_URL = r'https?://(?:.+?\.)?tinypic\.com/player\.php\?v=(?P<id>[^&]+)&s=\d+'
_TESTS = [
{
diff --git a/youtube_dl/extractor/tlc.py b/youtube_dl/extractor/tlc.py
index adc05ed..abad3ff 100644
--- a/youtube_dl/extractor/tlc.py
+++ b/youtube_dl/extractor/tlc.py
@@ -4,12 +4,12 @@ import re
from .common import InfoExtractor
from .brightcove import BrightcoveLegacyIE
-from ..compat import compat_urlparse
+from ..compat import compat_parse_qs
class TlcDeIE(InfoExtractor):
IE_NAME = 'tlc.de'
- _VALID_URL = r'http://www\.tlc\.de/sendungen/[^/]+/videos/(?P<title>[^/?]+)'
+ _VALID_URL = r'https?://www\.tlc\.de/(?:[^/]+/)*videos/(?P<title>[^/?#]+)?(?:.*#(?P<id>\d+))?'
_TEST = {
'url': 'http://www.tlc.de/sendungen/breaking-amish/videos/#3235167922001',
@@ -17,32 +17,23 @@ class TlcDeIE(InfoExtractor):
'id': '3235167922001',
'ext': 'mp4',
'title': 'Breaking Amish: Die Welt da draußen',
- 'uploader': 'Discovery Networks - Germany',
'description': (
'Vier Amische und eine Mennonitin wagen in New York'
' den Sprung in ein komplett anderes Leben. Begleitet sie auf'
' ihrem spannenden Weg.'),
+ 'timestamp': 1396598084,
+ 'upload_date': '20140404',
+ 'uploader_id': '1659832546',
},
}
+ BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1659832546/default_default/index.html?videoId=%s'
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- title = mobj.group('title')
- webpage = self._download_webpage(url, title)
- iframe_url = self._search_regex(
- '<iframe src="(http://www\.tlc\.de/wp-content/.+?)"', webpage,
- 'iframe url')
- # Otherwise we don't get the correct 'BrightcoveExperience' element,
- # example: http://www.tlc.de/sendungen/cake-boss/videos/cake-boss-cannoli-drama/
- iframe_url = iframe_url.replace('.htm?', '.php?')
- url_fragment = compat_urlparse.urlparse(url).fragment
- if url_fragment:
- # Since the fragment is not send to the server, we always get the same iframe
- iframe_url = re.sub(r'playlist=(\d+)', 'playlist=%s' % url_fragment, iframe_url)
- iframe = self._download_webpage(iframe_url, title)
-
- return {
- '_type': 'url',
- 'url': BrightcoveLegacyIE._extract_brightcove_url(iframe),
- 'ie': BrightcoveLegacyIE.ie_key(),
- }
+ brightcove_id = mobj.group('id')
+ if not brightcove_id:
+ title = mobj.group('title')
+ webpage = self._download_webpage(url, title)
+ brightcove_legacy_url = BrightcoveLegacyIE._extract_brightcove_url(webpage)
+ brightcove_id = compat_parse_qs(brightcove_legacy_url)['@videoPlayer'][0]
+ return self.url_result(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', brightcove_id)
diff --git a/youtube_dl/extractor/tnaflix.py b/youtube_dl/extractor/tnaflix.py
index 49516ab..7817417 100644
--- a/youtube_dl/extractor/tnaflix.py
+++ b/youtube_dl/extractor/tnaflix.py
@@ -71,12 +71,16 @@ class TNAFlixNetworkBaseIE(InfoExtractor):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
- display_id = mobj.group('display_id')
+ display_id = mobj.group('display_id') if 'display_id' in mobj.groupdict() else video_id
webpage = self._download_webpage(url, display_id)
cfg_url = self._proto_relative_url(self._html_search_regex(
- self._CONFIG_REGEX, webpage, 'flashvars.config'), 'http:')
+ self._CONFIG_REGEX, webpage, 'flashvars.config', default=None), 'http:')
+
+ if not cfg_url:
+ inputs = self._hidden_inputs(webpage)
+ cfg_url = 'https://cdn-fck.tnaflix.com/tnaflix/%s.fid?key=%s' % (inputs['vkey'], inputs['nkey'])
cfg_xml = self._download_xml(
cfg_url, display_id, 'Downloading metadata',
@@ -117,7 +121,7 @@ class TNAFlixNetworkBaseIE(InfoExtractor):
title = self._html_search_regex(
self._TITLE_REGEX, webpage, 'title') if self._TITLE_REGEX else self._og_search_title(webpage)
- age_limit = self._rta_search(webpage)
+ age_limit = self._rta_search(webpage) or 18
duration = parse_duration(self._html_search_meta(
'duration', webpage, 'duration', default=None))
@@ -132,7 +136,7 @@ class TNAFlixNetworkBaseIE(InfoExtractor):
average_rating = float_or_none(extract_field(self._AVERAGE_RATING_REGEX, 'average rating'))
categories_str = extract_field(self._CATEGORIES_REGEX, 'categories')
- categories = categories_str.split(', ') if categories_str is not None else []
+ categories = [c.strip() for c in categories_str.split(',')] if categories_str is not None else []
return {
'id': video_id,
@@ -152,17 +156,48 @@ class TNAFlixNetworkBaseIE(InfoExtractor):
}
+class TNAFlixNetworkEmbedIE(TNAFlixNetworkBaseIE):
+ _VALID_URL = r'https?://player\.(?:tna|emp)flix\.com/video/(?P<id>\d+)'
+
+ _TITLE_REGEX = r'<title>([^<]+)</title>'
+
+ _TESTS = [{
+ 'url': 'https://player.tnaflix.com/video/6538',
+ 'info_dict': {
+ 'id': '6538',
+ 'display_id': '6538',
+ 'ext': 'mp4',
+ 'title': 'Educational xxx video',
+ 'thumbnail': 're:https?://.*\.jpg$',
+ 'age_limit': 18,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'https://player.empflix.com/video/33051',
+ 'only_matching': True,
+ }]
+
+ @staticmethod
+ def _extract_urls(webpage):
+ return [url for _, url in re.findall(
+ r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.(?:tna|emp)flix\.com/video/\d+)\1',
+ webpage)]
+
+
class TNAFlixIE(TNAFlixNetworkBaseIE):
_VALID_URL = r'https?://(?:www\.)?tnaflix\.com/[^/]+/(?P<display_id>[^/]+)/video(?P<id>\d+)'
_TITLE_REGEX = r'<title>(.+?) - TNAFlix Porn Videos</title>'
- _DESCRIPTION_REGEX = r'<h3 itemprop="description">([^<]+)</h3>'
- _UPLOADER_REGEX = r'(?s)<span[^>]+class="infoTitle"[^>]*>Uploaded By:</span>(.+?)<div'
+ _DESCRIPTION_REGEX = r'<meta[^>]+name="description"[^>]+content="([^"]+)"'
+ _UPLOADER_REGEX = r'<i>\s*Verified Member\s*</i>\s*<h1>(.+?)</h1>'
+ _CATEGORIES_REGEX = r'(?s)<span[^>]*>Categories:</span>(.+?)</div>'
_TESTS = [{
# anonymous uploader, no categories
'url': 'http://www.tnaflix.com/porn-stars/Carmella-Decesare-striptease/video553878',
- 'md5': 'ecf3498417d09216374fc5907f9c6ec0',
+ 'md5': '7e569419fe6d69543d01e6be22f5f7c4',
'info_dict': {
'id': '553878',
'display_id': 'Carmella-Decesare-striptease',
@@ -171,17 +206,16 @@ class TNAFlixIE(TNAFlixNetworkBaseIE):
'thumbnail': 're:https?://.*\.jpg$',
'duration': 91,
'age_limit': 18,
- 'uploader': 'Anonymous',
- 'categories': [],
+ 'categories': ['Porn Stars'],
}
}, {
# non-anonymous uploader, categories
'url': 'https://www.tnaflix.com/teen-porn/Educational-xxx-video/video6538',
- 'md5': '0f5d4d490dbfd117b8607054248a07c0',
+ 'md5': 'fcba2636572895aba116171a899a5658',
'info_dict': {
'id': '6538',
'display_id': 'Educational-xxx-video',
- 'ext': 'mp4',
+ 'ext': 'flv',
'title': 'Educational xxx video',
'description': 'md5:b4fab8f88a8621c8fabd361a173fe5b8',
'thumbnail': 're:https?://.*\.jpg$',
diff --git a/youtube_dl/extractor/toypics.py b/youtube_dl/extractor/toypics.py
index 2756f56..2579ba8 100644
--- a/youtube_dl/extractor/toypics.py
+++ b/youtube_dl/extractor/toypics.py
@@ -41,7 +41,7 @@ class ToypicsIE(InfoExtractor):
class ToypicsUserIE(InfoExtractor):
IE_DESC = 'Toypics user profile'
- _VALID_URL = r'http://videos\.toypics\.net/(?P<username>[^/?]+)(?:$|[?#])'
+ _VALID_URL = r'https?://videos\.toypics\.net/(?P<username>[^/?]+)(?:$|[?#])'
_TEST = {
'url': 'http://videos.toypics.net/Mikey',
'info_dict': {
diff --git a/youtube_dl/extractor/traileraddict.py b/youtube_dl/extractor/traileraddict.py
index 0e01b15..747370d 100644
--- a/youtube_dl/extractor/traileraddict.py
+++ b/youtube_dl/extractor/traileraddict.py
@@ -7,7 +7,7 @@ from .common import InfoExtractor
class TrailerAddictIE(InfoExtractor):
_WORKING = False
- _VALID_URL = r'(?:http://)?(?:www\.)?traileraddict\.com/(?:trailer|clip)/(?P<movie>.+?)/(?P<trailer_name>.+)'
+ _VALID_URL = r'(?:https?://)?(?:www\.)?traileraddict\.com/(?:trailer|clip)/(?P<movie>.+?)/(?P<trailer_name>.+)'
_TEST = {
'url': 'http://www.traileraddict.com/trailer/prince-avalanche/trailer',
'md5': '41365557f3c8c397d091da510e73ceb4',
diff --git a/youtube_dl/extractor/trollvids.py b/youtube_dl/extractor/trollvids.py
index d239949..6577056 100644
--- a/youtube_dl/extractor/trollvids.py
+++ b/youtube_dl/extractor/trollvids.py
@@ -7,7 +7,7 @@ from .nuevo import NuevoBaseIE
class TrollvidsIE(NuevoBaseIE):
- _VALID_URL = r'http://(?:www\.)?trollvids\.com/video/(?P<id>\d+)/(?P<display_id>[^/?#&]+)'
+ _VALID_URL = r'https?://(?:www\.)?trollvids\.com/video/(?P<id>\d+)/(?P<display_id>[^/?#&]+)'
IE_NAME = 'trollvids'
_TEST = {
'url': 'http://trollvids.com/video/2349002/%E3%80%90MMD-R-18%E3%80%91%E3%82%AC%E3%83%BC%E3%83%AB%E3%83%95%E3%83%AC%E3%83%B3%E3%83%89-carrymeoff',
diff --git a/youtube_dl/extractor/tubitv.py b/youtube_dl/extractor/tubitv.py
index 6d78b5d..c6572de 100644
--- a/youtube_dl/extractor/tubitv.py
+++ b/youtube_dl/extractor/tubitv.py
@@ -1,31 +1,32 @@
# coding: utf-8
from __future__ import unicode_literals
-import codecs
import re
from .common import InfoExtractor
-from ..compat import compat_urllib_parse
from ..utils import (
ExtractorError,
int_or_none,
sanitized_Request,
+ urlencode_postdata,
+ parse_iso8601,
)
class TubiTvIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?tubitv\.com/video\?id=(?P<id>[0-9]+)'
+ _VALID_URL = r'https?://(?:www\.)?tubitv\.com/video/(?P<id>[0-9]+)'
_LOGIN_URL = 'http://tubitv.com/login'
_NETRC_MACHINE = 'tubitv'
_TEST = {
- 'url': 'http://tubitv.com/video?id=54411&title=The_Kitchen_Musical_-_EP01',
+ 'url': 'http://tubitv.com/video/283829/the_comedian_at_the_friday',
'info_dict': {
- 'id': '54411',
+ 'id': '283829',
'ext': 'mp4',
- 'title': 'The Kitchen Musical - EP01',
- 'thumbnail': 're:^https?://.*\.png$',
- 'description': 'md5:37532716166069b353e8866e71fefae7',
- 'duration': 2407,
+ 'title': 'The Comedian at The Friday',
+ 'description': 'A stand up comedian is forced to look at the decisions in his life while on a one week trip to the west coast.',
+ 'uploader': 'Indie Rights Films',
+ 'upload_date': '20160111',
+ 'timestamp': 1452555979,
},
'params': {
'skip_download': 'HLS download',
@@ -41,7 +42,7 @@ class TubiTvIE(InfoExtractor):
'username': username,
'password': password,
}
- payload = compat_urllib_parse.urlencode(form_data).encode('utf-8')
+ payload = urlencode_postdata(form_data)
request = sanitized_Request(self._LOGIN_URL, payload)
request.add_header('Content-Type', 'application/x-www-form-urlencoded')
login_page = self._download_webpage(
@@ -55,26 +56,31 @@ class TubiTvIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
+ video_data = self._download_json(
+ 'http://tubitv.com/oz/videos/%s/content' % video_id, video_id)
+ title = video_data['n']
- webpage = self._download_webpage(url, video_id)
- if re.search(r"<(?:DIV|div) class='login-required-screen'>", webpage):
- self.raise_login_required('This video requires login')
+ formats = self._extract_m3u8_formats(
+ video_data['mh'], video_id, 'mp4', 'm3u8_native')
+ self._sort_formats(formats)
- title = self._og_search_title(webpage)
- description = self._og_search_description(webpage)
- thumbnail = self._og_search_thumbnail(webpage)
- duration = int_or_none(self._html_search_meta(
- 'video:duration', webpage, 'duration'))
-
- apu = self._search_regex(r"apu='([^']+)'", webpage, 'apu')
- m3u8_url = codecs.decode(apu, 'rot_13')[::-1]
- formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4')
+ subtitles = {}
+ for sub in video_data.get('sb', []):
+ sub_url = sub.get('u')
+ if not sub_url:
+ continue
+ subtitles.setdefault(sub.get('l', 'en'), []).append({
+ 'url': sub_url,
+ })
return {
'id': video_id,
'title': title,
'formats': formats,
- 'thumbnail': thumbnail,
- 'description': description,
- 'duration': duration,
+ 'subtitles': subtitles,
+ 'thumbnail': video_data.get('ph'),
+ 'description': video_data.get('d'),
+ 'duration': int_or_none(video_data.get('s')),
+ 'timestamp': parse_iso8601(video_data.get('u')),
+ 'uploader': video_data.get('on'),
}
diff --git a/youtube_dl/extractor/tudou.py b/youtube_dl/extractor/tudou.py
index f56b66d..bb8b8e2 100644
--- a/youtube_dl/extractor/tudou.py
+++ b/youtube_dl/extractor/tudou.py
@@ -5,7 +5,9 @@ from __future__ import unicode_literals
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
+ ExtractorError,
int_or_none,
+ InAdvancePagedList,
float_or_none,
unescapeHTML,
)
@@ -45,11 +47,27 @@ class TudouIE(InfoExtractor):
_PLAYER_URL = 'http://js.tudouui.com/bin/lingtong/PortalPlayer_177.swf'
+ # Translated from tudou/tools/TVCHelper.as in PortalPlayer_193.swf
+ # 0001, 0002 and 4001 are not included as they indicate temporary issues
+ TVC_ERRORS = {
+ '0003': 'The video is deleted or does not exist',
+ '1001': 'This video is unavailable due to licensing issues',
+ '1002': 'This video is unavailable as it\'s under review',
+ '1003': 'This video is unavailable as it\'s under review',
+ '3001': 'Password required',
+ '5001': 'This video is available in Mainland China only due to licensing issues',
+ '7001': 'This video is unavailable',
+ '8001': 'This video is unavailable due to licensing issues',
+ }
+
def _url_for_id(self, video_id, quality=None):
info_url = 'http://v2.tudou.com/f?id=' + compat_str(video_id)
if quality:
info_url += '&hd' + quality
xml_data = self._download_xml(info_url, video_id, 'Opening the info XML page')
+ error = xml_data.attrib.get('error')
+ if error is not None:
+ raise ExtractorError('Tudou said: %s' % error, expected=True)
final_url = xml_data.text
return final_url
@@ -62,6 +80,15 @@ class TudouIE(InfoExtractor):
if youku_vcode:
return self.url_result('youku:' + youku_vcode, ie='Youku')
+ if not item_data.get('itemSegs'):
+ tvc_code = item_data.get('tvcCode')
+ if tvc_code:
+ err_msg = self.TVC_ERRORS.get(tvc_code)
+ if err_msg:
+ raise ExtractorError('Tudou said: %s' % err_msg, expected=True)
+ raise ExtractorError('Unexpected error %s returned from Tudou' % tvc_code)
+ raise ExtractorError('Unxpected error returned from Tudou')
+
title = unescapeHTML(item_data['kw'])
description = item_data.get('desc')
thumbnail_url = item_data.get('pic')
@@ -75,15 +102,16 @@ class TudouIE(InfoExtractor):
quality = sorted(filter(lambda k: k.isdigit(), segments.keys()),
key=lambda k: int(k))[-1]
parts = segments[quality]
- result = []
len_parts = len(parts)
if len_parts > 1:
self.to_screen('%s: found %s parts' % (video_id, len_parts))
- for part in parts:
+
+ def part_func(partnum):
+ part = parts[partnum]
part_id = part['k']
final_url = self._url_for_id(part_id, quality)
ext = (final_url.split('?')[0]).split('.')[-1]
- part_info = {
+ return [{
'id': '%s' % part_id,
'url': final_url,
'ext': ext,
@@ -97,12 +125,13 @@ class TudouIE(InfoExtractor):
'http_headers': {
'Referer': self._PLAYER_URL,
},
- }
- result.append(part_info)
+ }]
+
+ entries = InAdvancePagedList(part_func, len_parts, 1)
return {
'_type': 'multi_video',
- 'entries': result,
+ 'entries': entries,
'id': video_id,
'title': title,
}
diff --git a/youtube_dl/extractor/tumblr.py b/youtube_dl/extractor/tumblr.py
index 4f84470..4d8b571 100644
--- a/youtube_dl/extractor/tumblr.py
+++ b/youtube_dl/extractor/tumblr.py
@@ -8,7 +8,7 @@ from ..utils import int_or_none
class TumblrIE(InfoExtractor):
- _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/(?:post|video)/(?P<id>[0-9]+)(?:$|[/?#])'
+ _VALID_URL = r'https?://(?P<blog_name>[^/?#&]+)\.tumblr\.com/(?:post|video)/(?P<id>[0-9]+)(?:$|[/?#])'
_TESTS = [{
'url': 'http://tatianamaslanydaily.tumblr.com/post/54196191430/orphan-black-dvd-extra-behind-the-scenes',
'md5': '479bb068e5b16462f5176a6828829767',
@@ -67,6 +67,34 @@ class TumblrIE(InfoExtractor):
'uploader_id': 'user32021558',
},
'add_ie': ['Vimeo'],
+ }, {
+ 'url': 'http://sutiblr.tumblr.com/post/139638707273',
+ 'md5': '2dd184b3669e049ba40563a7d423f95c',
+ 'info_dict': {
+ 'id': 'ir7qBEIKqvq',
+ 'ext': 'mp4',
+ 'title': 'Vine by sutiblr',
+ 'alt_title': 'Vine by sutiblr',
+ 'uploader': 'sutiblr',
+ 'uploader_id': '1198993975374495744',
+ 'upload_date': '20160220',
+ 'like_count': int,
+ 'comment_count': int,
+ 'repost_count': int,
+ },
+ 'add_ie': ['Vine'],
+ }, {
+ 'url': 'http://vitasidorkina.tumblr.com/post/134652425014/joskriver-victoriassecret-invisibility-or',
+ 'md5': '01c12ceb82cbf6b2fe0703aa56b3ad72',
+ 'info_dict': {
+ 'id': '-7LnUPGlSo',
+ 'ext': 'mp4',
+ 'title': 'Video by victoriassecret',
+ 'description': 'Invisibility or flight…which superpower would YOU choose? #VSFashionShow #ThisOrThat',
+ 'uploader_id': 'victoriassecret',
+ 'thumbnail': 're:^https?://.*\.jpg'
+ },
+ 'add_ie': ['Instagram'],
}]
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/tunein.py b/youtube_dl/extractor/tunein.py
index 8322cc1..ae4cfae 100644
--- a/youtube_dl/extractor/tunein.py
+++ b/youtube_dl/extractor/tunein.py
@@ -1,7 +1,7 @@
# coding: utf-8
from __future__ import unicode_literals
-import json
+import re
from .common import InfoExtractor
from ..utils import ExtractorError
@@ -27,10 +27,9 @@ class TuneInBaseIE(InfoExtractor):
if not streams_url.startswith('http://'):
streams_url = compat_urlparse.urljoin(url, streams_url)
- stream_data = self._download_webpage(
- streams_url, content_id, note='Downloading stream data')
- streams = json.loads(self._search_regex(
- r'\((.*)\);', stream_data, 'stream info'))['Streams']
+ streams = self._download_json(
+ streams_url, content_id, note='Downloading stream data',
+ transform_source=lambda s: re.sub(r'^\s*\((.*)\);\s*$', r'\1', s))['Streams']
is_live = None
formats = []
diff --git a/youtube_dl/extractor/tv2.py b/youtube_dl/extractor/tv2.py
index 1457e52..86bb791 100644
--- a/youtube_dl/extractor/tv2.py
+++ b/youtube_dl/extractor/tv2.py
@@ -14,7 +14,7 @@ from ..utils import (
class TV2IE(InfoExtractor):
- _VALID_URL = 'http://(?:www\.)?tv2\.no/v/(?P<id>\d+)'
+ _VALID_URL = r'https?://(?:www\.)?tv2\.no/v/(?P<id>\d+)'
_TEST = {
'url': 'http://www.tv2.no/v/916509/',
'info_dict': {
@@ -100,7 +100,7 @@ class TV2IE(InfoExtractor):
class TV2ArticleIE(InfoExtractor):
- _VALID_URL = 'http://(?:www\.)?tv2\.no/(?:a|\d{4}/\d{2}/\d{2}(/[^/]+)+)/(?P<id>\d+)'
+ _VALID_URL = r'https?://(?:www\.)?tv2\.no/(?:a|\d{4}/\d{2}/\d{2}(/[^/]+)+)/(?P<id>\d+)'
_TESTS = [{
'url': 'http://www.tv2.no/2015/05/16/nyheter/alesund/krim/pingvin/6930542',
'info_dict': {
diff --git a/youtube_dl/extractor/tv3.py b/youtube_dl/extractor/tv3.py
new file mode 100644
index 0000000..3867ec9
--- /dev/null
+++ b/youtube_dl/extractor/tv3.py
@@ -0,0 +1,34 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class TV3IE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?tv3\.co\.nz/(?P<id>[^/]+)/tabid/\d+/articleID/\d+/MCat/\d+/Default\.aspx'
+ _TEST = {
+ 'url': 'http://www.tv3.co.nz/MOTORSPORT-SRS-SsangYong-Hampton-Downs-Round-3/tabid/3692/articleID/121615/MCat/2915/Default.aspx',
+ 'info_dict': {
+ 'id': '4659127992001',
+ 'ext': 'mp4',
+ 'title': 'CRC Motorsport: SRS SsangYong Hampton Downs Round 3 - S2015 Ep3',
+ 'description': 'SsangYong Racing Series returns for Round 3 with drivers from New Zealand and Australia taking to the grid at Hampton Downs raceway.',
+ 'uploader_id': '3812193411001',
+ 'upload_date': '20151213',
+ 'timestamp': 1449975272,
+ },
+ 'expected_warnings': [
+ 'Failed to download MPD manifest'
+ ],
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }
+ BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/3812193411001/default_default/index.html?videoId=%s'
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
+ brightcove_id = self._search_regex(r'<param\s*name="@videoPlayer"\s*value="(\d+)"', webpage, 'brightcove id')
+ return self.url_result(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', brightcove_id)
diff --git a/youtube_dl/extractor/tvc.py b/youtube_dl/extractor/tvc.py
index 3a4f393..4065354 100644
--- a/youtube_dl/extractor/tvc.py
+++ b/youtube_dl/extractor/tvc.py
@@ -11,7 +11,7 @@ from ..utils import (
class TVCIE(InfoExtractor):
- _VALID_URL = r'http://(?:www\.)?tvc\.ru/video/iframe/id/(?P<id>\d+)'
+ _VALID_URL = r'https?://(?:www\.)?tvc\.ru/video/iframe/id/(?P<id>\d+)'
_TEST = {
'url': 'http://www.tvc.ru/video/iframe/id/74622/isPlay/false/id_stat/channel/?acc_video_id=/channel/brand/id/17/show/episodes/episode_id/39702',
'md5': 'bbc5ff531d1e90e856f60fc4b3afd708',
@@ -64,7 +64,7 @@ class TVCIE(InfoExtractor):
class TVCArticleIE(InfoExtractor):
- _VALID_URL = r'http://(?:www\.)?tvc\.ru/(?!video/iframe/id/)(?P<id>[^?#]+)'
+ _VALID_URL = r'https?://(?:www\.)?tvc\.ru/(?!video/iframe/id/)(?P<id>[^?#]+)'
_TESTS = [{
'url': 'http://www.tvc.ru/channel/brand/id/29/show/episodes/episode_id/39702/',
'info_dict': {
diff --git a/youtube_dl/extractor/tvigle.py b/youtube_dl/extractor/tvigle.py
index dc3a833..ead4c00 100644
--- a/youtube_dl/extractor/tvigle.py
+++ b/youtube_dl/extractor/tvigle.py
@@ -58,7 +58,9 @@ class TvigleIE(InfoExtractor):
if not video_id:
webpage = self._download_webpage(url, display_id)
video_id = self._html_search_regex(
- r'class="video-preview current_playing" id="(\d+)">',
+ (r'<div[^>]+class=["\']player["\'][^>]+id=["\'](\d+)',
+ r'var\s+cloudId\s*=\s*["\'](\d+)',
+ r'class="video-preview current_playing" id="(\d+)"'),
webpage, 'video id')
video_data = self._download_json(
@@ -81,10 +83,10 @@ class TvigleIE(InfoExtractor):
formats = []
for vcodec, fmts in item['videos'].items():
+ if vcodec == 'hls':
+ continue
for format_id, video_url in fmts.items():
if format_id == 'm3u8':
- formats.extend(self._extract_m3u8_formats(
- video_url, video_id, 'mp4', m3u8_id=vcodec))
continue
height = self._search_regex(
r'^(\d+)[pP]$', format_id, 'height', default=None)
diff --git a/youtube_dl/extractor/tvp.py b/youtube_dl/extractor/tvp.py
index f57d609..5070082 100644
--- a/youtube_dl/extractor/tvp.py
+++ b/youtube_dl/extractor/tvp.py
@@ -1,25 +1,24 @@
-# -*- coding: utf-8 -*-
+# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..utils import (
+ determine_ext,
+ clean_html,
+ get_element_by_attribute,
+ ExtractorError,
+)
-class TvpIE(InfoExtractor):
- IE_NAME = 'tvp.pl'
- _VALID_URL = r'https?://(?:vod|www)\.tvp\.pl/.*/(?P<id>\d+)$'
+class TVPIE(InfoExtractor):
+ IE_NAME = 'tvp'
+ IE_DESC = 'Telewizja Polska'
+ _VALID_URL = r'https?://[^/]+\.tvp\.(?:pl|info)/(?:(?!\d+/)[^/]+/)*(?P<id>\d+)'
_TESTS = [{
- 'url': 'http://vod.tvp.pl/filmy-fabularne/filmy-za-darmo/ogniem-i-mieczem/wideo/odc-2/4278035',
- 'md5': 'cdd98303338b8a7f7abab5cd14092bf2',
- 'info_dict': {
- 'id': '4278035',
- 'ext': 'wmv',
- 'title': 'Ogniem i mieczem, odc. 2',
- },
- }, {
- 'url': 'http://vod.tvp.pl/seriale/obyczajowe/czas-honoru/sezon-1-1-13/i-seria-odc-13/194536',
+ 'url': 'http://vod.tvp.pl/194536/i-seria-odc-13',
'md5': '8aa518c15e5cc32dfe8db400dc921fbb',
'info_dict': {
'id': '194536',
@@ -28,7 +27,7 @@ class TvpIE(InfoExtractor):
},
}, {
'url': 'http://www.tvp.pl/there-can-be-anything-so-i-shortened-it/17916176',
- 'md5': 'c3b15ed1af288131115ff17a17c19dda',
+ 'md5': 'b0005b542e5b4de643a9690326ab1257',
'info_dict': {
'id': '17916176',
'ext': 'mp4',
@@ -36,12 +35,22 @@ class TvpIE(InfoExtractor):
},
}, {
'url': 'http://vod.tvp.pl/seriale/obyczajowe/na-sygnale/sezon-2-27-/odc-39/17834272',
- 'md5': 'c3b15ed1af288131115ff17a17c19dda',
- 'info_dict': {
- 'id': '17834272',
- 'ext': 'mp4',
- 'title': 'Na sygnale, odc. 39',
- },
+ 'only_matching': True,
+ }, {
+ 'url': 'http://wiadomosci.tvp.pl/25169746/24052016-1200',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://krakow.tvp.pl/25511623/25lecie-mck-wyjatkowe-miejsce-na-mapie-krakowa',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://teleexpress.tvp.pl/25522307/wierni-wzieli-udzial-w-procesjach',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://sport.tvp.pl/25522165/krychowiak-uspokaja-w-sprawie-kontuzji-dwa-tygodnie-to-maksimum',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.tvp.info/25511919/trwa-rewolucja-wladza-zdecydowala-sie-na-pogwalcenie-konstytucji',
+ 'only_matching': True,
}]
def _real_extract(self, url):
@@ -50,6 +59,11 @@ class TvpIE(InfoExtractor):
webpage = self._download_webpage(
'http://www.tvp.pl/sess/tvplayer.php?object_id=%s' % video_id, video_id)
+ error_massage = get_element_by_attribute('class', 'msg error', webpage)
+ if error_massage:
+ raise ExtractorError('%s said: %s' % (
+ self.IE_NAME, clean_html(error_massage)), expected=True)
+
title = self._search_regex(
r'name\s*:\s*([\'"])Title\1\s*,\s*value\s*:\s*\1(?P<title>.+?)\1',
webpage, 'title', group='title')
@@ -63,24 +77,50 @@ class TvpIE(InfoExtractor):
r"poster\s*:\s*'([^']+)'", webpage, 'thumbnail', default=None)
video_url = self._search_regex(
- r'0:{src:([\'"])(?P<url>.*?)\1', webpage, 'formats', group='url', default=None)
- if not video_url:
+ r'0:{src:([\'"])(?P<url>.*?)\1', webpage,
+ 'formats', group='url', default=None)
+ if not video_url or 'material_niedostepny.mp4' in video_url:
video_url = self._download_json(
'http://www.tvp.pl/pub/stat/videofileinfo?video_id=%s' % video_id,
video_id)['video_url']
- ext = video_url.rsplit('.', 1)[-1]
- if ext != 'ism/manifest':
- if '/' in ext:
- ext = 'mp4'
+ formats = []
+ video_url_base = self._search_regex(
+ r'(https?://.+?/video)(?:\.(?:ism|f4m|m3u8)|-\d+\.mp4)',
+ video_url, 'video base url', default=None)
+ if video_url_base:
+ # TODO: Current DASH formats are broken - $Time$ pattern in
+ # <SegmentTemplate> not implemented yet
+ # formats.extend(self._extract_mpd_formats(
+ # video_url_base + '.ism/video.mpd',
+ # video_id, mpd_id='dash', fatal=False))
+ formats.extend(self._extract_f4m_formats(
+ video_url_base + '.ism/video.f4m',
+ video_id, f4m_id='hds', fatal=False))
+ m3u8_formats = self._extract_m3u8_formats(
+ video_url_base + '.ism/video.m3u8', video_id,
+ 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)
+ self._sort_formats(m3u8_formats)
+ m3u8_formats = list(filter(
+ lambda f: f.get('vcodec') != 'none' and f.get('resolution') != 'multiple',
+ m3u8_formats))
+ formats.extend(m3u8_formats)
+ for i, m3u8_format in enumerate(m3u8_formats, 2):
+ http_url = '%s-%d.mp4' % (video_url_base, i)
+ if self._is_valid_url(http_url, video_id):
+ f = m3u8_format.copy()
+ f.update({
+ 'url': http_url,
+ 'format_id': f['format_id'].replace('hls', 'http'),
+ 'protocol': 'http',
+ })
+ formats.append(f)
+ else:
formats = [{
'format_id': 'direct',
'url': video_url,
- 'ext': ext,
+ 'ext': determine_ext(video_url, 'mp4'),
}]
- else:
- m3u8_url = re.sub('([^/]*)\.ism/manifest', r'\1.ism/\1.m3u8', video_url)
- formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4')
self._sort_formats(formats)
@@ -92,8 +132,8 @@ class TvpIE(InfoExtractor):
}
-class TvpSeriesIE(InfoExtractor):
- IE_NAME = 'tvp.pl:Series'
+class TVPSeriesIE(InfoExtractor):
+ IE_NAME = 'tvp:series'
_VALID_URL = r'https?://vod\.tvp\.pl/(?:[^/]+/){2}(?P<id>[^/]+)/?$'
_TESTS = [{
@@ -127,7 +167,7 @@ class TvpSeriesIE(InfoExtractor):
videos_paths = re.findall(
'(?s)class="shortTitle">.*?href="(/[^"]+)', playlist)
entries = [
- self.url_result('http://vod.tvp.pl%s' % v_path, ie=TvpIE.ie_key())
+ self.url_result('http://vod.tvp.pl%s' % v_path, ie=TVPIE.ie_key())
for v_path in videos_paths]
return {
diff --git a/youtube_dl/extractor/tvplay.py b/youtube_dl/extractor/tvplay.py
index b4683de..df70a6b 100644
--- a/youtube_dl/extractor/tvplay.py
+++ b/youtube_dl/extractor/tvplay.py
@@ -13,7 +13,7 @@ from ..utils import (
class TVPlayIE(InfoExtractor):
IE_DESC = 'TV3Play and related services'
- _VALID_URL = r'''(?x)http://(?:www\.)?
+ _VALID_URL = r'''(?x)https?://(?:www\.)?
(?:tvplay\.lv/parraides|
tv3play\.lt/programos|
play\.tv3\.lt/programos|
diff --git a/youtube_dl/extractor/twentyfourvideo.py b/youtube_dl/extractor/twentyfourvideo.py
index e03e2db..4025edf 100644
--- a/youtube_dl/extractor/twentyfourvideo.py
+++ b/youtube_dl/extractor/twentyfourvideo.py
@@ -47,7 +47,8 @@ class TwentyFourVideoIE(InfoExtractor):
title = self._og_search_title(webpage)
description = self._html_search_regex(
- r'<span itemprop="description">([^<]+)</span>', webpage, 'description', fatal=False)
+ r'<(p|span)[^>]+itemprop="description"[^>]*>(?P<description>[^<]+)</\1>',
+ webpage, 'description', fatal=False, group='description')
thumbnail = self._og_search_thumbnail(webpage)
duration = int_or_none(self._og_search_property(
'duration', webpage, 'duration', fatal=False))
diff --git a/youtube_dl/extractor/twentymin.py b/youtube_dl/extractor/twentymin.py
index ca7d953..b721ecb 100644
--- a/youtube_dl/extractor/twentymin.py
+++ b/youtube_dl/extractor/twentymin.py
@@ -32,7 +32,22 @@ class TwentyMinutenIE(InfoExtractor):
'title': '«Wir müssen mutig nach vorne schauen»',
'description': 'Kein Land sei innovativer als die Schweiz, sagte Johann Schneider-Ammann in seiner Neujahrsansprache. Das Land müsse aber seine Hausaufgaben machen.',
'thumbnail': 'http://www.20min.ch/images/content/2/2/0/22050469/10/teaserbreit.jpg'
- }
+ },
+ 'skip': '"This video is no longer available" is shown both on the web page and in the downloaded file.',
+ }, {
+ # YouTube embed
+ 'url': 'http://www.20min.ch/ro/sports/football/story/Il-marque-une-bicyclette-de-plus-de-30-metres--21115184',
+ 'md5': 'cec64d59aa01c0ed9dbba9cf639dd82f',
+ 'info_dict': {
+ 'id': 'ivM7A7SpDOs',
+ 'ext': 'mp4',
+ 'title': 'GOLAZO DE CHILENA DE JAVI GÓMEZ, FINALISTA AL BALÓN DE CLM 2016',
+ 'description': 'md5:903c92fbf2b2f66c09de514bc25e9f5a',
+ 'upload_date': '20160424',
+ 'uploader': 'RTVCM Castilla-La Mancha',
+ 'uploader_id': 'RTVCM',
+ },
+ 'add_ie': ['Youtube'],
}, {
'url': 'http://www.20min.ch/videotv/?cid=44&vid=468738',
'only_matching': True,
@@ -48,6 +63,12 @@ class TwentyMinutenIE(InfoExtractor):
webpage = self._download_webpage(url, display_id)
+ youtube_url = self._html_search_regex(
+ r'<iframe[^>]+src="((?:https?:)?//www\.youtube\.com/embed/[^"]+)"',
+ webpage, 'YouTube embed URL', default=None)
+ if youtube_url is not None:
+ return self.url_result(youtube_url, 'Youtube')
+
title = self._html_search_regex(
r'<h1>.*?<span>(.+?)</span></h1>',
webpage, 'title', default=None)
diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py
index 69882da..2091977 100644
--- a/youtube_dl/extractor/twitch.py
+++ b/youtube_dl/extractor/twitch.py
@@ -9,17 +9,19 @@ from .common import InfoExtractor
from ..compat import (
compat_parse_qs,
compat_str,
- compat_urllib_parse,
+ compat_urllib_parse_urlencode,
compat_urllib_parse_urlparse,
compat_urlparse,
)
from ..utils import (
- encode_dict,
ExtractorError,
int_or_none,
+ js_to_json,
+ orderedSet,
parse_duration,
parse_iso8601,
sanitized_Request,
+ urlencode_postdata,
)
@@ -81,7 +83,7 @@ class TwitchBaseIE(InfoExtractor):
post_url = compat_urlparse.urljoin(redirect_url, post_url)
request = sanitized_Request(
- post_url, compat_urllib_parse.urlencode(encode_dict(login_form)).encode('utf-8'))
+ post_url, urlencode_postdata(login_form))
request.add_header('Referer', redirect_url)
response = self._download_webpage(
request, None, 'Logging in as %s' % username)
@@ -170,6 +172,7 @@ class TwitchVideoIE(TwitchItemBaseIE):
'title': 'Worlds Semifinals - Star Horn Royal Club vs. OMG',
},
'playlist_mincount': 12,
+ 'skip': 'HTTP Error 404: Not Found',
}
@@ -186,6 +189,7 @@ class TwitchChapterIE(TwitchItemBaseIE):
'title': 'ACRL Off Season - Sports Cars @ Nordschleife',
},
'playlist_mincount': 3,
+ 'skip': 'HTTP Error 404: Not Found',
}, {
'url': 'http://www.twitch.tv/tsm_theoddone/c/2349361',
'only_matching': True,
@@ -249,14 +253,15 @@ class TwitchVodIE(TwitchItemBaseIE):
formats = self._extract_m3u8_formats(
'%s/vod/%s?%s' % (
self._USHER_BASE, item_id,
- compat_urllib_parse.urlencode({
+ compat_urllib_parse_urlencode({
'allow_source': 'true',
+ 'allow_audio_only': 'true',
'allow_spectre': 'true',
'player': 'twitchweb',
'nauth': access_token['token'],
'nauthsig': access_token['sig'],
})),
- item_id, 'mp4')
+ item_id, 'mp4', entry_protocol='m3u8_native')
self._prefer_source(formats)
info['formats'] = formats
@@ -281,17 +286,37 @@ class TwitchPlaylistBaseIE(TwitchBaseIE):
entries = []
offset = 0
limit = self._PAGE_LIMIT
+ broken_paging_detected = False
+ counter_override = None
for counter in itertools.count(1):
response = self._download_json(
self._PLAYLIST_URL % (channel_id, offset, limit),
- channel_id, 'Downloading %s videos JSON page %d' % (self._PLAYLIST_TYPE, counter))
+ channel_id,
+ 'Downloading %s videos JSON page %s'
+ % (self._PLAYLIST_TYPE, counter_override or counter))
page_entries = self._extract_playlist_page(response)
if not page_entries:
break
+ total = int_or_none(response.get('_total'))
+ # Since the beginning of March 2016 twitch's paging mechanism
+ # is completely broken on the twitch side. It simply ignores
+ # a limit and returns the whole offset number of videos.
+ # Working around by just requesting all videos at once.
+ # Upd: pagination bug was fixed by twitch on 15.03.2016.
+ if not broken_paging_detected and total and len(page_entries) > limit:
+ self.report_warning(
+ 'Twitch pagination is broken on twitch side, requesting all videos at once',
+ channel_id)
+ broken_paging_detected = True
+ offset = total
+ counter_override = '(all at once)'
+ continue
entries.extend(page_entries)
+ if broken_paging_detected or total and len(page_entries) >= total:
+ break
offset += limit
return self.playlist_result(
- [self.url_result(entry) for entry in set(entries)],
+ [self.url_result(entry) for entry in orderedSet(entries)],
channel_id, channel_name)
def _extract_playlist_page(self, response):
@@ -333,31 +358,6 @@ class TwitchPastBroadcastsIE(TwitchPlaylistBaseIE):
}
-class TwitchBookmarksIE(TwitchPlaylistBaseIE):
- IE_NAME = 'twitch:bookmarks'
- _VALID_URL = r'%s/(?P<id>[^/]+)/profile/bookmarks/?(?:\#.*)?$' % TwitchBaseIE._VALID_URL_BASE
- _PLAYLIST_URL = '%s/api/bookmark/?user=%%s&offset=%%d&limit=%%d' % TwitchBaseIE._API_BASE
- _PLAYLIST_TYPE = 'bookmarks'
-
- _TEST = {
- 'url': 'http://www.twitch.tv/ognos/profile/bookmarks',
- 'info_dict': {
- 'id': 'ognos',
- 'title': 'Ognos',
- },
- 'playlist_mincount': 3,
- }
-
- def _extract_playlist_page(self, response):
- entries = []
- for bookmark in response.get('bookmarks', []):
- video = bookmark.get('video')
- if not video:
- continue
- entries.append(video['url'])
- return entries
-
-
class TwitchStreamIE(TwitchBaseIE):
IE_NAME = 'twitch:stream'
_VALID_URL = r'%s/(?P<id>[^/#?]+)/?(?:\#.*)?$' % TwitchBaseIE._VALID_URL_BASE
@@ -411,6 +411,7 @@ class TwitchStreamIE(TwitchBaseIE):
query = {
'allow_source': 'true',
+ 'allow_audio_only': 'true',
'p': random.randint(1000000, 10000000),
'player': 'twitchweb',
'segment_preference': '4',
@@ -419,7 +420,7 @@ class TwitchStreamIE(TwitchBaseIE):
}
formats = self._extract_m3u8_formats(
'%s/api/channel/hls/%s.m3u8?%s'
- % (self._USHER_BASE, channel_id, compat_urllib_parse.urlencode(query)),
+ % (self._USHER_BASE, channel_id, compat_urllib_parse_urlencode(query)),
channel_id, 'mp4')
self._prefer_source(formats)
@@ -454,3 +455,45 @@ class TwitchStreamIE(TwitchBaseIE):
'formats': formats,
'is_live': True,
}
+
+
+class TwitchClipsIE(InfoExtractor):
+ IE_NAME = 'twitch:clips'
+ _VALID_URL = r'https?://clips\.twitch\.tv/(?:[^/]+/)*(?P<id>[^/?#&]+)'
+
+ _TEST = {
+ 'url': 'https://clips.twitch.tv/ea/AggressiveCobraPoooound',
+ 'md5': '761769e1eafce0ffebfb4089cb3847cd',
+ 'info_dict': {
+ 'id': 'AggressiveCobraPoooound',
+ 'ext': 'mp4',
+ 'title': 'EA Play 2016 Live from the Novo Theatre',
+ 'thumbnail': 're:^https?://.*\.jpg',
+ 'creator': 'EA',
+ 'uploader': 'stereotype_',
+ 'uploader_id': 'stereotype_',
+ },
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ clip = self._parse_json(
+ self._search_regex(
+ r'(?s)clipInfo\s*=\s*({.+?});', webpage, 'clip info'),
+ video_id, transform_source=js_to_json)
+
+ video_url = clip['clip_video_url']
+ title = clip['channel_title']
+
+ return {
+ 'id': video_id,
+ 'url': video_url,
+ 'title': title,
+ 'thumbnail': self._og_search_thumbnail(webpage),
+ 'creator': clip.get('broadcaster_display_name') or clip.get('broadcaster_login'),
+ 'uploader': clip.get('curator_login'),
+ 'uploader_id': clip.get('curator_display_name'),
+ }
diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py
index 5d2b5ec..b738429 100644
--- a/youtube_dl/extractor/twitter.py
+++ b/youtube_dl/extractor/twitter.py
@@ -5,12 +5,12 @@ import re
from .common import InfoExtractor
from ..utils import (
+ determine_ext,
float_or_none,
xpath_text,
remove_end,
int_or_none,
ExtractorError,
- sanitized_Request,
)
@@ -22,7 +22,7 @@ class TwitterBaseIE(InfoExtractor):
class TwitterCardIE(TwitterBaseIE):
IE_NAME = 'twitter:card'
- _VALID_URL = r'https?://(?:www\.)?twitter\.com/i/cards/tfw/v1/(?P<id>\d+)'
+ _VALID_URL = r'https?://(?:www\.)?twitter\.com/i/(?:cards/tfw/v1|videos/tweet)/(?P<id>\d+)'
_TESTS = [
{
'url': 'https://twitter.com/i/cards/tfw/v1/560070183650213889',
@@ -30,7 +30,7 @@ class TwitterCardIE(TwitterBaseIE):
'info_dict': {
'id': '560070183650213889',
'ext': 'mp4',
- 'title': 'TwitterCard',
+ 'title': 'Twitter Card',
'thumbnail': 're:^https?://.*\.jpg$',
'duration': 30.033,
}
@@ -41,7 +41,7 @@ class TwitterCardIE(TwitterBaseIE):
'info_dict': {
'id': '623160978427936768',
'ext': 'mp4',
- 'title': 'TwitterCard',
+ 'title': 'Twitter Card',
'thumbnail': 're:^https?://.*\.jpg',
'duration': 80.155,
},
@@ -53,7 +53,7 @@ class TwitterCardIE(TwitterBaseIE):
'id': 'dq4Oj5quskI',
'ext': 'mp4',
'title': 'Ubuntu 11.10 Overview',
- 'description': 'Take a quick peek at what\'s new and improved in Ubuntu 11.10.\n\nOnce installed take a look at 10 Things to Do After Installing: http://www.omgubuntu.co.uk/2011/10/10-things-to-do-after-installing-ubuntu-11-10/',
+ 'description': 'Take a quick peek at what\'s new and improved in Ubuntu 11.10.\n\nOnce installed take a look at 10 Things to Do After Installing: http://www.omgubuntu.co.uk/2011/10/10...',
'upload_date': '20111013',
'uploader': 'OMG! Ubuntu!',
'uploader_id': 'omgubuntu',
@@ -72,63 +72,107 @@ class TwitterCardIE(TwitterBaseIE):
'title': 'Vine by ArsenalTerje',
},
'add_ie': ['Vine'],
- }
+ }, {
+ 'url': 'https://twitter.com/i/videos/tweet/705235433198714880',
+ 'md5': '3846d0a07109b5ab622425449b59049d',
+ 'info_dict': {
+ 'id': '705235433198714880',
+ 'ext': 'mp4',
+ 'title': 'Twitter web player',
+ 'thumbnail': 're:^https?://.*\.jpg',
+ },
+ },
]
def _real_extract(self, url):
video_id = self._match_id(url)
- # Different formats served for different User-Agents
- USER_AGENTS = [
- 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)', # mp4
- 'Mozilla/5.0 (Windows NT 5.2; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0', # webm
- ]
-
config = None
formats = []
- for user_agent in USER_AGENTS:
- request = sanitized_Request(url)
- request.add_header('User-Agent', user_agent)
- webpage = self._download_webpage(request, video_id)
-
- iframe_url = self._html_search_regex(
- r'<iframe[^>]+src="((?:https?:)?//(?:www.youtube.com/embed/[^"]+|(?:www\.)?vine\.co/v/\w+/card))"',
- webpage, 'video iframe', default=None)
- if iframe_url:
- return self.url_result(iframe_url)
-
- config = self._parse_json(self._html_search_regex(
- r'data-player-config="([^"]+)"', webpage, 'data player config'),
- video_id)
- if 'playlist' not in config:
- if 'vmapUrl' in config:
- formats.append({
- 'url': self._get_vmap_video_url(config['vmapUrl'], video_id),
- })
- break # same video regardless of UA
- continue
-
- video_url = config['playlist'][0]['source']
-
- f = {
- 'url': video_url,
- }
+ duration = None
+ webpage = self._download_webpage(url, video_id)
+
+ iframe_url = self._html_search_regex(
+ r'<iframe[^>]+src="((?:https?:)?//(?:www.youtube.com/embed/[^"]+|(?:www\.)?vine\.co/v/\w+/card))"',
+ webpage, 'video iframe', default=None)
+ if iframe_url:
+ return self.url_result(iframe_url)
+
+ config = self._parse_json(self._html_search_regex(
+ r'data-(?:player-)?config="([^"]+)"', webpage, 'data player config'),
+ video_id)
+
+ if config.get('source_type') == 'vine':
+ return self.url_result(config['player_url'], 'Vine')
+
+ def _search_dimensions_in_video_url(a_format, video_url):
m = re.search(r'/(?P<width>\d+)x(?P<height>\d+)/', video_url)
if m:
- f.update({
+ a_format.update({
'width': int(m.group('width')),
'height': int(m.group('height')),
})
- formats.append(f)
+
+ video_url = config.get('video_url') or config.get('playlist', [{}])[0].get('source')
+
+ if video_url:
+ if determine_ext(video_url) == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(video_url, video_id, ext='mp4', m3u8_id='hls'))
+ else:
+ f = {
+ 'url': video_url,
+ }
+
+ _search_dimensions_in_video_url(f, video_url)
+
+ formats.append(f)
+
+ vmap_url = config.get('vmapUrl') or config.get('vmap_url')
+ if vmap_url:
+ formats.append({
+ 'url': self._get_vmap_video_url(vmap_url, video_id),
+ })
+
+ media_info = None
+
+ for entity in config.get('status', {}).get('entities', []):
+ if 'mediaInfo' in entity:
+ media_info = entity['mediaInfo']
+
+ if media_info:
+ for media_variant in media_info['variants']:
+ media_url = media_variant['url']
+ if media_url.endswith('.m3u8'):
+ formats.extend(self._extract_m3u8_formats(media_url, video_id, ext='mp4', m3u8_id='hls'))
+ elif media_url.endswith('.mpd'):
+ formats.extend(self._extract_mpd_formats(media_url, video_id, mpd_id='dash'))
+ else:
+ vbr = int_or_none(media_variant.get('bitRate'), scale=1000)
+ a_format = {
+ 'url': media_url,
+ 'format_id': 'http-%d' % vbr if vbr else 'http',
+ 'vbr': vbr,
+ }
+ # Reported bitRate may be zero
+ if not a_format['vbr']:
+ del a_format['vbr']
+
+ _search_dimensions_in_video_url(a_format, media_url)
+
+ formats.append(a_format)
+
+ duration = float_or_none(media_info.get('duration', {}).get('nanos'), scale=1e9)
+
self._sort_formats(formats)
- thumbnail = config.get('posterImageUrl')
- duration = float_or_none(config.get('duration'))
+ title = self._search_regex(r'<title>([^<]+)</title>', webpage, 'title')
+ thumbnail = config.get('posterImageUrl') or config.get('image_src')
+ duration = float_or_none(config.get('duration')) or duration
return {
'id': video_id,
- 'title': 'TwitterCard',
+ 'title': title,
'thumbnail': thumbnail,
'duration': duration,
'formats': formats,
@@ -142,17 +186,18 @@ class TwitterIE(InfoExtractor):
_TESTS = [{
'url': 'https://twitter.com/freethenipple/status/643211948184596480',
- # MD5 checksums are different in different places
'info_dict': {
'id': '643211948184596480',
'ext': 'mp4',
'title': 'FREE THE NIPPLE - FTN supporters on Hollywood Blvd today!',
'thumbnail': 're:^https?://.*\.jpg',
- 'duration': 12.922,
'description': 'FREE THE NIPPLE on Twitter: "FTN supporters on Hollywood Blvd today! http://t.co/c7jHH749xJ"',
'uploader': 'FREE THE NIPPLE',
'uploader_id': 'freethenipple',
},
+ 'params': {
+ 'skip_download': True, # requires ffmpeg
+ },
}, {
'url': 'https://twitter.com/giphz/status/657991469417025536/photo/1',
'md5': 'f36dcd5fb92bf7057f155e7d927eeb42',
@@ -166,6 +211,7 @@ class TwitterIE(InfoExtractor):
'uploader_id': 'giphz',
},
'expected_warnings': ['height', 'width'],
+ 'skip': 'Account suspended',
}, {
'url': 'https://twitter.com/starwars/status/665052190608723968',
'md5': '39b7199856dee6cd4432e72c74bc69d4',
@@ -177,6 +223,61 @@ class TwitterIE(InfoExtractor):
'uploader_id': 'starwars',
'uploader': 'Star Wars',
},
+ }, {
+ 'url': 'https://twitter.com/BTNBrentYarina/status/705235433198714880',
+ 'info_dict': {
+ 'id': '705235433198714880',
+ 'ext': 'mp4',
+ 'title': 'Brent Yarina - Khalil Iverson\'s missed highlight dunk. And made highlight dunk. In one highlight.',
+ 'description': 'Brent Yarina on Twitter: "Khalil Iverson\'s missed highlight dunk. And made highlight dunk. In one highlight."',
+ 'uploader_id': 'BTNBrentYarina',
+ 'uploader': 'Brent Yarina',
+ },
+ 'params': {
+ # The same video as https://twitter.com/i/videos/tweet/705235433198714880
+ # Test case of TwitterCardIE
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'https://twitter.com/jaydingeer/status/700207533655363584',
+ 'md5': '',
+ 'info_dict': {
+ 'id': '700207533655363584',
+ 'ext': 'mp4',
+ 'title': 'Donte The Dumbass - BEAT PROD: @suhmeduh #Damndaniel',
+ 'description': 'Donte The Dumbass on Twitter: "BEAT PROD: @suhmeduh https://t.co/HBrQ4AfpvZ #Damndaniel https://t.co/byBooq2ejZ"',
+ 'thumbnail': 're:^https?://.*\.jpg',
+ 'uploader': 'Donte The Dumbass',
+ 'uploader_id': 'jaydingeer',
+ },
+ 'params': {
+ 'skip_download': True, # requires ffmpeg
+ },
+ }, {
+ 'url': 'https://twitter.com/Filmdrunk/status/713801302971588609',
+ 'md5': '89a15ed345d13b86e9a5a5e051fa308a',
+ 'info_dict': {
+ 'id': 'MIOxnrUteUd',
+ 'ext': 'mp4',
+ 'title': 'Dr.Pepperの飲み方 #japanese #バカ #ドクペ #電動ガン',
+ 'uploader': 'TAKUMA',
+ 'uploader_id': '1004126642786242560',
+ 'upload_date': '20140615',
+ },
+ 'add_ie': ['Vine'],
+ }, {
+ 'url': 'https://twitter.com/captainamerica/status/719944021058060289',
+ 'info_dict': {
+ 'id': '719944021058060289',
+ 'ext': 'mp4',
+ 'title': 'Captain America - @King0fNerd Are you sure you made the right choice? Find out in theaters.',
+ 'description': 'Captain America on Twitter: "@King0fNerd Are you sure you made the right choice? Find out in theaters. https://t.co/GpgYi9xMJI"',
+ 'uploader_id': 'captainamerica',
+ 'uploader': 'Captain America',
+ },
+ 'params': {
+ 'skip_download': True, # requires ffmpeg
+ },
}]
def _real_extract(self, url):
@@ -184,7 +285,11 @@ class TwitterIE(InfoExtractor):
user_id = mobj.group('user_id')
twid = mobj.group('id')
- webpage = self._download_webpage(self._TEMPLATE_URL % (user_id, twid), twid)
+ webpage, urlh = self._download_webpage_handle(
+ self._TEMPLATE_URL % (user_id, twid), twid)
+
+ if 'twitter.com/account/suspended' in urlh.geturl():
+ raise ExtractorError('Account suspended by Twitter.', expected=True)
username = remove_end(self._og_search_title(webpage), ' on Twitter')
@@ -201,17 +306,6 @@ class TwitterIE(InfoExtractor):
'title': username + ' - ' + title,
}
- card_id = self._search_regex(
- r'["\']/i/cards/tfw/v1/(\d+)', webpage, 'twitter card url', default=None)
- if card_id:
- card_url = 'https://twitter.com/i/cards/tfw/v1/' + card_id
- info.update({
- '_type': 'url_transparent',
- 'ie_key': 'TwitterCard',
- 'url': card_url,
- })
- return info
-
mobj = re.search(r'''(?x)
<video[^>]+class="animated-gif"(?P<more_info>[^>]+)>\s*
<source[^>]+video-src="(?P<url>[^"]+)"
@@ -234,6 +328,15 @@ class TwitterIE(InfoExtractor):
})
return info
+ if 'class="PlayableMedia' in webpage:
+ info.update({
+ '_type': 'url_transparent',
+ 'ie_key': 'TwitterCard',
+ 'url': '%s//twitter.com/i/videos/tweet/%s' % (self.http_scheme(), twid),
+ })
+
+ return info
+
raise ExtractorError('There\'s no video in this tweet.')
diff --git a/youtube_dl/extractor/ubu.py b/youtube_dl/extractor/ubu.py
deleted file mode 100644
index d502377..0000000
--- a/youtube_dl/extractor/ubu.py
+++ /dev/null
@@ -1,57 +0,0 @@
-from __future__ import unicode_literals
-
-import re
-
-from .common import InfoExtractor
-from ..utils import (
- int_or_none,
- qualities,
-)
-
-
-class UbuIE(InfoExtractor):
- _VALID_URL = r'http://(?:www\.)?ubu\.com/film/(?P<id>[\da-z_-]+)\.html'
- _TEST = {
- 'url': 'http://ubu.com/film/her_noise.html',
- 'md5': '138d5652618bf0f03878978db9bef1ee',
- 'info_dict': {
- 'id': 'her_noise',
- 'ext': 'm4v',
- 'title': 'Her Noise - The Making Of (2007)',
- 'duration': 3600,
- },
- }
-
- def _real_extract(self, url):
- video_id = self._match_id(url)
- webpage = self._download_webpage(url, video_id)
-
- title = self._html_search_regex(
- r'<title>.+?Film &amp; Video: ([^<]+)</title>', webpage, 'title')
-
- duration = int_or_none(self._html_search_regex(
- r'Duration: (\d+) minutes', webpage, 'duration', fatal=False),
- invscale=60)
-
- formats = []
- FORMAT_REGEXES = [
- ('sq', r"'flashvars'\s*,\s*'file=([^']+)'"),
- ('hq', r'href="(http://ubumexico\.centro\.org\.mx/video/[^"]+)"'),
- ]
- preference = qualities([fid for fid, _ in FORMAT_REGEXES])
- for format_id, format_regex in FORMAT_REGEXES:
- m = re.search(format_regex, webpage)
- if m:
- formats.append({
- 'url': m.group(1),
- 'format_id': format_id,
- 'preference': preference(format_id),
- })
- self._sort_formats(formats)
-
- return {
- 'id': video_id,
- 'title': title,
- 'duration': duration,
- 'formats': formats,
- }
diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py
index f5b5e7f..89b8695 100644
--- a/youtube_dl/extractor/udemy.py
+++ b/youtube_dl/extractor/udemy.py
@@ -1,23 +1,37 @@
from __future__ import unicode_literals
+import re
+
from .common import InfoExtractor
from ..compat import (
compat_HTTPError,
- compat_urllib_parse,
compat_urllib_request,
+ compat_urlparse,
)
from ..utils import (
+ determine_ext,
+ extract_attributes,
ExtractorError,
float_or_none,
int_or_none,
sanitized_Request,
unescapeHTML,
+ urlencode_postdata,
)
class UdemyIE(InfoExtractor):
IE_NAME = 'udemy'
- _VALID_URL = r'https?://www\.udemy\.com/(?:[^#]+#/lecture/|lecture/view/?\?lectureId=)(?P<id>\d+)'
+ _VALID_URL = r'''(?x)
+ https?://
+ www\.udemy\.com/
+ (?:
+ [^#]+\#/lecture/|
+ lecture/view/?\?lectureId=|
+ [^/]+/learn/v4/t/lecture/
+ )
+ (?P<id>\d+)
+ '''
_LOGIN_URL = 'https://www.udemy.com/join/login-popup/?displayType=ajax&showSkipButton=1'
_ORIGIN_URL = 'https://www.udemy.com'
_NETRC_MACHINE = 'udemy'
@@ -33,36 +47,55 @@ class UdemyIE(InfoExtractor):
'duration': 579.29,
},
'skip': 'Requires udemy account credentials',
+ }, {
+ # new URL schema
+ 'url': 'https://www.udemy.com/electric-bass-right-from-the-start/learn/v4/t/lecture/4580906',
+ 'only_matching': True,
}]
- def _enroll_course(self, webpage, course_id):
+ def _extract_course_info(self, webpage, video_id):
+ course = self._parse_json(
+ unescapeHTML(self._search_regex(
+ r'ng-init=["\'].*\bcourse=({.+?});', webpage, 'course', default='{}')),
+ video_id, fatal=False) or {}
+ course_id = course.get('id') or self._search_regex(
+ (r'&quot;id&quot;\s*:\s*(\d+)', r'data-course-id=["\'](\d+)'),
+ webpage, 'course id')
+ return course_id, course.get('title')
+
+ def _enroll_course(self, base_url, webpage, course_id):
+ def combine_url(base_url, url):
+ return compat_urlparse.urljoin(base_url, url) if not url.startswith('http') else url
+
checkout_url = unescapeHTML(self._search_regex(
- r'href=(["\'])(?P<url>https?://(?:www\.)?udemy\.com/payment/checkout/.+?)\1',
+ r'href=(["\'])(?P<url>(?:https?://(?:www\.)?udemy\.com)?/payment/checkout/.+?)\1',
webpage, 'checkout url', group='url', default=None))
if checkout_url:
raise ExtractorError(
'Course %s is not free. You have to pay for it before you can download. '
- 'Use this URL to confirm purchase: %s' % (course_id, checkout_url), expected=True)
+ 'Use this URL to confirm purchase: %s'
+ % (course_id, combine_url(base_url, checkout_url)),
+ expected=True)
enroll_url = unescapeHTML(self._search_regex(
- r'href=(["\'])(?P<url>https?://(?:www\.)?udemy\.com/course/subscribe/.+?)\1',
+ r'href=(["\'])(?P<url>(?:https?://(?:www\.)?udemy\.com)?/course/subscribe/.+?)\1',
webpage, 'enroll url', group='url', default=None))
if enroll_url:
- webpage = self._download_webpage(enroll_url, course_id, 'Enrolling in the course')
+ webpage = self._download_webpage(
+ combine_url(base_url, enroll_url),
+ course_id, 'Enrolling in the course',
+ headers={'Referer': base_url})
if '>You have enrolled in' in webpage:
self.to_screen('%s: Successfully enrolled in the course' % course_id)
def _download_lecture(self, course_id, lecture_id):
return self._download_json(
- 'https://www.udemy.com/api-2.0/users/me/subscribed-courses/%s/lectures/%s?%s' % (
- course_id, lecture_id, compat_urllib_parse.urlencode({
- 'video_only': '',
- 'auto_play': '',
- 'fields[lecture]': 'title,description,asset',
- 'fields[asset]': 'asset_type,stream_url,thumbnail_url,download_urls,data',
- 'instructorPreviewMode': 'False',
- })),
- lecture_id, 'Downloading lecture JSON')
+ 'https://www.udemy.com/api-2.0/users/me/subscribed-courses/%s/lectures/%s?'
+ % (course_id, lecture_id),
+ lecture_id, 'Downloading lecture JSON', query={
+ 'fields[lecture]': 'title,description,view_html,asset',
+ 'fields[asset]': 'asset_type,stream_url,thumbnail_url,download_urls,data',
+ })
def _handle_error(self, response):
if not isinstance(response, dict):
@@ -75,7 +108,7 @@ class UdemyIE(InfoExtractor):
error_str += ' - %s' % error_data.get('formErrors')
raise ExtractorError(error_str, expected=True)
- def _download_json(self, url_or_request, video_id, note='Downloading JSON metadata'):
+ def _download_json(self, url_or_request, *args, **kwargs):
headers = {
'X-Udemy-Snail-Case': 'true',
'X-Requested-With': 'XMLHttpRequest',
@@ -93,7 +126,7 @@ class UdemyIE(InfoExtractor):
else:
url_or_request = sanitized_Request(url_or_request, headers=headers)
- response = super(UdemyIE, self)._download_json(url_or_request, video_id, note)
+ response = super(UdemyIE, self)._download_json(url_or_request, *args, **kwargs)
self._handle_error(response)
return response
@@ -109,7 +142,9 @@ class UdemyIE(InfoExtractor):
self._LOGIN_URL, None, 'Downloading login popup')
def is_logged(webpage):
- return any(p in webpage for p in ['href="https://www.udemy.com/user/logout/', '>Logout<'])
+ return any(re.search(p, webpage) for p in (
+ r'href=["\'](?:https://www\.udemy\.com)?/user/logout/',
+ r'>Logout<'))
# already logged in
if is_logged(login_popup):
@@ -118,17 +153,17 @@ class UdemyIE(InfoExtractor):
login_form = self._form_hidden_inputs('login-form', login_popup)
login_form.update({
- 'email': username.encode('utf-8'),
- 'password': password.encode('utf-8'),
+ 'email': username,
+ 'password': password,
})
- request = sanitized_Request(
- self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8'))
- request.add_header('Referer', self._ORIGIN_URL)
- request.add_header('Origin', self._ORIGIN_URL)
-
response = self._download_webpage(
- request, None, 'Logging in as %s' % username)
+ self._LOGIN_URL, None, 'Logging in as %s' % username,
+ data=urlencode_postdata(login_form),
+ headers={
+ 'Referer': self._ORIGIN_URL,
+ 'Origin': self._ORIGIN_URL,
+ })
if not is_logged(response):
error = self._html_search_regex(
@@ -143,15 +178,14 @@ class UdemyIE(InfoExtractor):
webpage = self._download_webpage(url, lecture_id)
- course_id = self._search_regex(
- r'data-course-id=["\'](\d+)', webpage, 'course id')
+ course_id, _ = self._extract_course_info(webpage, lecture_id)
try:
lecture = self._download_lecture(course_id, lecture_id)
except ExtractorError as e:
# Error could possibly mean we are not enrolled in the course
if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
- self._enroll_course(webpage, course_id)
+ self._enroll_course(url, webpage, course_id)
lecture = self._download_lecture(course_id, lecture_id)
else:
raise
@@ -161,12 +195,12 @@ class UdemyIE(InfoExtractor):
asset = lecture['asset']
- asset_type = asset.get('assetType') or asset.get('asset_type')
+ asset_type = asset.get('asset_type') or asset.get('assetType')
if asset_type != 'Video':
raise ExtractorError(
'Lecture %s is not a video' % lecture_id, expected=True)
- stream_url = asset.get('streamUrl') or asset.get('stream_url')
+ stream_url = asset.get('stream_url') or asset.get('streamUrl')
if stream_url:
youtube_url = self._search_regex(
r'(https?://www\.youtube\.com/watch\?v=.*)', stream_url, 'youtube URL', default=None)
@@ -174,43 +208,92 @@ class UdemyIE(InfoExtractor):
return self.url_result(youtube_url, 'Youtube')
video_id = asset['id']
- thumbnail = asset.get('thumbnailUrl') or asset.get('thumbnail_url')
+ thumbnail = asset.get('thumbnail_url') or asset.get('thumbnailUrl')
duration = float_or_none(asset.get('data', {}).get('duration'))
- outputs = asset.get('data', {}).get('outputs', {})
formats = []
- for format_ in asset.get('download_urls', {}).get('Video', []):
- video_url = format_.get('file')
- if not video_url:
- continue
- format_id = format_.get('label')
- f = {
- 'url': format_['file'],
- 'height': int_or_none(format_id),
+
+ def extract_output_format(src):
+ return {
+ 'url': src['url'],
+ 'format_id': '%sp' % (src.get('height') or format_id),
+ 'width': int_or_none(src.get('width')),
+ 'height': int_or_none(src.get('height')),
+ 'vbr': int_or_none(src.get('video_bitrate_in_kbps')),
+ 'vcodec': src.get('video_codec'),
+ 'fps': int_or_none(src.get('frame_rate')),
+ 'abr': int_or_none(src.get('audio_bitrate_in_kbps')),
+ 'acodec': src.get('audio_codec'),
+ 'asr': int_or_none(src.get('audio_sample_rate')),
+ 'tbr': int_or_none(src.get('total_bitrate_in_kbps')),
+ 'filesize': int_or_none(src.get('file_size_in_bytes')),
}
- if format_id:
- # Some videos contain additional metadata (e.g.
- # https://www.udemy.com/ios9-swift/learn/#/lecture/3383208)
- output = outputs.get(format_id)
- if isinstance(output, dict):
- f.update({
- 'format_id': '%sp' % (output.get('label') or format_id),
- 'width': int_or_none(output.get('width')),
- 'height': int_or_none(output.get('height')),
- 'vbr': int_or_none(output.get('video_bitrate_in_kbps')),
- 'vcodec': output.get('video_codec'),
- 'fps': int_or_none(output.get('frame_rate')),
- 'abr': int_or_none(output.get('audio_bitrate_in_kbps')),
- 'acodec': output.get('audio_codec'),
- 'asr': int_or_none(output.get('audio_sample_rate')),
- 'tbr': int_or_none(output.get('total_bitrate_in_kbps')),
- 'filesize': int_or_none(output.get('file_size_in_bytes')),
- })
+
+ outputs = asset.get('data', {}).get('outputs')
+ if not isinstance(outputs, dict):
+ outputs = {}
+
+ def add_output_format_meta(f, key):
+ output = outputs.get(key)
+ if isinstance(output, dict):
+ output_format = extract_output_format(output)
+ output_format.update(f)
+ return output_format
+ return f
+
+ download_urls = asset.get('download_urls')
+ if isinstance(download_urls, dict):
+ video = download_urls.get('Video')
+ if isinstance(video, list):
+ for format_ in video:
+ video_url = format_.get('file')
+ if not video_url:
+ continue
+ format_id = format_.get('label')
+ f = {
+ 'url': format_['file'],
+ 'format_id': '%sp' % format_id,
+ 'height': int_or_none(format_id),
+ }
+ if format_id:
+ # Some videos contain additional metadata (e.g.
+ # https://www.udemy.com/ios9-swift/learn/#/lecture/3383208)
+ f = add_output_format_meta(f, format_id)
+ formats.append(f)
+
+ view_html = lecture.get('view_html')
+ if view_html:
+ view_html_urls = set()
+ for source in re.findall(r'<source[^>]+>', view_html):
+ attributes = extract_attributes(source)
+ src = attributes.get('src')
+ if not src:
+ continue
+ res = attributes.get('data-res')
+ height = int_or_none(res)
+ if src in view_html_urls:
+ continue
+ view_html_urls.add(src)
+ if attributes.get('type') == 'application/x-mpegURL' or determine_ext(src) == 'm3u8':
+ m3u8_formats = self._extract_m3u8_formats(
+ src, video_id, 'mp4', entry_protocol='m3u8_native',
+ m3u8_id='hls', fatal=False)
+ for f in m3u8_formats:
+ m = re.search(r'/hls_(?P<height>\d{3,4})_(?P<tbr>\d{2,})/', f['url'])
+ if m:
+ if not f.get('height'):
+ f['height'] = int(m.group('height'))
+ if not f.get('tbr'):
+ f['tbr'] = int(m.group('tbr'))
+ formats.extend(m3u8_formats)
else:
- f['format_id'] = '%sp' % format_id
- formats.append(f)
+ formats.append(add_output_format_meta({
+ 'url': src,
+ 'format_id': '%dp' % height if height else None,
+ 'height': height,
+ }, res))
- self._sort_formats(formats)
+ self._sort_formats(formats, field_preference=('height', 'width', 'tbr', 'format_id'))
return {
'id': video_id,
@@ -224,7 +307,7 @@ class UdemyIE(InfoExtractor):
class UdemyCourseIE(UdemyIE):
IE_NAME = 'udemy:course'
- _VALID_URL = r'https?://www\.udemy\.com/(?P<id>[\da-z-]+)'
+ _VALID_URL = r'https?://www\.udemy\.com/(?P<id>[^/?#&]+)'
_TESTS = []
@classmethod
@@ -236,29 +319,34 @@ class UdemyCourseIE(UdemyIE):
webpage = self._download_webpage(url, course_path)
- response = self._download_json(
- 'https://www.udemy.com/api-1.1/courses/%s' % course_path,
- course_path, 'Downloading course JSON')
-
- course_id = response['id']
- course_title = response.get('title')
+ course_id, title = self._extract_course_info(webpage, course_path)
- self._enroll_course(webpage, course_id)
+ self._enroll_course(url, webpage, course_id)
response = self._download_json(
- 'https://www.udemy.com/api-1.1/courses/%s/curriculum' % course_id,
- course_id, 'Downloading course curriculum')
+ 'https://www.udemy.com/api-2.0/courses/%s/cached-subscriber-curriculum-items' % course_id,
+ course_id, 'Downloading course curriculum', query={
+ 'fields[chapter]': 'title,object_index',
+ 'fields[lecture]': 'title,asset',
+ 'page_size': '1000',
+ })
entries = []
- chapter, chapter_number = None, None
- for asset in response:
- asset_type = asset.get('assetType') or asset.get('asset_type')
- if asset_type == 'Video':
- asset_id = asset.get('id')
- if asset_id:
+ chapter, chapter_number = [None] * 2
+ for entry in response['results']:
+ clazz = entry.get('_class')
+ if clazz == 'lecture':
+ asset = entry.get('asset')
+ if isinstance(asset, dict):
+ asset_type = asset.get('asset_type') or asset.get('assetType')
+ if asset_type != 'Video':
+ continue
+ lecture_id = entry.get('id')
+ if lecture_id:
entry = {
'_type': 'url_transparent',
- 'url': 'https://www.udemy.com/%s/#/lecture/%s' % (course_path, asset['id']),
+ 'url': 'https://www.udemy.com/%s/learn/v4/t/lecture/%s' % (course_path, entry['id']),
+ 'title': entry.get('title'),
'ie_key': UdemyIE.ie_key(),
}
if chapter_number:
@@ -266,8 +354,8 @@ class UdemyCourseIE(UdemyIE):
if chapter:
entry['chapter'] = chapter
entries.append(entry)
- elif asset.get('type') == 'chapter':
- chapter_number = asset.get('index') or asset.get('object_index')
- chapter = asset.get('title')
+ elif clazz == 'chapter':
+ chapter_number = entry.get('object_index')
+ chapter = entry.get('title')
- return self.playlist_result(entries, course_id, course_title)
+ return self.playlist_result(entries, course_id, title)
diff --git a/youtube_dl/extractor/udn.py b/youtube_dl/extractor/udn.py
index ee35b72..57dd73a 100644
--- a/youtube_dl/extractor/udn.py
+++ b/youtube_dl/extractor/udn.py
@@ -2,10 +2,13 @@
from __future__ import unicode_literals
import json
+import re
+
from .common import InfoExtractor
from ..utils import (
+ determine_ext,
+ int_or_none,
js_to_json,
- ExtractorError,
)
from ..compat import compat_urlparse
@@ -16,13 +19,16 @@ class UDNEmbedIE(InfoExtractor):
_VALID_URL = r'https?:' + _PROTOCOL_RELATIVE_VALID_URL
_TESTS = [{
'url': 'http://video.udn.com/embed/news/300040',
- 'md5': 'de06b4c90b042c128395a88f0384817e',
'info_dict': {
'id': '300040',
'ext': 'mp4',
'title': '生物老師男變女 全校挺"做自己"',
'thumbnail': 're:^https?://.*\.jpg$',
- }
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
}, {
'url': 'https://video.udn.com/embed/news/300040',
'only_matching': True,
@@ -38,39 +44,53 @@ class UDNEmbedIE(InfoExtractor):
page = self._download_webpage(url, video_id)
options = json.loads(js_to_json(self._html_search_regex(
- r'var options\s*=\s*([^;]+);', page, 'video urls dictionary')))
+ r'var\s+options\s*=\s*([^;]+);', page, 'video urls dictionary')))
video_urls = options['video']
if video_urls.get('youtube'):
return self.url_result(video_urls.get('youtube'), 'Youtube')
- try:
- del video_urls['youtube']
- except KeyError:
- pass
+ formats = []
+ for video_type, api_url in video_urls.items():
+ if not api_url:
+ continue
- formats = [{
- 'url': self._download_webpage(
+ video_url = self._download_webpage(
compat_urlparse.urljoin(url, api_url), video_id,
- 'retrieve url for %s video' % video_type),
- 'format_id': video_type,
- 'preference': 0 if video_type == 'mp4' else -1,
- } for video_type, api_url in video_urls.items() if api_url]
+ note='retrieve url for %s video' % video_type)
- if not formats:
- raise ExtractorError('No videos found', expected=True)
+ ext = determine_ext(video_url)
+ if ext == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ video_url, video_id, ext='mp4', m3u8_id='hls'))
+ elif ext == 'f4m':
+ formats.extend(self._extract_f4m_formats(
+ video_url, video_id, f4m_id='hds'))
+ else:
+ mobj = re.search(r'_(?P<height>\d+)p_(?P<tbr>\d+).mp4', video_url)
+ a_format = {
+ 'url': video_url,
+ # video_type may be 'mp4', which confuses YoutubeDL
+ 'format_id': 'http-' + video_type,
+ }
+ if mobj:
+ a_format.update({
+ 'height': int_or_none(mobj.group('height')),
+ 'tbr': int_or_none(mobj.group('tbr')),
+ })
+ formats.append(a_format)
self._sort_formats(formats)
- thumbnail = None
-
- if options.get('gallery') and len(options['gallery']):
- thumbnail = options['gallery'][0].get('original')
+ thumbnails = [{
+ 'url': img_url,
+ 'id': img_type,
+ } for img_type, img_url in options.get('gallery', [{}])[0].items() if img_url]
return {
'id': video_id,
'formats': formats,
'title': options['title'],
- 'thumbnail': thumbnail
+ 'thumbnails': thumbnails,
}
diff --git a/youtube_dl/extractor/unistra.py b/youtube_dl/extractor/unistra.py
index 594bee4..a724cdb 100644
--- a/youtube_dl/extractor/unistra.py
+++ b/youtube_dl/extractor/unistra.py
@@ -7,7 +7,7 @@ from ..utils import qualities
class UnistraIE(InfoExtractor):
- _VALID_URL = r'http://utv\.unistra\.fr/(?:index|video)\.php\?id_video\=(?P<id>\d+)'
+ _VALID_URL = r'https?://utv\.unistra\.fr/(?:index|video)\.php\?id_video\=(?P<id>\d+)'
_TESTS = [
{
@@ -49,6 +49,7 @@ class UnistraIE(InfoExtractor):
'format_id': format_id,
'quality': quality(format_id)
})
+ self._sort_formats(formats)
title = self._html_search_regex(
r'<title>UTV - (.*?)</', webpage, 'title')
diff --git a/youtube_dl/extractor/usatoday.py b/youtube_dl/extractor/usatoday.py
new file mode 100644
index 0000000..e5678dc
--- /dev/null
+++ b/youtube_dl/extractor/usatoday.py
@@ -0,0 +1,48 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ get_element_by_attribute,
+ parse_duration,
+ update_url_query,
+ ExtractorError,
+)
+from ..compat import compat_str
+
+
+class USATodayIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?usatoday\.com/(?:[^/]+/)*(?P<id>[^?/#]+)'
+ _TEST = {
+ 'url': 'http://www.usatoday.com/media/cinematic/video/81729424/us-france-warn-syrian-regime-ahead-of-new-peace-talks/',
+ 'md5': '4d40974481fa3475f8bccfd20c5361f8',
+ 'info_dict': {
+ 'id': '81729424',
+ 'ext': 'mp4',
+ 'title': 'US, France warn Syrian regime ahead of new peace talks',
+ 'timestamp': 1457891045,
+ 'description': 'md5:7e50464fdf2126b0f533748d3c78d58f',
+ 'uploader_id': '29906170001',
+ 'upload_date': '20160313',
+ }
+ }
+ BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/29906170001/38a9eecc-bdd8-42a3-ba14-95397e48b3f8_default/index.html?videoId=%s'
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(update_url_query(url, {'ajax': 'true'}), display_id)
+ ui_video_data = get_element_by_attribute('class', 'ui-video-data', webpage)
+ if not ui_video_data:
+ raise ExtractorError('no video on the webpage', expected=True)
+ video_data = self._parse_json(ui_video_data, display_id)
+
+ return {
+ '_type': 'url_transparent',
+ 'url': self.BRIGHTCOVE_URL_TEMPLATE % video_data['brightcove_id'],
+ 'id': compat_str(video_data['id']),
+ 'title': video_data['title'],
+ 'thumbnail': video_data.get('thumbnail'),
+ 'description': video_data.get('description'),
+ 'duration': parse_duration(video_data.get('length')),
+ 'ie_key': 'BrightcoveNew',
+ }
diff --git a/youtube_dl/extractor/ustream.py b/youtube_dl/extractor/ustream.py
index b5fe753..54605d8 100644
--- a/youtube_dl/extractor/ustream.py
+++ b/youtube_dl/extractor/ustream.py
@@ -41,6 +41,12 @@ class UstreamIE(InfoExtractor):
'uploader': 'sportscanadatv',
},
'skip': 'This Pro Broadcaster has chosen to remove this video from the ustream.tv site.',
+ }, {
+ 'url': 'http://www.ustream.tv/embed/10299409',
+ 'info_dict': {
+ 'id': '10299409',
+ },
+ 'playlist_count': 3,
}]
def _real_extract(self, url):
@@ -55,10 +61,12 @@ class UstreamIE(InfoExtractor):
if m.group('type') == 'embed':
video_id = m.group('id')
webpage = self._download_webpage(url, video_id)
- desktop_video_id = self._html_search_regex(
- r'ContentVideoIds=\["([^"]*?)"\]', webpage, 'desktop_video_id')
- desktop_url = 'http://www.ustream.tv/recorded/' + desktop_video_id
- return self.url_result(desktop_url, 'Ustream')
+ content_video_ids = self._parse_json(self._search_regex(
+ r'ustream\.vars\.offAirContentVideoIds=([^;]+);', webpage,
+ 'content video IDs'), video_id)
+ return self.playlist_result(
+ map(lambda u: self.url_result('http://www.ustream.tv/recorded/' + u, 'Ustream'), content_video_ids),
+ video_id)
params = self._download_json(
'https://api.ustream.tv/videos/%s.json' % video_id, video_id)
diff --git a/youtube_dl/extractor/ustudio.py b/youtube_dl/extractor/ustudio.py
new file mode 100644
index 0000000..3484a20
--- /dev/null
+++ b/youtube_dl/extractor/ustudio.py
@@ -0,0 +1,125 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ int_or_none,
+ unified_strdate,
+ unescapeHTML,
+)
+
+
+class UstudioIE(InfoExtractor):
+ IE_NAME = 'ustudio'
+ _VALID_URL = r'https?://(?:(?:www|v1)\.)?ustudio\.com/video/(?P<id>[^/]+)/(?P<display_id>[^/?#&]+)'
+ _TEST = {
+ 'url': 'http://ustudio.com/video/Uxu2my9bgSph/san_francisco_golden_gate_bridge',
+ 'md5': '58bbfca62125378742df01fc2abbdef6',
+ 'info_dict': {
+ 'id': 'Uxu2my9bgSph',
+ 'display_id': 'san_francisco_golden_gate_bridge',
+ 'ext': 'mp4',
+ 'title': 'San Francisco: Golden Gate Bridge',
+ 'description': 'md5:23925500697f2c6d4830e387ba51a9be',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'upload_date': '20111107',
+ 'uploader': 'Tony Farley',
+ }
+ }
+
+ def _real_extract(self, url):
+ video_id, display_id = re.match(self._VALID_URL, url).groups()
+
+ config = self._download_xml(
+ 'http://v1.ustudio.com/embed/%s/ustudio/config.xml' % video_id,
+ display_id)
+
+ def extract(kind):
+ return [{
+ 'url': unescapeHTML(item.attrib['url']),
+ 'width': int_or_none(item.get('width')),
+ 'height': int_or_none(item.get('height')),
+ } for item in config.findall('./qualities/quality/%s' % kind) if item.get('url')]
+
+ formats = extract('video')
+ self._sort_formats(formats)
+
+ webpage = self._download_webpage(url, display_id)
+
+ title = self._og_search_title(webpage)
+ upload_date = unified_strdate(self._search_regex(
+ r'(?s)Uploaded by\s*.+?\s*on\s*<span>([^<]+)</span>',
+ webpage, 'upload date', fatal=False))
+ uploader = self._search_regex(
+ r'Uploaded by\s*<a[^>]*>([^<]+)<',
+ webpage, 'uploader', fatal=False)
+
+ return {
+ 'id': video_id,
+ 'display_id': display_id,
+ 'title': title,
+ 'description': self._og_search_description(webpage),
+ 'thumbnails': extract('image'),
+ 'upload_date': upload_date,
+ 'uploader': uploader,
+ 'formats': formats,
+ }
+
+
+class UstudioEmbedIE(InfoExtractor):
+ IE_NAME = 'ustudio:embed'
+ _VALID_URL = r'https?://(?:(?:app|embed)\.)?ustudio\.com/embed/(?P<uid>[^/]+)/(?P<id>[^/]+)'
+ _TEST = {
+ 'url': 'http://app.ustudio.com/embed/DeN7VdYRDKhP/Uw7G1kMCe65T',
+ 'md5': '47c0be52a09b23a7f40de9469cec58f4',
+ 'info_dict': {
+ 'id': 'Uw7G1kMCe65T',
+ 'ext': 'mp4',
+ 'title': '5 Things IT Should Know About Video',
+ 'description': 'md5:93d32650884b500115e158c5677d25ad',
+ 'uploader_id': 'DeN7VdYRDKhP',
+ }
+ }
+
+ def _real_extract(self, url):
+ uploader_id, video_id = re.match(self._VALID_URL, url).groups()
+ video_data = self._download_json(
+ 'http://app.ustudio.com/embed/%s/%s/config.json' % (uploader_id, video_id),
+ video_id)['videos'][0]
+ title = video_data['name']
+
+ formats = []
+ for ext, qualities in video_data.get('transcodes', {}).items():
+ for quality in qualities:
+ quality_url = quality.get('url')
+ if not quality_url:
+ continue
+ height = int_or_none(quality.get('height'))
+ formats.append({
+ 'format_id': '%s-%dp' % (ext, height) if height else ext,
+ 'url': quality_url,
+ 'width': int_or_none(quality.get('width')),
+ 'height': height,
+ })
+ self._sort_formats(formats)
+
+ thumbnails = []
+ for image in video_data.get('images', []):
+ image_url = image.get('url')
+ if not image_url:
+ continue
+ thumbnails.append({
+ 'url': image_url,
+ })
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': video_data.get('description'),
+ 'duration': int_or_none(video_data.get('duration')),
+ 'uploader_id': uploader_id,
+ 'tags': video_data.get('keywords'),
+ 'thumbnails': thumbnails,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/varzesh3.py b/youtube_dl/extractor/varzesh3.py
index 9369aba..8469837 100644
--- a/youtube_dl/extractor/varzesh3.py
+++ b/youtube_dl/extractor/varzesh3.py
@@ -2,11 +2,19 @@
from __future__ import unicode_literals
from .common import InfoExtractor
+from ..compat import (
+ compat_urllib_parse_urlparse,
+ compat_parse_qs,
+)
+from ..utils import (
+ clean_html,
+ remove_start,
+)
class Varzesh3IE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?video\.varzesh3\.com/(?:[^/]+/)+(?P<id>[^/]+)/?'
- _TEST = {
+ _TESTS = [{
'url': 'http://video.varzesh3.com/germany/bundesliga/5-%D9%88%D8%A7%DA%A9%D9%86%D8%B4-%D8%A8%D8%B1%D8%AA%D8%B1-%D8%AF%D8%B1%D9%88%D8%A7%D8%B2%D9%87%E2%80%8C%D8%A8%D8%A7%D9%86%D8%A7%D9%86%D8%9B%D9%87%D9%81%D8%AA%D9%87-26-%D8%A8%D9%88%D9%86%D8%AF%D8%B3/',
'md5': '2a933874cb7dce4366075281eb49e855',
'info_dict': {
@@ -15,8 +23,19 @@ class Varzesh3IE(InfoExtractor):
'title': '۵ واکنش برتر دروازه‌بانان؛هفته ۲۶ بوندسلیگا',
'description': 'فصل ۲۰۱۵-۲۰۱۴',
'thumbnail': 're:^https?://.*\.jpg$',
- }
- }
+ },
+ 'skip': 'HTTP 404 Error',
+ }, {
+ 'url': 'http://video.varzesh3.com/video/112785/%D8%AF%D9%84%D9%87-%D8%B9%D9%84%DB%8C%D8%9B-%D8%B3%D8%AA%D8%A7%D8%B1%D9%87-%D9%86%D9%88%D8%B8%D9%87%D9%88%D8%B1-%D9%84%DB%8C%DA%AF-%D8%A8%D8%B1%D8%AA%D8%B1-%D8%AC%D8%B2%DB%8C%D8%B1%D9%87',
+ 'md5': '841b7cd3afbc76e61708d94e53a4a4e7',
+ 'info_dict': {
+ 'id': '112785',
+ 'ext': 'mp4',
+ 'title': 'دله علی؛ ستاره نوظهور لیگ برتر جزیره',
+ 'description': 'فوتبال 120',
+ },
+ 'expected_warnings': ['description'],
+ }]
def _real_extract(self, url):
display_id = self._match_id(url)
@@ -26,15 +45,30 @@ class Varzesh3IE(InfoExtractor):
video_url = self._search_regex(
r'<source[^>]+src="([^"]+)"', webpage, 'video url')
- title = self._og_search_title(webpage)
+ title = remove_start(self._html_search_regex(
+ r'<title>([^<]+)</title>', webpage, 'title'), 'ویدیو ورزش 3 | ')
+
description = self._html_search_regex(
r'(?s)<div class="matn">(.+?)</div>',
- webpage, 'description', fatal=False)
- thumbnail = self._og_search_thumbnail(webpage)
+ webpage, 'description', default=None)
+ if description is None:
+ description = clean_html(self._html_search_meta('description', webpage))
+
+ thumbnail = self._og_search_thumbnail(webpage, default=None)
+ if thumbnail is None:
+ fb_sharer_url = self._search_regex(
+ r'<a[^>]+href="(https?://www\.facebook\.com/sharer/sharer\.php?[^"]+)"',
+ webpage, 'facebook sharer URL', fatal=False)
+ sharer_params = compat_parse_qs(compat_urllib_parse_urlparse(fb_sharer_url).query)
+ thumbnail = sharer_params.get('p[images][0]', [None])[0]
video_id = self._search_regex(
r"<link[^>]+rel='(?:canonical|shortlink)'[^>]+href='/\?p=([^']+)'",
- webpage, display_id, default=display_id)
+ webpage, display_id, default=None)
+ if video_id is None:
+ video_id = self._search_regex(
+ 'var\s+VideoId\s*=\s*(\d+);', webpage, 'video id',
+ default=display_id)
return {
'url': video_url,
diff --git a/youtube_dl/extractor/vbox7.py b/youtube_dl/extractor/vbox7.py
index 3794bcd..dff1bb7 100644
--- a/youtube_dl/extractor/vbox7.py
+++ b/youtube_dl/extractor/vbox7.py
@@ -2,18 +2,16 @@
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..compat import (
- compat_urllib_parse,
- compat_urlparse,
-)
+from ..compat import compat_urlparse
from ..utils import (
ExtractorError,
sanitized_Request,
+ urlencode_postdata,
)
class Vbox7IE(InfoExtractor):
- _VALID_URL = r'http://(?:www\.)?vbox7\.com/play:(?P<id>[^/]+)'
+ _VALID_URL = r'https?://(?:www\.)?vbox7\.com/play:(?P<id>[^/]+)'
_TEST = {
'url': 'http://vbox7.com/play:249bb972c2',
'md5': '99f65c0c9ef9b682b97313e052734c3f',
@@ -48,7 +46,7 @@ class Vbox7IE(InfoExtractor):
webpage, 'title').split('/')[0].strip()
info_url = 'http://vbox7.com/play/magare.do'
- data = compat_urllib_parse.urlencode({'as3': '1', 'vid': video_id})
+ data = urlencode_postdata({'as3': '1', 'vid': video_id})
info_request = sanitized_Request(info_url, data)
info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
info_response = self._download_webpage(info_request, video_id, 'Downloading info webpage')
diff --git a/youtube_dl/extractor/veoh.py b/youtube_dl/extractor/veoh.py
index 9633f7f..0f5d687 100644
--- a/youtube_dl/extractor/veoh.py
+++ b/youtube_dl/extractor/veoh.py
@@ -12,7 +12,7 @@ from ..utils import (
class VeohIE(InfoExtractor):
- _VALID_URL = r'http://(?:www\.)?veoh\.com/(?:watch|iphone/#_Watch)/(?P<id>(?:v|yapi-)[\da-zA-Z]+)'
+ _VALID_URL = r'https?://(?:www\.)?veoh\.com/(?:watch|iphone/#_Watch)/(?P<id>(?:v|yapi-)[\da-zA-Z]+)'
_TESTS = [
{
@@ -37,6 +37,7 @@ class VeohIE(InfoExtractor):
'uploader': 'afp-news',
'duration': 123,
},
+ 'skip': 'This video has been deleted.',
},
{
'url': 'http://www.veoh.com/watch/v69525809F6Nc4frX',
diff --git a/youtube_dl/extractor/vessel.py b/youtube_dl/extractor/vessel.py
index 1a0ff33..2cd617b 100644
--- a/youtube_dl/extractor/vessel.py
+++ b/youtube_dl/extractor/vessel.py
@@ -2,6 +2,7 @@
from __future__ import unicode_literals
import json
+import re
from .common import InfoExtractor
from ..utils import (
@@ -12,11 +13,11 @@ from ..utils import (
class VesselIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?vessel\.com/videos/(?P<id>[0-9a-zA-Z]+)'
+ _VALID_URL = r'https?://(?:www\.)?vessel\.com/(?:videos|embed)/(?P<id>[0-9a-zA-Z]+)'
_API_URL_TEMPLATE = 'https://www.vessel.com/api/view/items/%s'
_LOGIN_URL = 'https://www.vessel.com/api/account/login'
_NETRC_MACHINE = 'vessel'
- _TEST = {
+ _TESTS = [{
'url': 'https://www.vessel.com/videos/HDN7G5UMs',
'md5': '455cdf8beb71c6dd797fd2f3818d05c4',
'info_dict': {
@@ -28,7 +29,16 @@ class VesselIE(InfoExtractor):
'description': 'Did Nvidia pull out all the stops on the Titan X, or does its performance leave something to be desired?',
'timestamp': int,
},
- }
+ }, {
+ 'url': 'https://www.vessel.com/embed/G4U7gUJ6a?w=615&h=346',
+ 'only_matching': True,
+ }]
+
+ @staticmethod
+ def _extract_urls(webpage):
+ return [url for _, url in re.findall(
+ r'<iframe[^>]+src=(["\'])((?:https?:)?//(?:www\.)?vessel\.com/embed/[0-9a-zA-Z]+.*?)\1',
+ webpage)]
@staticmethod
def make_json_request(url, data):
@@ -98,16 +108,24 @@ class VesselIE(InfoExtractor):
formats = []
for f in video_asset.get('sources', []):
- if f['name'] == 'hls-index':
+ location = f.get('location')
+ if not location:
+ continue
+ name = f.get('name')
+ if name == 'hls-index':
formats.extend(self._extract_m3u8_formats(
- f['location'], video_id, ext='mp4', m3u8_id='m3u8'))
+ location, video_id, ext='mp4',
+ entry_protocol='m3u8_native', m3u8_id='m3u8', fatal=False))
+ elif name == 'dash-index':
+ formats.extend(self._extract_mpd_formats(
+ location, video_id, mpd_id='dash', fatal=False))
else:
formats.append({
- 'format_id': f['name'],
+ 'format_id': name,
'tbr': f.get('bitrate'),
'height': f.get('height'),
'width': f.get('width'),
- 'url': f['location'],
+ 'url': location,
})
self._sort_formats(formats)
diff --git a/youtube_dl/extractor/vesti.py b/youtube_dl/extractor/vesti.py
index a0c59a2..cb64ae0 100644
--- a/youtube_dl/extractor/vesti.py
+++ b/youtube_dl/extractor/vesti.py
@@ -10,7 +10,7 @@ from .rutv import RUTVIE
class VestiIE(InfoExtractor):
IE_DESC = 'Вести.Ru'
- _VALID_URL = r'http://(?:.+?\.)?vesti\.ru/(?P<id>.+)'
+ _VALID_URL = r'https?://(?:.+?\.)?vesti\.ru/(?P<id>.+)'
_TESTS = [
{
diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py
index 152fef4..388b4de 100644
--- a/youtube_dl/extractor/vevo.py
+++ b/youtube_dl/extractor/vevo.py
@@ -3,7 +3,11 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..compat import compat_etree_fromstring
+from ..compat import (
+ compat_etree_fromstring,
+ compat_str,
+ compat_urlparse,
+)
from ..utils import (
ExtractorError,
int_or_none,
@@ -12,13 +16,22 @@ from ..utils import (
)
-class VevoIE(InfoExtractor):
+class VevoBaseIE(InfoExtractor):
+ def _extract_json(self, webpage, video_id, item):
+ return self._parse_json(
+ self._search_regex(
+ r'window\.__INITIAL_STORE__\s*=\s*({.+?});\s*</script>',
+ webpage, 'initial store'),
+ video_id)['default'][item]
+
+
+class VevoIE(VevoBaseIE):
'''
Accepts urls from vevo.com or in the format 'vevo:{id}'
(currently used by MTVIE and MySpaceIE)
'''
_VALID_URL = r'''(?x)
- (?:https?://www\.vevo\.com/watch/(?:[^/]+/(?:[^/]+/)?)?|
+ (?:https?://www\.vevo\.com/watch/(?!playlist|genre)(?:[^/]+/(?:[^/]+/)?)?|
https?://cache\.vevo\.com/m/html/embed\.html\?video=|
https?://videoplayer\.vevo\.com/embed/embedded\?videoId=|
vevo:)
@@ -30,11 +43,15 @@ class VevoIE(InfoExtractor):
'info_dict': {
'id': 'GB1101300280',
'ext': 'mp4',
- 'title': 'Somebody to Die For',
+ 'title': 'Hurts - Somebody to Die For',
+ 'timestamp': 1372057200,
'upload_date': '20130624',
'uploader': 'Hurts',
- 'timestamp': 1372057200,
+ 'track': 'Somebody to Die For',
+ 'artist': 'Hurts',
+ 'genre': 'Pop',
},
+ 'expected_warnings': ['Unable to download SMIL file'],
}, {
'note': 'v3 SMIL format',
'url': 'http://www.vevo.com/watch/cassadee-pope/i-wish-i-could-break-your-heart/USUV71302923',
@@ -42,23 +59,31 @@ class VevoIE(InfoExtractor):
'info_dict': {
'id': 'USUV71302923',
'ext': 'mp4',
- 'title': 'I Wish I Could Break Your Heart',
+ 'title': 'Cassadee Pope - I Wish I Could Break Your Heart',
+ 'timestamp': 1392796919,
'upload_date': '20140219',
'uploader': 'Cassadee Pope',
- 'timestamp': 1392796919,
+ 'track': 'I Wish I Could Break Your Heart',
+ 'artist': 'Cassadee Pope',
+ 'genre': 'Country',
},
+ 'expected_warnings': ['Unable to download SMIL file'],
}, {
'note': 'Age-limited video',
'url': 'https://www.vevo.com/watch/justin-timberlake/tunnel-vision-explicit/USRV81300282',
'info_dict': {
'id': 'USRV81300282',
'ext': 'mp4',
- 'title': 'Tunnel Vision (Explicit)',
- 'upload_date': '20130703',
+ 'title': 'Justin Timberlake - Tunnel Vision (Explicit)',
'age_limit': 18,
- 'uploader': 'Justin Timberlake',
'timestamp': 1372888800,
+ 'upload_date': '20130703',
+ 'uploader': 'Justin Timberlake',
+ 'track': 'Tunnel Vision (Explicit)',
+ 'artist': 'Justin Timberlake',
+ 'genre': 'Pop',
},
+ 'expected_warnings': ['Unable to download SMIL file'],
}, {
'note': 'No video_info',
'url': 'http://www.vevo.com/watch/k-camp-1/Till-I-Die/USUV71503000',
@@ -66,12 +91,36 @@ class VevoIE(InfoExtractor):
'info_dict': {
'id': 'USUV71503000',
'ext': 'mp4',
- 'title': 'Till I Die',
- 'upload_date': '20151207',
+ 'title': 'K Camp - Till I Die',
'age_limit': 18,
- 'uploader': 'K Camp',
'timestamp': 1449468000,
+ 'upload_date': '20151207',
+ 'uploader': 'K Camp',
+ 'track': 'Till I Die',
+ 'artist': 'K Camp',
+ 'genre': 'Rap/Hip-Hop',
+ },
+ }, {
+ 'note': 'Only available via webpage',
+ 'url': 'http://www.vevo.com/watch/GBUV71600656',
+ 'md5': '67e79210613865b66a47c33baa5e37fe',
+ 'info_dict': {
+ 'id': 'GBUV71600656',
+ 'ext': 'mp4',
+ 'title': 'ABC - Viva Love',
+ 'age_limit': 0,
+ 'timestamp': 1461830400,
+ 'upload_date': '20160428',
+ 'uploader': 'ABC',
+ 'track': 'Viva Love',
+ 'artist': 'ABC',
+ 'genre': 'Pop',
},
+ 'expected_warnings': ['Failed to download video versions info'],
+ }, {
+ # no genres available
+ 'url': 'http://www.vevo.com/watch/INS171400764',
+ 'only_matching': True,
}]
_SMIL_BASE_URL = 'http://smil.lvl3.vevo.com'
_SOURCE_TYPES = {
@@ -140,42 +189,41 @@ class VevoIE(InfoExtractor):
errnote='Unable to retrieve oauth token')
if 'THIS PAGE IS CURRENTLY UNAVAILABLE IN YOUR REGION' in webpage:
- raise ExtractorError(
- '%s said: This page is currently unavailable in your region.' % self.IE_NAME, expected=True)
+ self.raise_geo_restricted(
+ '%s said: This page is currently unavailable in your region' % self.IE_NAME)
auth_info = self._parse_json(webpage, video_id)
self._api_url_template = self.http_scheme() + '//apiv2.vevo.com/%s?token=' + auth_info['access_token']
- def _call_api(self, path, video_id, note, errnote, fatal=True):
- return self._download_json(self._api_url_template % path, video_id, note, errnote)
+ def _call_api(self, path, *args, **kwargs):
+ return self._download_json(self._api_url_template % path, *args, **kwargs)
def _real_extract(self, url):
video_id = self._match_id(url)
- json_url = 'http://videoplayer.vevo.com/VideoService/AuthenticateVideo?isrc=%s' % video_id
+ json_url = 'http://api.vevo.com/VideoService/AuthenticateVideo?isrc=%s' % video_id
response = self._download_json(
- json_url, video_id, 'Downloading video info', 'Unable to download info')
+ json_url, video_id, 'Downloading video info',
+ 'Unable to download info', fatal=False) or {}
video_info = response.get('video') or {}
- video_versions = video_info.get('videoVersions')
+ artist = None
+ featured_artist = None
uploader = None
- timestamp = None
view_count = None
formats = []
if not video_info:
- if response.get('statusCode') != 909:
+ try:
+ self._initialize_api(video_id)
+ except ExtractorError:
ytid = response.get('errorInfo', {}).get('ytid')
if ytid:
self.report_warning(
'Video is geoblocked, trying with the YouTube video %s' % ytid)
return self.url_result(ytid, 'Youtube', ytid)
- if 'statusMessage' in response:
- raise ExtractorError('%s said: %s' % (
- self.IE_NAME, response['statusMessage']), expected=True)
- raise ExtractorError('Unable to extract videos')
+ raise
- self._initialize_api(video_id)
video_info = self._call_api(
'video/%s' % video_id, video_id, 'Downloading api video info',
'Failed to download video info')
@@ -183,12 +231,19 @@ class VevoIE(InfoExtractor):
video_versions = self._call_api(
'video/%s/streams' % video_id, video_id,
'Downloading video versions info',
- 'Failed to download video versions info')
+ 'Failed to download video versions info',
+ fatal=False)
+
+ # Some videos are only available via webpage (e.g.
+ # https://github.com/rg3/youtube-dl/issues/9366)
+ if not video_versions:
+ webpage = self._download_webpage(url, video_id)
+ video_versions = self._extract_json(webpage, video_id, 'streams')[video_id][0]
timestamp = parse_iso8601(video_info.get('releaseDate'))
artists = video_info.get('artists')
if artists:
- uploader = artists[0]['name']
+ artist = uploader = artists[0]['name']
view_count = int_or_none(video_info.get('views', {}).get('total'))
for video_version in video_versions:
@@ -241,7 +296,11 @@ class VevoIE(InfoExtractor):
scale=1000)
artists = video_info.get('mainArtists')
if artists:
- uploader = artists[0]['artistName']
+ artist = uploader = artists[0]['artistName']
+
+ featured_artists = video_info.get('featuredArtists')
+ if featured_artists:
+ featured_artist = featured_artists[0]['artistName']
smil_parsed = False
for video_version in video_info['videoVersions']:
@@ -278,7 +337,15 @@ class VevoIE(InfoExtractor):
smil_parsed = True
self._sort_formats(formats)
- title = video_info['title']
+ track = video_info['title']
+ if featured_artist:
+ artist = '%s ft. %s' % (artist, featured_artist)
+ title = '%s - %s' % (artist, track) if artist else track
+
+ genres = video_info.get('genres')
+ genre = (
+ genres[0] if genres and isinstance(genres, list) and
+ isinstance(genres[0], compat_str) else None)
is_explicit = video_info.get('isExplicit')
if is_explicit is True:
@@ -300,4 +367,75 @@ class VevoIE(InfoExtractor):
'duration': duration,
'view_count': view_count,
'age_limit': age_limit,
+ 'track': track,
+ 'artist': uploader,
+ 'genre': genre,
}
+
+
+class VevoPlaylistIE(VevoBaseIE):
+ _VALID_URL = r'https?://www\.vevo\.com/watch/(?P<kind>playlist|genre)/(?P<id>[^/?#&]+)'
+
+ _TESTS = [{
+ 'url': 'http://www.vevo.com/watch/playlist/dadbf4e7-b99f-4184-9670-6f0e547b6a29',
+ 'info_dict': {
+ 'id': 'dadbf4e7-b99f-4184-9670-6f0e547b6a29',
+ 'title': 'Best-Of: Birdman',
+ },
+ 'playlist_count': 10,
+ }, {
+ 'url': 'http://www.vevo.com/watch/genre/rock',
+ 'info_dict': {
+ 'id': 'rock',
+ 'title': 'Rock',
+ },
+ 'playlist_count': 20,
+ }, {
+ 'url': 'http://www.vevo.com/watch/playlist/dadbf4e7-b99f-4184-9670-6f0e547b6a29?index=0',
+ 'md5': '32dcdfddddf9ec6917fc88ca26d36282',
+ 'info_dict': {
+ 'id': 'USCMV1100073',
+ 'ext': 'mp4',
+ 'title': 'Birdman - Y.U. MAD',
+ 'timestamp': 1323417600,
+ 'upload_date': '20111209',
+ 'uploader': 'Birdman',
+ 'track': 'Y.U. MAD',
+ 'artist': 'Birdman',
+ 'genre': 'Rap/Hip-Hop',
+ },
+ 'expected_warnings': ['Unable to download SMIL file'],
+ }, {
+ 'url': 'http://www.vevo.com/watch/genre/rock?index=0',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ playlist_id = mobj.group('id')
+ playlist_kind = mobj.group('kind')
+
+ webpage = self._download_webpage(url, playlist_id)
+
+ qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
+ index = qs.get('index', [None])[0]
+
+ if index:
+ video_id = self._search_regex(
+ r'<meta[^>]+content=(["\'])vevo://video/(?P<id>.+?)\1[^>]*>',
+ webpage, 'video id', default=None, group='id')
+ if video_id:
+ return self.url_result('vevo:%s' % video_id, VevoIE.ie_key())
+
+ playlists = self._extract_json(webpage, playlist_id, '%ss' % playlist_kind)
+
+ playlist = (list(playlists.values())[0]
+ if playlist_kind == 'playlist' else playlists[playlist_id])
+
+ entries = [
+ self.url_result('vevo:%s' % src, VevoIE.ie_key())
+ for src in playlist['isrcs']]
+
+ return self.playlist_result(
+ entries, playlist.get('playlistId') or playlist_id,
+ playlist.get('name'), playlist.get('description'))
diff --git a/youtube_dl/extractor/vgtv.py b/youtube_dl/extractor/vgtv.py
index 14e945d..b11cd25 100644
--- a/youtube_dl/extractor/vgtv.py
+++ b/youtube_dl/extractor/vgtv.py
@@ -20,6 +20,7 @@ class VGTVIE(XstreamIE):
'aftenbladet.no/tv': 'satv',
'fvn.no/fvntv': 'fvntv',
'aftenposten.no/webtv': 'aptv',
+ 'ap.vgtv.no/webtv': 'aptv',
}
_APP_NAME_TO_VENDOR = {
@@ -35,7 +36,7 @@ class VGTVIE(XstreamIE):
(?P<host>
%s
)
- /
+ /?
(?:
\#!/(?:video|live)/|
embed?.*id=
@@ -107,19 +108,27 @@ class VGTVIE(XstreamIE):
'md5': 'fd828cd29774a729bf4d4425fe192972',
'info_dict': {
'id': '21039',
- 'ext': 'mov',
+ 'ext': 'mp4',
'title': 'TRAILER: «SWEATSHOP» - I can´t take any more',
'description': 'md5:21891f2b0dd7ec2f78d84a50e54f8238',
'duration': 66,
'timestamp': 1417002452,
'upload_date': '20141126',
'view_count': int,
- }
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
},
{
'url': 'http://www.bt.no/tv/#!/video/100250/norling-dette-er-forskjellen-paa-1-divisjon-og-eliteserien',
'only_matching': True,
},
+ {
+ 'url': 'http://ap.vgtv.no/webtv#!/video/111084/de-nye-bysyklene-lettere-bedre-gir-stoerre-hjul-og-feste-til-mobil',
+ 'only_matching': True,
+ },
]
def _real_extract(self, url):
@@ -144,8 +153,6 @@ class VGTVIE(XstreamIE):
if len(video_id) == 5:
if appname == 'bttv':
info = self._extract_video_info('btno', video_id)
- elif appname == 'aptv':
- info = self._extract_video_info('ap', video_id)
streams = data['streamUrls']
stream_type = data.get('streamType')
@@ -207,7 +214,7 @@ class VGTVIE(XstreamIE):
class BTArticleIE(InfoExtractor):
IE_NAME = 'bt:article'
IE_DESC = 'Bergens Tidende Articles'
- _VALID_URL = 'http://(?:www\.)?bt\.no/(?:[^/]+/)+(?P<id>[^/]+)-\d+\.html'
+ _VALID_URL = r'https?://(?:www\.)?bt\.no/(?:[^/]+/)+(?P<id>[^/]+)-\d+\.html'
_TEST = {
'url': 'http://www.bt.no/nyheter/lokalt/Kjemper-for-internatet-1788214.html',
'md5': '2acbe8ad129b3469d5ae51b1158878df',
@@ -234,7 +241,7 @@ class BTArticleIE(InfoExtractor):
class BTVestlendingenIE(InfoExtractor):
IE_NAME = 'bt:vestlendingen'
IE_DESC = 'Bergens Tidende - Vestlendingen'
- _VALID_URL = 'http://(?:www\.)?bt\.no/spesial/vestlendingen/#!/(?P<id>\d+)'
+ _VALID_URL = r'https?://(?:www\.)?bt\.no/spesial/vestlendingen/#!/(?P<id>\d+)'
_TESTS = [{
'url': 'http://www.bt.no/spesial/vestlendingen/#!/86588',
'md5': 'd7d17e3337dc80de6d3a540aefbe441b',
diff --git a/youtube_dl/extractor/vice.py b/youtube_dl/extractor/vice.py
index 3db6286..e2b2ce0 100644
--- a/youtube_dl/extractor/vice.py
+++ b/youtube_dl/extractor/vice.py
@@ -1,31 +1,47 @@
from __future__ import unicode_literals
+import re
+
from .common import InfoExtractor
-from .ooyala import OoyalaIE
from ..utils import ExtractorError
class ViceIE(InfoExtractor):
- _VALID_URL = r'https?://(?:.+?\.)?vice\.com/(?:[^/]+/)+(?P<id>.+)'
-
- _TESTS = [
- {
- 'url': 'http://www.vice.com/Fringes/cowboy-capitalists-part-1',
- 'info_dict': {
- 'id': '43cW1mYzpia9IlestBjVpd23Yu3afAfp',
- 'ext': 'mp4',
- 'title': 'VICE_COWBOYCAPITALISTS_PART01_v1_VICE_WM_1080p.mov',
- 'duration': 725.983,
- },
- 'params': {
- # Requires ffmpeg (m3u8 manifest)
- 'skip_download': True,
- },
- }, {
- 'url': 'https://news.vice.com/video/experimenting-on-animals-inside-the-monkey-lab',
- 'only_matching': True,
- }
- ]
+ _VALID_URL = r'https?://(?:.+?\.)?vice\.com/(?:[^/]+/)?videos?/(?P<id>[^/?#&]+)'
+
+ _TESTS = [{
+ 'url': 'http://www.vice.com/video/cowboy-capitalists-part-1',
+ 'md5': 'e9d77741f9e42ba583e683cd170660f7',
+ 'info_dict': {
+ 'id': '43cW1mYzpia9IlestBjVpd23Yu3afAfp',
+ 'ext': 'flv',
+ 'title': 'VICE_COWBOYCAPITALISTS_PART01_v1_VICE_WM_1080p.mov',
+ 'duration': 725.983,
+ },
+ 'add_ie': ['Ooyala'],
+ }, {
+ 'url': 'http://www.vice.com/video/how-to-hack-a-car',
+ 'md5': '6fb2989a3fed069fb8eab3401fc2d3c9',
+ 'info_dict': {
+ 'id': '3jstaBeXgAs',
+ 'ext': 'mp4',
+ 'title': 'How to Hack a Car: Phreaked Out (Episode 2)',
+ 'description': 'md5:ee95453f7ff495db8efe14ae8bf56f30',
+ 'uploader_id': 'MotherboardTV',
+ 'uploader': 'Motherboard',
+ 'upload_date': '20140529',
+ },
+ 'add_ie': ['Youtube'],
+ }, {
+ 'url': 'https://news.vice.com/video/experimenting-on-animals-inside-the-monkey-lab',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.vice.com/ru/video/big-night-out-ibiza-clive-martin-229',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://munchies.vice.com/en/videos/watch-the-trailer-for-our-new-series-the-pizza-show',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
video_id = self._match_id(url)
@@ -33,8 +49,43 @@ class ViceIE(InfoExtractor):
try:
embed_code = self._search_regex(
r'embedCode=([^&\'"]+)', webpage,
- 'ooyala embed code')
- ooyala_url = OoyalaIE._url_for_embed_code(embed_code)
+ 'ooyala embed code', default=None)
+ if embed_code:
+ return self.url_result('ooyala:%s' % embed_code, 'Ooyala')
+ youtube_id = self._search_regex(
+ r'data-youtube-id="([^"]+)"', webpage, 'youtube id')
+ return self.url_result(youtube_id, 'Youtube')
except ExtractorError:
raise ExtractorError('The page doesn\'t contain a video', expected=True)
- return self.url_result(ooyala_url, ie='Ooyala')
+
+
+class ViceShowIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:.+?\.)?vice\.com/(?:[^/]+/)?show/(?P<id>[^/?#&]+)'
+
+ _TEST = {
+ 'url': 'https://munchies.vice.com/en/show/fuck-thats-delicious-2',
+ 'info_dict': {
+ 'id': 'fuck-thats-delicious-2',
+ 'title': "Fuck, That's Delicious",
+ 'description': 'Follow the culinary adventures of rapper Action Bronson during his ongoing world tour.',
+ },
+ 'playlist_count': 17,
+ }
+
+ def _real_extract(self, url):
+ show_id = self._match_id(url)
+ webpage = self._download_webpage(url, show_id)
+
+ entries = [
+ self.url_result(video_url, ViceIE.ie_key())
+ for video_url, _ in re.findall(
+ r'<h2[^>]+class="article-title"[^>]+data-id="\d+"[^>]*>\s*<a[^>]+href="(%s.*?)"'
+ % ViceIE._VALID_URL, webpage)]
+
+ title = self._search_regex(
+ r'<title>(.+?)</title>', webpage, 'title', default=None)
+ if title:
+ title = re.sub(r'(.+)\s*\|\s*.+$', r'\1', title).strip()
+ description = self._html_search_meta('description', webpage, 'description')
+
+ return self.playlist_result(entries, show_id, title, description)
diff --git a/youtube_dl/extractor/viddler.py b/youtube_dl/extractor/viddler.py
index 6bfbd4d..8d92aee 100644
--- a/youtube_dl/extractor/viddler.py
+++ b/youtube_dl/extractor/viddler.py
@@ -2,7 +2,7 @@ from __future__ import unicode_literals
from .common import InfoExtractor
from ..compat import (
- compat_urllib_parse,
+ compat_urllib_parse_urlencode,
compat_urlparse,
)
from ..utils import (
@@ -93,7 +93,7 @@ class ViddlerIE(InfoExtractor):
headers = {'Referer': 'http://static.cdn-ec.viddler.com/js/arpeggio/v2/embed.html'}
request = sanitized_Request(
'http://api.viddler.com/api/v2/viddler.videos.getPlaybackDetails.json?%s'
- % compat_urllib_parse.urlencode(query), None, headers)
+ % compat_urllib_parse_urlencode(query), None, headers)
data = self._download_json(request, video_id)['video']
formats = []
diff --git a/youtube_dl/extractor/videodetective.py b/youtube_dl/extractor/videodetective.py
index 0ffc7ff..2ed5d96 100644
--- a/youtube_dl/extractor/videodetective.py
+++ b/youtube_dl/extractor/videodetective.py
@@ -14,8 +14,11 @@ class VideoDetectiveIE(InfoExtractor):
'id': '194487',
'ext': 'mp4',
'title': 'KICK-ASS 2',
- 'description': 'md5:65ba37ad619165afac7d432eaded6013',
- 'duration': 138,
+ 'description': 'md5:c189d5b7280400630a1d3dd17eaa8d8a',
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
},
}
@@ -24,4 +27,4 @@ class VideoDetectiveIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
og_video = self._og_search_video_url(webpage)
query = compat_urlparse.urlparse(og_video).query
- return self.url_result(InternetVideoArchiveIE._build_url(query), ie=InternetVideoArchiveIE.ie_key())
+ return self.url_result(InternetVideoArchiveIE._build_json_url(query), ie=InternetVideoArchiveIE.ie_key())
diff --git a/youtube_dl/extractor/videomega.py b/youtube_dl/extractor/videomega.py
index 5e2e7cb..4f0dcd1 100644
--- a/youtube_dl/extractor/videomega.py
+++ b/youtube_dl/extractor/videomega.py
@@ -4,11 +4,13 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import sanitized_Request
+from ..utils import (
+ decode_packed_codes,
+ sanitized_Request,
+)
class VideoMegaIE(InfoExtractor):
- _WORKING = False
_VALID_URL = r'(?:videomega:|https?://(?:www\.)?videomega\.tv/(?:(?:view|iframe|cdn)\.php)?\?ref=)(?P<id>[A-Za-z0-9]+)'
_TESTS = [{
'url': 'http://videomega.tv/cdn.php?ref=AOSQBJYKIDDIKYJBQSOA',
@@ -42,8 +44,10 @@ class VideoMegaIE(InfoExtractor):
r'(?:^[Vv]ideo[Mm]ega\.tv\s-\s*|\s*-\svideomega\.tv$)', '', title)
thumbnail = self._search_regex(
r'<video[^>]+?poster="([^"]+)"', webpage, 'thumbnail', fatal=False)
+
+ real_codes = decode_packed_codes(webpage)
video_url = self._search_regex(
- r'<source[^>]+?src="([^"]+)"', webpage, 'video URL')
+ r'"src"\s*,\s*"([^"]+)"', real_codes, 'video URL')
return {
'id': video_id,
diff --git a/youtube_dl/extractor/videomore.py b/youtube_dl/extractor/videomore.py
index 0bd1e1e..04e95c6 100644
--- a/youtube_dl/extractor/videomore.py
+++ b/youtube_dl/extractor/videomore.py
@@ -111,6 +111,7 @@ class VideomoreIE(InfoExtractor):
video_url = xpath_text(video, './/video_url', 'video url', fatal=True)
formats = self._extract_f4m_formats(video_url, video_id, f4m_id='hds')
+ self._sort_formats(formats)
data = self._download_json(
'http://videomore.ru/video/tracks/%s.json' % video_id,
diff --git a/youtube_dl/extractor/videott.py b/youtube_dl/extractor/videott.py
index 2cd3650..0f79871 100644
--- a/youtube_dl/extractor/videott.py
+++ b/youtube_dl/extractor/videott.py
@@ -14,7 +14,7 @@ class VideoTtIE(InfoExtractor):
_WORKING = False
ID_NAME = 'video.tt'
IE_DESC = 'video.tt - Your True Tube'
- _VALID_URL = r'http://(?:www\.)?video\.tt/(?:(?:video|embed)/|watch_video\.php\?v=)(?P<id>[\da-zA-Z]{9})'
+ _VALID_URL = r'https?://(?:www\.)?video\.tt/(?:(?:video|embed)/|watch_video\.php\?v=)(?P<id>[\da-zA-Z]{9})'
_TESTS = [{
'url': 'http://www.video.tt/watch_video.php?v=amd5YujV8',
diff --git a/youtube_dl/extractor/vidio.py b/youtube_dl/extractor/vidio.py
new file mode 100644
index 0000000..6898042
--- /dev/null
+++ b/youtube_dl/extractor/vidio.py
@@ -0,0 +1,73 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import int_or_none
+
+
+class VidioIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?vidio\.com/watch/(?P<id>\d+)-(?P<display_id>[^/?#&]+)'
+ _TESTS = [{
+ 'url': 'http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015',
+ 'md5': 'cd2801394afc164e9775db6a140b91fe',
+ 'info_dict': {
+ 'id': '165683',
+ 'display_id': 'dj_ambred-booyah-live-2015',
+ 'ext': 'mp4',
+ 'title': 'DJ_AMBRED - Booyah (Live 2015)',
+ 'description': 'md5:27dc15f819b6a78a626490881adbadf8',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'duration': 149,
+ 'like_count': int,
+ },
+ }, {
+ 'url': 'https://www.vidio.com/watch/77949-south-korea-test-fires-missile-that-can-strike-all-of-the-north',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id, display_id = mobj.group('id', 'display_id')
+
+ webpage = self._download_webpage(url, display_id)
+
+ title = self._og_search_title(webpage)
+
+ m3u8_url, duration, thumbnail = [None] * 3
+
+ clips = self._parse_json(
+ self._html_search_regex(
+ r'data-json-clips\s*=\s*(["\'])(?P<data>\[.+?\])\1',
+ webpage, 'video data', default='[]', group='data'),
+ display_id, fatal=False)
+ if clips:
+ clip = clips[0]
+ m3u8_url = clip.get('sources', [{}])[0].get('file')
+ duration = clip.get('clip_duration')
+ thumbnail = clip.get('image')
+
+ m3u8_url = m3u8_url or self._search_regex(
+ r'data(?:-vjs)?-clip-hls-url=(["\'])(?P<url>.+?)\1', webpage, 'hls url')
+ formats = self._extract_m3u8_formats(m3u8_url, display_id, 'mp4', entry_protocol='m3u8_native')
+
+ duration = int_or_none(duration or self._search_regex(
+ r'data-video-duration=(["\'])(?P<duartion>\d+)\1', webpage, 'duration'))
+ thumbnail = thumbnail or self._og_search_thumbnail(webpage)
+
+ like_count = int_or_none(self._search_regex(
+ (r'<span[^>]+data-comment-vote-count=["\'](\d+)',
+ r'<span[^>]+class=["\'].*?\blike(?:__|-)count\b.*?["\'][^>]*>\s*(\d+)'),
+ webpage, 'like count', fatal=False))
+
+ return {
+ 'id': video_id,
+ 'display_id': display_id,
+ 'title': title,
+ 'description': self._og_search_description(webpage),
+ 'thumbnail': thumbnail,
+ 'duration': duration,
+ 'like_count': like_count,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/vidzi.py b/youtube_dl/extractor/vidzi.py
index 7c6e980..3c78fb3 100644
--- a/youtube_dl/extractor/vidzi.py
+++ b/youtube_dl/extractor/vidzi.py
@@ -1,11 +1,14 @@
# coding: utf-8
from __future__ import unicode_literals
-from .common import InfoExtractor
-from ..utils import smuggle_url
+from .jwplatform import JWPlatformBaseIE
+from ..utils import (
+ decode_packed_codes,
+ js_to_json,
+)
-class VidziIE(InfoExtractor):
+class VidziIE(JWPlatformBaseIE):
_VALID_URL = r'https?://(?:www\.)?vidzi\.tv/(?P<id>\w+)'
_TEST = {
'url': 'http://vidzi.tv/cghql9yq6emu.html',
@@ -14,7 +17,6 @@ class VidziIE(InfoExtractor):
'id': 'cghql9yq6emu',
'ext': 'mp4',
'title': 'youtube-dl test video 1\\\\2\'3/4<5\\\\6ä7↭',
- 'uploader': 'vidzi.tv',
},
'params': {
# m3u8 download
@@ -29,11 +31,12 @@ class VidziIE(InfoExtractor):
title = self._html_search_regex(
r'(?s)<h2 class="video-title">(.*?)</h2>', webpage, 'title')
- # Vidzi now uses jwplayer, which can be handled by GenericIE
- return {
- '_type': 'url_transparent',
- 'id': video_id,
- 'title': title,
- 'url': smuggle_url(url, {'to_generic': True}),
- 'ie_key': 'Generic',
- }
+ code = decode_packed_codes(webpage).replace('\\\'', '\'')
+ jwplayer_data = self._parse_json(
+ self._search_regex(r'setup\(([^)]+)\)', code, 'jwplayer data'),
+ video_id, transform_source=js_to_json)
+
+ info_dict = self._parse_jwplayer_data(jwplayer_data, video_id, require_title=False)
+ info_dict['title'] = title
+
+ return info_dict
diff --git a/youtube_dl/extractor/vier.py b/youtube_dl/extractor/vier.py
index c76c206..6645c61 100644
--- a/youtube_dl/extractor/vier.py
+++ b/youtube_dl/extractor/vier.py
@@ -50,6 +50,7 @@ class VierIE(InfoExtractor):
playlist_url = 'http://vod.streamcloud.be/%s/mp4:_definst_/%s.mp4/playlist.m3u8' % (application, filename)
formats = self._extract_m3u8_formats(playlist_url, display_id, 'mp4')
+ self._sort_formats(formats)
title = self._og_search_title(webpage, default=display_id)
description = self._og_search_description(webpage, default=None)
diff --git a/youtube_dl/extractor/snagfilms.py b/youtube_dl/extractor/viewlift.py
index 6977afb..19500eb 100644
--- a/youtube_dl/extractor/snagfilms.py
+++ b/youtube_dl/extractor/viewlift.py
@@ -13,8 +13,12 @@ from ..utils import (
)
-class SnagFilmsEmbedIE(InfoExtractor):
- _VALID_URL = r'https?://(?:(?:www|embed)\.)?snagfilms\.com/embed/player\?.*\bfilmId=(?P<id>[\da-f-]{36})'
+class ViewLiftBaseIE(InfoExtractor):
+ _DOMAINS_REGEX = '(?:snagfilms|snagxtreme|funnyforfree|kiddovid|winnersview|monumentalsportsnetwork|vayafilm)\.com|kesari\.tv'
+
+
+class ViewLiftEmbedIE(ViewLiftBaseIE):
+ _VALID_URL = r'https?://(?:(?:www|embed)\.)?(?:%s)/embed/player\?.*\bfilmId=(?P<id>[\da-f-]{36})' % ViewLiftBaseIE._DOMAINS_REGEX
_TESTS = [{
'url': 'http://embed.snagfilms.com/embed/player?filmId=74849a00-85a9-11e1-9660-123139220831&w=500',
'md5': '2924e9215c6eff7a55ed35b72276bd93',
@@ -40,7 +44,7 @@ class SnagFilmsEmbedIE(InfoExtractor):
@staticmethod
def _extract_url(webpage):
mobj = re.search(
- r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:embed\.)?snagfilms\.com/embed/player.+?)\1',
+ r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:embed\.)?(?:%s)/embed/player.+?)\1' % ViewLiftBaseIE._DOMAINS_REGEX,
webpage)
if mobj:
return mobj.group('url')
@@ -55,6 +59,7 @@ class SnagFilmsEmbedIE(InfoExtractor):
'Film %s is not playable in your area.' % video_id, expected=True)
formats = []
+ has_bitrate = False
for source in self._parse_json(js_to_json(self._search_regex(
r'(?s)sources:\s*(\[.+?\]),', webpage, 'json')), video_id):
file_ = source.get('file')
@@ -63,22 +68,25 @@ class SnagFilmsEmbedIE(InfoExtractor):
type_ = source.get('type')
ext = determine_ext(file_)
format_id = source.get('label') or ext
- if all(v == 'm3u8' for v in (type_, ext)):
+ if all(v == 'm3u8' or v == 'hls' for v in (type_, ext)):
formats.extend(self._extract_m3u8_formats(
file_, video_id, 'mp4', m3u8_id='hls'))
else:
bitrate = int_or_none(self._search_regex(
[r'(\d+)kbps', r'_\d{1,2}x\d{1,2}_(\d{3,})\.%s' % ext],
file_, 'bitrate', default=None))
+ if not has_bitrate and bitrate:
+ has_bitrate = True
height = int_or_none(self._search_regex(
r'^(\d+)[pP]$', format_id, 'height', default=None))
formats.append({
'url': file_,
- 'format_id': format_id,
+ 'format_id': 'http-%s%s' % (format_id, ('-%dk' % bitrate if bitrate else '')),
'tbr': bitrate,
'height': height,
})
- self._sort_formats(formats)
+ field_preference = None if has_bitrate else ('height', 'tbr', 'format_id')
+ self._sort_formats(formats, field_preference)
title = self._search_regex(
[r"title\s*:\s*'([^']+)'", r'<title>([^<]+)</title>'],
@@ -91,8 +99,8 @@ class SnagFilmsEmbedIE(InfoExtractor):
}
-class SnagFilmsIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?snagfilms\.com/(?:films/title|show)/(?P<id>[^?#]+)'
+class ViewLiftIE(ViewLiftBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?(?P<domain>%s)/(?:films/title|show|(?:news/)?videos?)/(?P<id>[^?#]+)' % ViewLiftBaseIE._DOMAINS_REGEX
_TESTS = [{
'url': 'http://www.snagfilms.com/films/title/lost_for_life',
'md5': '19844f897b35af219773fd63bdec2942',
@@ -127,10 +135,20 @@ class SnagFilmsIE(InfoExtractor):
# Film is not available.
'url': 'http://www.snagfilms.com/show/augie_alone/flirting',
'only_matching': True,
+ }, {
+ 'url': 'http://www.winnersview.com/videos/the-good-son',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.kesari.tv/news/video/1461919076414',
+ 'only_matching': True,
+ }, {
+ # Was once Kaltura embed
+ 'url': 'https://www.monumentalsportsnetwork.com/videos/john-carlson-postgame-2-25-15',
+ 'only_matching': True,
}]
def _real_extract(self, url):
- display_id = self._match_id(url)
+ domain, display_id = re.match(self._VALID_URL, url).groups()
webpage = self._download_webpage(url, display_id)
@@ -170,7 +188,7 @@ class SnagFilmsIE(InfoExtractor):
return {
'_type': 'url_transparent',
- 'url': 'http://embed.snagfilms.com/embed/player?filmId=%s' % film_id,
+ 'url': 'http://%s/embed/player?filmId=%s' % (domain, film_id),
'id': film_id,
'display_id': display_id,
'title': title,
@@ -178,4 +196,5 @@ class SnagFilmsIE(InfoExtractor):
'thumbnail': thumbnail,
'duration': duration,
'categories': categories,
+ 'ie_key': 'ViewLiftEmbed',
}
diff --git a/youtube_dl/extractor/viewster.py b/youtube_dl/extractor/viewster.py
index fe94a47..a93196a 100644
--- a/youtube_dl/extractor/viewster.py
+++ b/youtube_dl/extractor/viewster.py
@@ -1,10 +1,11 @@
# coding: utf-8
from __future__ import unicode_literals
+import re
+
from .common import InfoExtractor
from ..compat import (
compat_HTTPError,
- compat_urllib_parse,
compat_urllib_parse_unquote,
)
from ..utils import (
@@ -14,6 +15,7 @@ from ..utils import (
parse_iso8601,
sanitized_Request,
HEADRequest,
+ url_basename,
)
@@ -75,11 +77,11 @@ class ViewsterIE(InfoExtractor):
_ACCEPT_HEADER = 'application/json, text/javascript, */*; q=0.01'
- def _download_json(self, url, video_id, note='Downloading JSON metadata', fatal=True):
+ def _download_json(self, url, video_id, note='Downloading JSON metadata', fatal=True, query={}):
request = sanitized_Request(url)
request.add_header('Accept', self._ACCEPT_HEADER)
request.add_header('Auth-token', self._AUTH_TOKEN)
- return super(ViewsterIE, self)._download_json(request, video_id, note, fatal=fatal)
+ return super(ViewsterIE, self)._download_json(request, video_id, note, fatal=fatal, query=query)
def _real_extract(self, url):
video_id = self._match_id(url)
@@ -114,43 +116,85 @@ class ViewsterIE(InfoExtractor):
return self.playlist_result(entries, video_id, title, description)
formats = []
- for media_type in ('application/f4m+xml', 'application/x-mpegURL', 'video/mp4'):
- media = self._download_json(
- 'https://public-api.viewster.com/movies/%s/video?mediaType=%s'
- % (entry_id, compat_urllib_parse.quote(media_type)),
- video_id, 'Downloading %s JSON' % media_type, fatal=False)
- if not media:
- continue
- video_url = media.get('Uri')
- if not video_url:
- continue
- ext = determine_ext(video_url)
- if ext == 'f4m':
- video_url += '&' if '?' in video_url else '?'
- video_url += 'hdcore=3.2.0&plugin=flowplayer-3.2.0.1'
- formats.extend(self._extract_f4m_formats(
- video_url, video_id, f4m_id='hds'))
- elif ext == 'm3u8':
- m3u8_formats = self._extract_m3u8_formats(
- video_url, video_id, 'mp4', m3u8_id='hls',
- fatal=False) # m3u8 sometimes fail
- if m3u8_formats:
- formats.extend(m3u8_formats)
- else:
- format_id = media.get('Bitrate')
- f = {
- 'url': video_url,
- 'format_id': 'mp4-%s' % format_id,
- 'height': int_or_none(media.get('Height')),
- 'width': int_or_none(media.get('Width')),
- 'preference': 1,
- }
- if format_id and not f['height']:
- f['height'] = int_or_none(self._search_regex(
- r'^(\d+)[pP]$', format_id, 'height', default=None))
- formats.append(f)
-
- if not formats and not info.get('LanguageSets') and not info.get('VODSettings'):
+ for language_set in info.get('LanguageSets', []):
+ manifest_url = None
+ m3u8_formats = []
+ audio = language_set.get('Audio') or ''
+ subtitle = language_set.get('Subtitle') or ''
+ base_format_id = audio
+ if subtitle:
+ base_format_id += '-%s' % subtitle
+
+ def concat(suffix, sep='-'):
+ return (base_format_id + '%s%s' % (sep, suffix)) if base_format_id else suffix
+
+ for media_type in ('application/f4m+xml', 'application/x-mpegURL', 'video/mp4'):
+ media = self._download_json(
+ 'https://public-api.viewster.com/movies/%s/video' % entry_id,
+ video_id, 'Downloading %s JSON' % concat(media_type, ' '), fatal=False, query={
+ 'mediaType': media_type,
+ 'language': audio,
+ 'subtitle': subtitle,
+ })
+ if not media:
+ continue
+ video_url = media.get('Uri')
+ if not video_url:
+ continue
+ ext = determine_ext(video_url)
+ if ext == 'f4m':
+ manifest_url = video_url
+ video_url += '&' if '?' in video_url else '?'
+ video_url += 'hdcore=3.2.0&plugin=flowplayer-3.2.0.1'
+ formats.extend(self._extract_f4m_formats(
+ video_url, video_id, f4m_id=concat('hds')))
+ elif ext == 'm3u8':
+ manifest_url = video_url
+ m3u8_formats = self._extract_m3u8_formats(
+ video_url, video_id, 'mp4', m3u8_id=concat('hls'),
+ fatal=False) # m3u8 sometimes fail
+ if m3u8_formats:
+ formats.extend(m3u8_formats)
+ else:
+ qualities_basename = self._search_regex(
+ '/([^/]+)\.csmil/',
+ manifest_url, 'qualities basename', default=None)
+ if not qualities_basename:
+ continue
+ QUALITIES_RE = r'((,\d+k)+,?)'
+ qualities = self._search_regex(
+ QUALITIES_RE, qualities_basename,
+ 'qualities', default=None)
+ if not qualities:
+ continue
+ qualities = list(map(lambda q: int(q[:-1]), qualities.strip(',').split(',')))
+ qualities.sort()
+ http_template = re.sub(QUALITIES_RE, r'%dk', qualities_basename)
+ http_url_basename = url_basename(video_url)
+ if m3u8_formats:
+ self._sort_formats(m3u8_formats)
+ m3u8_formats = list(filter(
+ lambda f: f.get('vcodec') != 'none' and f.get('resolution') != 'multiple',
+ m3u8_formats))
+ if len(qualities) == len(m3u8_formats):
+ for q, m3u8_format in zip(qualities, m3u8_formats):
+ f = m3u8_format.copy()
+ f.update({
+ 'url': video_url.replace(http_url_basename, http_template % q),
+ 'format_id': f['format_id'].replace('hls', 'http'),
+ 'protocol': 'http',
+ })
+ formats.append(f)
+ else:
+ for q in qualities:
+ formats.append({
+ 'url': video_url.replace(http_url_basename, http_template % q),
+ 'ext': 'mp4',
+ 'format_id': 'http-%d' % q,
+ 'tbr': q,
+ })
+
+ if not formats and not info.get('VODSettings'):
self.raise_geo_restricted()
self._sort_formats(formats)
diff --git a/youtube_dl/extractor/viidea.py b/youtube_dl/extractor/viidea.py
index 315984b..a4f914d 100644
--- a/youtube_dl/extractor/viidea.py
+++ b/youtube_dl/extractor/viidea.py
@@ -15,7 +15,7 @@ from ..utils import (
class ViideaIE(InfoExtractor):
- _VALID_URL = r'''(?x)http://(?:www\.)?(?:
+ _VALID_URL = r'''(?x)https?://(?:www\.)?(?:
videolectures\.net|
flexilearn\.viidea\.net|
presentations\.ocwconsortium\.org|
@@ -151,6 +151,7 @@ class ViideaIE(InfoExtractor):
smil_url = '%s/%s/video/%s/smil.xml' % (base_url, lecture_slug, part_id)
smil = self._download_smil(smil_url, lecture_id)
info = self._parse_smil(smil, smil_url, lecture_id)
+ self._sort_formats(info['formats'])
info['id'] = lecture_id if not multipart else '%s_part%s' % (lecture_id, part_id)
info['display_id'] = lecture_slug if not multipart else '%s_part%s' % (lecture_slug, part_id)
if multipart:
diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py
index 433fc99..efa15e0 100644
--- a/youtube_dl/extractor/viki.py
+++ b/youtube_dl/extractor/viki.py
@@ -101,10 +101,13 @@ class VikiBaseIE(InfoExtractor):
self.report_warning('Unable to get session token, login has probably failed')
@staticmethod
- def dict_selection(dict_obj, preferred_key):
+ def dict_selection(dict_obj, preferred_key, allow_fallback=True):
if preferred_key in dict_obj:
return dict_obj.get(preferred_key)
+ if not allow_fallback:
+ return
+
filtered_dict = list(filter(None, [dict_obj.get(k) for k in dict_obj.keys()]))
return filtered_dict[0] if filtered_dict else None
@@ -127,7 +130,7 @@ class VikiIE(VikiBaseIE):
}, {
# clip
'url': 'http://www.viki.com/videos/1067139v-the-avengers-age-of-ultron-press-conference',
- 'md5': '86c0b5dbd4d83a6611a79987cc7a1989',
+ 'md5': 'feea2b1d7b3957f70886e6dfd8b8be84',
'info_dict': {
'id': '1067139v',
'ext': 'mp4',
@@ -156,17 +159,18 @@ class VikiIE(VikiBaseIE):
'params': {
# m3u8 download
'skip_download': True,
- }
+ },
+ 'skip': 'Blocked in the US',
}, {
# episode
'url': 'http://www.viki.com/videos/44699v-boys-over-flowers-episode-1',
- 'md5': '190f3ef426005ba3a080a63325955bc3',
+ 'md5': '1f54697dabc8f13f31bf06bb2e4de6db',
'info_dict': {
'id': '44699v',
'ext': 'mp4',
'title': 'Boys Over Flowers - Episode 1',
- 'description': 'md5:52617e4f729c7d03bfd4bcbbb6e946f2',
- 'duration': 4155,
+ 'description': 'md5:b89cf50038b480b88b5b3c93589a9076',
+ 'duration': 4204,
'timestamp': 1270496524,
'upload_date': '20100405',
'uploader': 'group8',
@@ -176,13 +180,13 @@ class VikiIE(VikiBaseIE):
}, {
# youtube external
'url': 'http://www.viki.com/videos/50562v-poor-nastya-complete-episode-1',
- 'md5': '216d1afdc0c64d1febc1e9f2bd4b864b',
+ 'md5': '63f8600c1da6f01b7640eee7eca4f1da',
'info_dict': {
'id': '50562v',
- 'ext': 'mp4',
+ 'ext': 'webm',
'title': 'Poor Nastya [COMPLETE] - Episode 1',
'description': '',
- 'duration': 607,
+ 'duration': 606,
'timestamp': 1274949505,
'upload_date': '20101213',
'uploader': 'ad14065n',
@@ -196,7 +200,7 @@ class VikiIE(VikiBaseIE):
}, {
# non-English description
'url': 'http://www.viki.com/videos/158036v-love-in-magic',
- 'md5': '1713ae35df5a521b31f6dc40730e7c9c',
+ 'md5': '013dc282714e22acf9447cad14ff1208',
'info_dict': {
'id': '158036v',
'ext': 'mp4',
@@ -217,7 +221,7 @@ class VikiIE(VikiBaseIE):
self._check_errors(video)
- title = self.dict_selection(video.get('titles', {}), 'en')
+ title = self.dict_selection(video.get('titles', {}), 'en', allow_fallback=False)
if not title:
title = 'Episode %d' % video.get('number') if video.get('type') == 'episode' else video.get('id') or video_id
container_titles = video.get('container', {}).get('titles', {})
@@ -302,7 +306,7 @@ class VikiChannelIE(VikiBaseIE):
'title': 'Boys Over Flowers',
'description': 'md5:ecd3cff47967fe193cff37c0bec52790',
},
- 'playlist_count': 70,
+ 'playlist_mincount': 71,
}, {
'url': 'http://www.viki.com/tv/1354c-poor-nastya-complete',
'info_dict': {
diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py
index 3049dff..d9c9852 100644
--- a/youtube_dl/extractor/vimeo.py
+++ b/youtube_dl/extractor/vimeo.py
@@ -8,14 +8,15 @@ import itertools
from .common import InfoExtractor
from ..compat import (
compat_HTTPError,
+ compat_str,
compat_urlparse,
)
from ..utils import (
determine_ext,
- encode_dict,
ExtractorError,
InAdvancePagedList,
int_or_none,
+ NO_DEFAULT,
RegexNotFoundError,
sanitized_Request,
smuggle_url,
@@ -25,6 +26,7 @@ from ..utils import (
urlencode_postdata,
unescapeHTML,
parse_filesize,
+ try_get,
)
@@ -42,19 +44,39 @@ class VimeoBaseInfoExtractor(InfoExtractor):
self.report_login()
webpage = self._download_webpage(self._LOGIN_URL, None, False)
token, vuid = self._extract_xsrft_and_vuid(webpage)
- data = urlencode_postdata(encode_dict({
+ data = urlencode_postdata({
'action': 'login',
'email': username,
'password': password,
'service': 'vimeo',
'token': token,
- }))
+ })
login_request = sanitized_Request(self._LOGIN_URL, data)
login_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
login_request.add_header('Referer', self._LOGIN_URL)
self._set_vimeo_cookie('vuid', vuid)
self._download_webpage(login_request, None, False, 'Wrong login info')
+ def _verify_video_password(self, url, video_id, webpage):
+ password = self._downloader.params.get('videopassword')
+ if password is None:
+ raise ExtractorError('This video is protected by a password, use the --video-password option', expected=True)
+ token, vuid = self._extract_xsrft_and_vuid(webpage)
+ data = urlencode_postdata({
+ 'password': password,
+ 'token': token,
+ })
+ if url.startswith('http://'):
+ # vimeo only supports https now, but the user can give an http url
+ url = url.replace('http://', 'https://')
+ password_request = sanitized_Request(url + '/password', data)
+ password_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
+ password_request.add_header('Referer', url)
+ self._set_vimeo_cookie('vuid', vuid)
+ return self._download_webpage(
+ password_request, video_id,
+ 'Verifying the password', 'Wrong password')
+
def _extract_xsrft_and_vuid(self, webpage):
xsrft = self._search_regex(
r'(?:(?P<q1>["\'])xsrft(?P=q1)\s*:|xsrft\s*[=:])\s*(?P<q>["\'])(?P<xsrft>.+?)(?P=q)',
@@ -67,21 +89,96 @@ class VimeoBaseInfoExtractor(InfoExtractor):
def _set_vimeo_cookie(self, name, value):
self._set_cookie('vimeo.com', name, value)
+ def _vimeo_sort_formats(self, formats):
+ # Bitrates are completely broken. Single m3u8 may contain entries in kbps and bps
+ # at the same time without actual units specified. This lead to wrong sorting.
+ self._sort_formats(formats, field_preference=('preference', 'height', 'width', 'fps', 'format_id'))
+
+ def _parse_config(self, config, video_id):
+ # Extract title
+ video_title = config['video']['title']
+
+ # Extract uploader, uploader_url and uploader_id
+ video_uploader = config['video'].get('owner', {}).get('name')
+ video_uploader_url = config['video'].get('owner', {}).get('url')
+ video_uploader_id = video_uploader_url.split('/')[-1] if video_uploader_url else None
+
+ # Extract video thumbnail
+ video_thumbnail = config['video'].get('thumbnail')
+ if video_thumbnail is None:
+ video_thumbs = config['video'].get('thumbs')
+ if video_thumbs and isinstance(video_thumbs, dict):
+ _, video_thumbnail = sorted((int(width if width.isdigit() else 0), t_url) for (width, t_url) in video_thumbs.items())[-1]
+
+ # Extract video duration
+ video_duration = int_or_none(config['video'].get('duration'))
+
+ formats = []
+ config_files = config['video'].get('files') or config['request'].get('files', {})
+ for f in config_files.get('progressive', []):
+ video_url = f.get('url')
+ if not video_url:
+ continue
+ formats.append({
+ 'url': video_url,
+ 'format_id': 'http-%s' % f.get('quality'),
+ 'width': int_or_none(f.get('width')),
+ 'height': int_or_none(f.get('height')),
+ 'fps': int_or_none(f.get('fps')),
+ 'tbr': int_or_none(f.get('bitrate')),
+ })
+ m3u8_url = config_files.get('hls', {}).get('url')
+ if m3u8_url:
+ formats.extend(self._extract_m3u8_formats(
+ m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
+
+ subtitles = {}
+ text_tracks = config['request'].get('text_tracks')
+ if text_tracks:
+ for tt in text_tracks:
+ subtitles[tt['lang']] = [{
+ 'ext': 'vtt',
+ 'url': 'https://vimeo.com' + tt['url'],
+ }]
+
+ return {
+ 'title': video_title,
+ 'uploader': video_uploader,
+ 'uploader_id': video_uploader_id,
+ 'uploader_url': video_uploader_url,
+ 'thumbnail': video_thumbnail,
+ 'duration': video_duration,
+ 'formats': formats,
+ 'subtitles': subtitles,
+ }
+
class VimeoIE(VimeoBaseInfoExtractor):
"""Information extractor for vimeo.com."""
# _VALID_URL matches Vimeo URLs
_VALID_URL = r'''(?x)
- https?://
- (?:(?:www|(?P<player>player))\.)?
- vimeo(?P<pro>pro)?\.com/
- (?!channels/[^/?#]+/?(?:$|[?#])|album/)
- (?:.*?/)?
- (?:(?:play_redirect_hls|moogaloop\.swf)\?clip_id=)?
- (?:videos?/)?
- (?P<id>[0-9]+)
- /?(?:[?&].*)?(?:[#].*)?$'''
+ https?://
+ (?:
+ (?:
+ www|
+ (?P<player>player)
+ )
+ \.
+ )?
+ vimeo(?P<pro>pro)?\.com/
+ (?!(?:channels|album)/[^/?#]+/?(?:$|[?#])|[^/]+/review/|ondemand/)
+ (?:.*?/)?
+ (?:
+ (?:
+ play_redirect_hls|
+ moogaloop\.swf)\?clip_id=
+ )?
+ (?:videos?/)?
+ (?P<id>[0-9]+)
+ (?:/[\da-f]+)?
+ /?(?:[?&].*)?(?:[#].*)?$
+ '''
IE_NAME = 'vimeo'
_TESTS = [
{
@@ -93,6 +190,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
'title': "youtube-dl test video - \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550",
'description': 'md5:2d3305bad981a06ff79f027f19865021',
'upload_date': '20121220',
+ 'uploader_url': 're:https?://(?:www\.)?vimeo\.com/user7108434',
'uploader_id': 'user7108434',
'uploader': 'Filippo Valsorda',
'duration': 10,
@@ -105,6 +203,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
'info_dict': {
'id': '68093876',
'ext': 'mp4',
+ 'uploader_url': 're:https?://(?:www\.)?vimeo\.com/openstreetmapus',
'uploader_id': 'openstreetmapus',
'uploader': 'OpenStreetMap US',
'title': 'Andy Allan - Putting the Carto into OpenStreetMap Cartography',
@@ -121,6 +220,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
'ext': 'mp4',
'title': 'Kathy Sierra: Building the minimum Badass User, Business of Software 2012',
'uploader': 'The BLN & Business of Software',
+ 'uploader_url': 're:https?://(?:www\.)?vimeo\.com/theblnbusinessofsoftware',
'uploader_id': 'theblnbusinessofsoftware',
'duration': 3610,
'description': None,
@@ -135,10 +235,11 @@ class VimeoIE(VimeoBaseInfoExtractor):
'ext': 'mp4',
'title': 'youtube-dl password protected test video',
'upload_date': '20130614',
+ 'uploader_url': 're:https?://(?:www\.)?vimeo\.com/user18948128',
'uploader_id': 'user18948128',
'uploader': 'Jaime Marquínez Ferrándiz',
'duration': 10,
- 'description': 'This is "youtube-dl password protected test video" by Jaime Marquínez Ferrándiz on Vimeo, the home for high quality videos and the people\u2026',
+ 'description': 'This is "youtube-dl password protected test video" by on Vimeo, the home for high quality videos and the people who love them.',
},
'params': {
'videopassword': 'youtube-dl',
@@ -147,13 +248,12 @@ class VimeoIE(VimeoBaseInfoExtractor):
{
'url': 'http://vimeo.com/channels/keypeele/75629013',
'md5': '2f86a05afe9d7abc0b9126d229bbe15d',
- 'note': 'Video is freely available via original URL '
- 'and protected with password when accessed via http://vimeo.com/75629013',
'info_dict': {
'id': '75629013',
'ext': 'mp4',
'title': 'Key & Peele: Terrorist Interrogation',
'description': 'md5:8678b246399b070816b12313e8b4eb5c',
+ 'uploader_url': 're:https?://(?:www\.)?vimeo\.com/atencio',
'uploader_id': 'atencio',
'uploader': 'Peter Atencio',
'upload_date': '20130927',
@@ -169,6 +269,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
'title': 'The New Vimeo Player (You Know, For Videos)',
'description': 'md5:2ec900bf97c3f389378a96aee11260ea',
'upload_date': '20131015',
+ 'uploader_url': 're:https?://(?:www\.)?vimeo\.com/staff',
'uploader_id': 'staff',
'uploader': 'Vimeo Staff',
'duration': 62,
@@ -183,24 +284,49 @@ class VimeoIE(VimeoBaseInfoExtractor):
'ext': 'mp4',
'title': 'Pier Solar OUYA Official Trailer',
'uploader': 'Tulio Gonçalves',
+ 'uploader_url': 're:https?://(?:www\.)?vimeo\.com/user28849593',
'uploader_id': 'user28849593',
},
},
{
# contains original format
'url': 'https://vimeo.com/33951933',
- 'md5': '53c688fa95a55bf4b7293d37a89c5c53',
+ 'md5': '2d9f5475e0537f013d0073e812ab89e6',
'info_dict': {
'id': '33951933',
'ext': 'mp4',
'title': 'FOX CLASSICS - Forever Classic ID - A Full Minute',
'uploader': 'The DMCI',
+ 'uploader_url': 're:https?://(?:www\.)?vimeo\.com/dmci',
'uploader_id': 'dmci',
'upload_date': '20111220',
'description': 'md5:ae23671e82d05415868f7ad1aec21147',
},
},
{
+ # only available via https://vimeo.com/channels/tributes/6213729 and
+ # not via https://vimeo.com/6213729
+ 'url': 'https://vimeo.com/channels/tributes/6213729',
+ 'info_dict': {
+ 'id': '6213729',
+ 'ext': 'mp4',
+ 'title': 'Vimeo Tribute: The Shining',
+ 'uploader': 'Casey Donahue',
+ 'uploader_url': 're:https?://(?:www\.)?vimeo\.com/caseydonahue',
+ 'uploader_id': 'caseydonahue',
+ 'upload_date': '20090821',
+ 'description': 'md5:bdbf314014e58713e6e5b66eb252f4a6',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ 'expected_warnings': ['Unable to download JSON metadata'],
+ },
+ {
+ 'url': 'http://vimeo.com/moogaloop.swf?clip_id=2539741',
+ 'only_matching': True,
+ },
+ {
'url': 'https://vimeo.com/109815029',
'note': 'Video not completely processed, "failed" seed status',
'only_matching': True,
@@ -210,10 +336,18 @@ class VimeoIE(VimeoBaseInfoExtractor):
'only_matching': True,
},
{
+ 'url': 'https://vimeo.com/album/2632481/video/79010983',
+ 'only_matching': True,
+ },
+ {
# source file returns 403: Forbidden
'url': 'https://vimeo.com/7809605',
'only_matching': True,
},
+ {
+ 'url': 'https://vimeo.com/160743502/abd0e13fb4',
+ 'only_matching': True,
+ }
]
@staticmethod
@@ -231,47 +365,26 @@ class VimeoIE(VimeoBaseInfoExtractor):
if mobj:
return mobj.group(1)
- def _verify_video_password(self, url, video_id, webpage):
- password = self._downloader.params.get('videopassword')
- if password is None:
- raise ExtractorError('This video is protected by a password, use the --video-password option', expected=True)
- token, vuid = self._extract_xsrft_and_vuid(webpage)
- data = urlencode_postdata(encode_dict({
- 'password': password,
- 'token': token,
- }))
- if url.startswith('http://'):
- # vimeo only supports https now, but the user can give an http url
- url = url.replace('http://', 'https://')
- password_request = sanitized_Request(url + '/password', data)
- password_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
- password_request.add_header('Referer', url)
- self._set_vimeo_cookie('vuid', vuid)
- return self._download_webpage(
- password_request, video_id,
- 'Verifying the password', 'Wrong password')
-
def _verify_player_video_password(self, url, video_id):
password = self._downloader.params.get('videopassword')
if password is None:
raise ExtractorError('This video is protected by a password, use the --video-password option')
- data = urlencode_postdata(encode_dict({'password': password}))
+ data = urlencode_postdata({'password': password})
pass_url = url + '/check-password'
password_request = sanitized_Request(pass_url, data)
password_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
+ password_request.add_header('Referer', url)
return self._download_json(
password_request, video_id,
- 'Verifying the password',
- 'Wrong password')
+ 'Verifying the password', 'Wrong password')
def _real_initialize(self):
self._login()
def _real_extract(self, url):
url, data = unsmuggle_url(url, {})
- headers = std_headers
+ headers = std_headers.copy()
if 'http_headers' in data:
- headers = headers.copy()
headers.update(data['http_headers'])
if 'Referer' not in headers:
headers['Referer'] = url
@@ -282,11 +395,11 @@ class VimeoIE(VimeoBaseInfoExtractor):
orig_url = url
if mobj.group('pro') or mobj.group('player'):
url = 'https://player.vimeo.com/video/' + video_id
- else:
+ elif any(p in url for p in ('play_redirect_hls', 'moogaloop.swf')):
url = 'https://vimeo.com/' + video_id
# Retrieve video webpage to extract further information
- request = sanitized_Request(url, None, headers)
+ request = sanitized_Request(url, headers=headers)
try:
webpage = self._download_webpage(request, video_id)
except ExtractorError as ee:
@@ -360,27 +473,24 @@ class VimeoIE(VimeoBaseInfoExtractor):
if config.get('view') == 4:
config = self._verify_player_video_password(url, video_id)
- if '>You rented this title.<' in webpage:
+ def is_rented():
+ if '>You rented this title.<' in webpage:
+ return True
+ if config.get('user', {}).get('purchased'):
+ return True
+ label = try_get(
+ config, lambda x: x['video']['vod']['purchase_options'][0]['label_string'], compat_str)
+ if label and label.startswith('You rented this'):
+ return True
+ return False
+
+ if is_rented():
feature_id = config.get('video', {}).get('vod', {}).get('feature_id')
if feature_id and not data.get('force_feature_id', False):
return self.url_result(smuggle_url(
'https://player.vimeo.com/player/%s' % feature_id,
{'force_feature_id': True}), 'Vimeo')
- # Extract title
- video_title = config['video']['title']
-
- # Extract uploader and uploader_id
- video_uploader = config['video']['owner']['name']
- video_uploader_id = config['video']['owner']['url'].split('/')[-1] if config['video']['owner']['url'] else None
-
- # Extract video thumbnail
- video_thumbnail = config['video'].get('thumbnail')
- if video_thumbnail is None:
- video_thumbs = config['video'].get('thumbs')
- if video_thumbs and isinstance(video_thumbs, dict):
- _, video_thumbnail = sorted((int(width if width.isdigit() else 0), t_url) for (width, t_url) in video_thumbs.items())[-1]
-
# Extract video description
video_description = self._html_search_regex(
@@ -400,9 +510,6 @@ class VimeoIE(VimeoBaseInfoExtractor):
if not video_description and not mobj.group('player'):
self._downloader.report_warning('Cannot find video description')
- # Extract video duration
- video_duration = int_or_none(config['video'].get('duration'))
-
# Extract upload date
video_upload_date = None
mobj = re.search(r'<time[^>]+datetime="([^"]+)"', webpage)
@@ -440,52 +547,54 @@ class VimeoIE(VimeoBaseInfoExtractor):
'format_id': source_name,
'preference': 1,
})
- config_files = config['video'].get('files') or config['request'].get('files', {})
- for f in config_files.get('progressive', []):
- video_url = f.get('url')
- if not video_url:
- continue
- formats.append({
- 'url': video_url,
- 'format_id': 'http-%s' % f.get('quality'),
- 'width': int_or_none(f.get('width')),
- 'height': int_or_none(f.get('height')),
- 'fps': int_or_none(f.get('fps')),
- 'tbr': int_or_none(f.get('bitrate')),
- })
- m3u8_url = config_files.get('hls', {}).get('url')
- if m3u8_url:
- formats.extend(self._extract_m3u8_formats(
- m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
- # Bitrates are completely broken. Single m3u8 may contain entries in kbps and bps
- # at the same time without actual units specified. This lead to wrong sorting.
- self._sort_formats(formats, field_preference=('preference', 'height', 'width', 'fps', 'format_id'))
-
- subtitles = {}
- text_tracks = config['request'].get('text_tracks')
- if text_tracks:
- for tt in text_tracks:
- subtitles[tt['lang']] = [{
- 'ext': 'vtt',
- 'url': 'https://vimeo.com' + tt['url'],
- }]
- return {
+ info_dict = self._parse_config(config, video_id)
+ formats.extend(info_dict['formats'])
+ self._vimeo_sort_formats(formats)
+ info_dict.update({
'id': video_id,
- 'uploader': video_uploader,
- 'uploader_id': video_uploader_id,
+ 'formats': formats,
'upload_date': video_upload_date,
- 'title': video_title,
- 'thumbnail': video_thumbnail,
'description': video_description,
- 'duration': video_duration,
- 'formats': formats,
'webpage_url': url,
'view_count': view_count,
'like_count': like_count,
'comment_count': comment_count,
- 'subtitles': subtitles,
- }
+ })
+
+ return info_dict
+
+
+class VimeoOndemandIE(VimeoBaseInfoExtractor):
+ IE_NAME = 'vimeo:ondemand'
+ _VALID_URL = r'https?://(?:www\.)?vimeo\.com/ondemand/(?P<id>[^/?#&]+)'
+ _TESTS = [{
+ # ondemand video not available via https://vimeo.com/id
+ 'url': 'https://vimeo.com/ondemand/20704',
+ 'md5': 'c424deda8c7f73c1dfb3edd7630e2f35',
+ 'info_dict': {
+ 'id': '105442900',
+ 'ext': 'mp4',
+ 'title': 'המעבדה - במאי יותם פלדמן',
+ 'uploader': 'גם סרטים',
+ 'uploader_url': 're:https?://(?:www\.)?vimeo\.com/gumfilms',
+ 'uploader_id': 'gumfilms',
+ },
+ }, {
+ 'url': 'https://vimeo.com/ondemand/nazmaalik',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://vimeo.com/ondemand/141692381',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://vimeo.com/ondemand/thelastcolony/150274832',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+ return self.url_result(self._og_search_video_url(webpage), VimeoIE.ie_key())
class VimeoChannelIE(VimeoBaseInfoExtractor):
@@ -523,7 +632,7 @@ class VimeoChannelIE(VimeoBaseInfoExtractor):
token, vuid = self._extract_xsrft_and_vuid(webpage)
fields['token'] = token
fields['password'] = password
- post = urlencode_postdata(encode_dict(fields))
+ post = urlencode_postdata(fields)
password_path = self._search_regex(
r'action="([^"]+)"', login_form, 'password URL')
password_url = compat_urlparse.urljoin(page_url, password_path)
@@ -547,8 +656,21 @@ class VimeoChannelIE(VimeoBaseInfoExtractor):
webpage = self._login_list_password(page_url, list_id, webpage)
yield self._extract_list_title(webpage)
- for video_id in re.findall(r'id="clip_(\d+?)"', webpage):
- yield self.url_result('https://vimeo.com/%s' % video_id, 'Vimeo')
+ # Try extracting href first since not all videos are available via
+ # short https://vimeo.com/id URL (e.g. https://vimeo.com/channels/tributes/6213729)
+ clips = re.findall(
+ r'id="clip_(\d+)"[^>]*>\s*<a[^>]+href="(/(?:[^/]+/)*\1)', webpage)
+ if clips:
+ for video_id, video_url in clips:
+ yield self.url_result(
+ compat_urlparse.urljoin(base_url, video_url),
+ VimeoIE.ie_key(), video_id=video_id)
+ # More relaxed fallback
+ else:
+ for video_id in re.findall(r'id=["\']clip_(\d+)', webpage):
+ yield self.url_result(
+ 'https://vimeo.com/%s' % video_id,
+ VimeoIE.ie_key(), video_id=video_id)
if re.search(self._MORE_PAGES_INDICATOR, webpage, re.DOTALL) is None:
break
@@ -585,7 +707,7 @@ class VimeoUserIE(VimeoChannelIE):
class VimeoAlbumIE(VimeoChannelIE):
IE_NAME = 'vimeo:album'
- _VALID_URL = r'https://vimeo\.com/album/(?P<id>\d+)'
+ _VALID_URL = r'https://vimeo\.com/album/(?P<id>\d+)(?:$|[?#]|/(?!video))'
_TITLE_RE = r'<header id="page_header">\n\s*<h1>(.*?)</h1>'
_TESTS = [{
'url': 'https://vimeo.com/album/2632481',
@@ -605,6 +727,13 @@ class VimeoAlbumIE(VimeoChannelIE):
'params': {
'videopassword': 'youtube-dl',
}
+ }, {
+ 'url': 'https://vimeo.com/album/2632481/sort:plays/format:thumbnail',
+ 'only_matching': True,
+ }, {
+ # TODO: respect page number
+ 'url': 'https://vimeo.com/album/2632481/page:2/sort:plays/format:thumbnail',
+ 'only_matching': True,
}]
def _page_url(self, base_url, pagenum):
@@ -636,7 +765,7 @@ class VimeoGroupsIE(VimeoAlbumIE):
return self._extract_videos(name, 'https://vimeo.com/groups/%s' % name)
-class VimeoReviewIE(InfoExtractor):
+class VimeoReviewIE(VimeoBaseInfoExtractor):
IE_NAME = 'vimeo:review'
IE_DESC = 'Review pages on vimeo'
_VALID_URL = r'https://vimeo\.com/[^/]+/review/(?P<id>[^/]+)'
@@ -648,6 +777,7 @@ class VimeoReviewIE(InfoExtractor):
'ext': 'mp4',
'title': "DICK HARDWICK 'Comedian'",
'uploader': 'Richard Hardwick',
+ 'uploader_id': 'user21297594',
}
}, {
'note': 'video player needs Referer',
@@ -660,14 +790,45 @@ class VimeoReviewIE(InfoExtractor):
'uploader': 'DevWeek Events',
'duration': 2773,
'thumbnail': 're:^https?://.*\.jpg$',
+ 'uploader_id': 'user22258446',
}
+ }, {
+ 'note': 'Password protected',
+ 'url': 'https://vimeo.com/user37284429/review/138823582/c4d865efde',
+ 'info_dict': {
+ 'id': '138823582',
+ 'ext': 'mp4',
+ 'title': 'EFFICIENT PICKUP MASTERCLASS MODULE 1',
+ 'uploader': 'TMB',
+ 'uploader_id': 'user37284429',
+ },
+ 'params': {
+ 'videopassword': 'holygrail',
+ },
}]
+ def _real_initialize(self):
+ self._login()
+
+ def _get_config_url(self, webpage_url, video_id, video_password_verified=False):
+ webpage = self._download_webpage(webpage_url, video_id)
+ config_url = self._html_search_regex(
+ r'data-config-url="([^"]+)"', webpage, 'config URL',
+ default=NO_DEFAULT if video_password_verified else None)
+ if config_url is None:
+ self._verify_video_password(webpage_url, video_id, webpage)
+ config_url = self._get_config_url(
+ webpage_url, video_id, video_password_verified=True)
+ return config_url
+
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
- player_url = 'https://player.vimeo.com/player/' + video_id
- return self.url_result(player_url, 'Vimeo', video_id)
+ video_id = self._match_id(url)
+ config_url = self._get_config_url(url, video_id)
+ config = self._download_json(config_url, video_id)
+ info_dict = self._parse_config(config, video_id)
+ self._vimeo_sort_formats(info_dict['formats'])
+ info_dict['id'] = video_id
+ return info_dict
class VimeoWatchLaterIE(VimeoChannelIE):
diff --git a/youtube_dl/extractor/vine.py b/youtube_dl/extractor/vine.py
index a6a6cc4..5b80184 100644
--- a/youtube_dl/extractor/vine.py
+++ b/youtube_dl/extractor/vine.py
@@ -24,6 +24,7 @@ class VineIE(InfoExtractor):
'upload_date': '20130519',
'uploader': 'Jack Dorsey',
'uploader_id': '76',
+ 'view_count': int,
'like_count': int,
'comment_count': int,
'repost_count': int,
@@ -39,6 +40,7 @@ class VineIE(InfoExtractor):
'upload_date': '20140815',
'uploader': 'Mars Ruiz',
'uploader_id': '1102363502380728320',
+ 'view_count': int,
'like_count': int,
'comment_count': int,
'repost_count': int,
@@ -54,6 +56,7 @@ class VineIE(InfoExtractor):
'upload_date': '20130430',
'uploader': 'Z3k3',
'uploader_id': '936470460173008896',
+ 'view_count': int,
'like_count': int,
'comment_count': int,
'repost_count': int,
@@ -71,6 +74,7 @@ class VineIE(InfoExtractor):
'upload_date': '20150705',
'uploader': 'Pimry_zaa',
'uploader_id': '1135760698325307392',
+ 'view_count': int,
'like_count': int,
'comment_count': int,
'repost_count': int,
@@ -109,6 +113,7 @@ class VineIE(InfoExtractor):
'upload_date': unified_strdate(data.get('created')),
'uploader': username,
'uploader_id': data.get('userIdStr'),
+ 'view_count': int_or_none(data.get('loops', {}).get('count')),
'like_count': int_or_none(data.get('likes', {}).get('count')),
'comment_count': int_or_none(data.get('comments', {}).get('count')),
'repost_count': int_or_none(data.get('reposts', {}).get('count')),
diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py
index 0805e3c..cfc5ffd 100644
--- a/youtube_dl/extractor/vk.py
+++ b/youtube_dl/extractor/vk.py
@@ -3,19 +3,18 @@ from __future__ import unicode_literals
import re
import json
+import sys
from .common import InfoExtractor
-from ..compat import (
- compat_str,
- compat_urllib_parse,
-)
+from ..compat import compat_str
from ..utils import (
ExtractorError,
+ int_or_none,
orderedSet,
- sanitized_Request,
str_to_int,
unescapeHTML,
unified_strdate,
+ urlencode_postdata,
)
from .vimeo import VimeoIE
from .pladform import PladformIE
@@ -27,12 +26,16 @@ class VKIE(InfoExtractor):
_VALID_URL = r'''(?x)
https?://
(?:
- (?:m\.)?vk\.com/video_ext\.php\?.*?\boid=(?P<oid>-?\d+).*?\bid=(?P<id>\d+)|
+ (?:
+ (?:m\.)?vk\.com/video_|
+ (?:www\.)?daxab.com/
+ )
+ ext\.php\?(?P<embed_query>.*?\boid=(?P<oid>-?\d+).*?\bid=(?P<id>\d+).*)|
(?:
(?:m\.)?vk\.com/(?:.+?\?.*?z=)?video|
- (?:www\.)?biqle\.ru/watch/
+ (?:www\.)?daxab.com/embed/
)
- (?P<videoid>[^s].*?)(?:\?(?:.*\blist=(?P<list_id>[\da-f]+))?|%2F|$)
+ (?P<videoid>-?\d+_\d+)(?:.*\blist=(?P<list_id>[\da-f]+))?
)
'''
_NETRC_MACHINE = 'vk'
@@ -76,7 +79,8 @@ class VKIE(InfoExtractor):
'duration': 101,
'upload_date': '20120730',
'view_count': int,
- }
+ },
+ 'skip': 'This video has been removed from public access.',
},
{
# VIDEO NOW REMOVED
@@ -141,10 +145,10 @@ class VKIE(InfoExtractor):
'url': 'https://vk.com/video276849682_170681728',
'info_dict': {
'id': 'V3K4mi0SYkc',
- 'ext': 'mp4',
+ 'ext': 'webm',
'title': "DSWD Awards 'Children's Joy Foundation, Inc.' Certificate of Registration and License to Operate",
- 'description': 'md5:bf9c26cfa4acdfb146362682edd3827a',
- 'duration': 179,
+ 'description': 'md5:d9903938abdc74c738af77f527ca0596',
+ 'duration': 178,
'upload_date': '20130116',
'uploader': "Children's Joy Foundation",
'uploader_id': 'thecjf',
@@ -152,6 +156,19 @@ class VKIE(InfoExtractor):
},
},
{
+ # video key is extra_data not url\d+
+ 'url': 'http://vk.com/video-110305615_171782105',
+ 'md5': 'e13fcda136f99764872e739d13fac1d1',
+ 'info_dict': {
+ 'id': '171782105',
+ 'ext': 'mp4',
+ 'title': 'S-Dance, репетиции к The way show',
+ 'uploader': 'THE WAY SHOW | 17 апреля',
+ 'upload_date': '20160207',
+ 'view_count': int,
+ },
+ },
+ {
# removed video, just testing that we match the pattern
'url': 'http://vk.com/feed?z=video-43215063_166094326%2Fbb50cacd3177146d7a',
'only_matching': True,
@@ -162,11 +179,6 @@ class VKIE(InfoExtractor):
'only_matching': True,
},
{
- # vk wrapper
- 'url': 'http://www.biqle.ru/watch/847655_160197695',
- 'only_matching': True,
- },
- {
# pladform embed
'url': 'https://vk.com/video-76116461_171554880',
'only_matching': True,
@@ -178,7 +190,7 @@ class VKIE(InfoExtractor):
if username is None:
return
- login_page = self._download_webpage(
+ login_page, url_handle = self._download_webpage_handle(
'https://vk.com', None, 'Downloading login page')
login_form = self._hidden_inputs(login_page)
@@ -188,11 +200,26 @@ class VKIE(InfoExtractor):
'pass': password.encode('cp1251'),
})
- request = sanitized_Request(
- 'https://login.vk.com/?act=login',
- compat_urllib_parse.urlencode(login_form).encode('utf-8'))
+ # https://new.vk.com/ serves two same remixlhk cookies in Set-Cookie header
+ # and expects the first one to be set rather than second (see
+ # https://github.com/rg3/youtube-dl/issues/9841#issuecomment-227871201).
+ # As of RFC6265 the newer one cookie should be set into cookie store
+ # what actually happens.
+ # We will workaround this VK issue by resetting the remixlhk cookie to
+ # the first one manually.
+ cookies = url_handle.headers.get('Set-Cookie')
+ if sys.version_info[0] >= 3:
+ cookies = cookies.encode('iso-8859-1')
+ cookies = cookies.decode('utf-8')
+ remixlhk = re.search(r'remixlhk=(.+?);.*?\bdomain=(.+?)(?:[,;]|$)', cookies)
+ if remixlhk:
+ value, domain = remixlhk.groups()
+ self._set_cookie(domain, 'remixlhk', value)
+
login_page = self._download_webpage(
- request, None, note='Logging in as %s' % username)
+ 'https://login.vk.com/?act=login', None,
+ note='Logging in as %s' % username,
+ data=urlencode_postdata(login_form))
if re.search(r'onLoginFailed', login_page):
raise ExtractorError(
@@ -205,20 +232,21 @@ class VKIE(InfoExtractor):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('videoid')
- if not video_id:
+ if video_id:
+ info_url = 'https://vk.com/al_video.php?act=show&al=1&module=video&video=%s' % video_id
+ # Some videos (removed?) can only be downloaded with list id specified
+ list_id = mobj.group('list_id')
+ if list_id:
+ info_url += '&list=%s' % list_id
+ else:
+ info_url = 'http://vk.com/video_ext.php?' + mobj.group('embed_query')
video_id = '%s_%s' % (mobj.group('oid'), mobj.group('id'))
- info_url = 'https://vk.com/al_video.php?act=show&al=1&module=video&video=%s' % video_id
-
- # Some videos (removed?) can only be downloaded with list id specified
- list_id = mobj.group('list_id')
- if list_id:
- info_url += '&list=%s' % list_id
-
info_page = self._download_webpage(info_url, video_id)
error_message = self._html_search_regex(
- r'(?s)<!><div[^>]+class="video_layer_message"[^>]*>(.+?)</div>',
+ [r'(?s)<!><div[^>]+class="video_layer_message"[^>]*>(.+?)</div>',
+ r'(?s)<div[^>]+id="video_ext_msg"[^>]*>(.+?)</div>'],
info_page, 'error message', default=None)
if error_message:
raise ExtractorError(error_message, expected=True)
@@ -293,17 +321,22 @@ class VKIE(InfoExtractor):
view_count = None
views = self._html_search_regex(
r'"mv_views_count_number"[^>]*>(.+?\bviews?)<',
- info_page, 'view count', fatal=False)
+ info_page, 'view count', default=None)
if views:
view_count = str_to_int(self._search_regex(
r'([\d,.]+)', views, 'view count', fatal=False))
- formats = [{
- 'format_id': k,
- 'url': v,
- 'width': int(k[len('url'):]),
- } for k, v in data.items()
- if k.startswith('url')]
+ formats = []
+ for k, v in data.items():
+ if not k.startswith('url') and not k.startswith('cache') and k != 'extra_data' or not v:
+ continue
+ height = int_or_none(self._search_regex(
+ r'^(?:url|cache)(\d+)', k, 'height', default=None))
+ formats.append({
+ 'format_id': k,
+ 'url': v,
+ 'height': height,
+ })
self._sort_formats(formats)
return {
diff --git a/youtube_dl/extractor/vlive.py b/youtube_dl/extractor/vlive.py
index 9e2aa58..8d671cc 100644
--- a/youtube_dl/extractor/vlive.py
+++ b/youtube_dl/extractor/vlive.py
@@ -1,13 +1,17 @@
# coding: utf-8
from __future__ import unicode_literals
+import re
+
from .common import InfoExtractor
from ..utils import (
dict_get,
+ ExtractorError,
float_or_none,
int_or_none,
+ remove_start,
)
-from ..compat import compat_urllib_parse
+from ..compat import compat_urllib_parse_urlencode
class VLiveIE(InfoExtractor):
@@ -19,7 +23,7 @@ class VLiveIE(InfoExtractor):
'info_dict': {
'id': '1326',
'ext': 'mp4',
- 'title': "[V] Girl's Day's Broadcast",
+ 'title': "[V LIVE] Girl's Day's Broadcast",
'creator': "Girl's Day",
'view_count': int,
},
@@ -31,19 +35,65 @@ class VLiveIE(InfoExtractor):
webpage = self._download_webpage(
'http://www.vlive.tv/video/%s' % video_id, video_id)
- long_video_id = self._search_regex(
- r'vlive\.tv\.video\.ajax\.request\.handler\.init\(\s*"[0-9]+"\s*,\s*"[^"]*"\s*,\s*"([^"]+)"',
- webpage, 'long video id')
+ video_params = self._search_regex(
+ r'\bvlive\.video\.init\(([^)]+)\)',
+ webpage, 'video params')
+ status, _, _, live_params, long_video_id, key = re.split(
+ r'"\s*,\s*"', video_params)[2:8]
+ status = remove_start(status, 'PRODUCT_')
+
+ if status == 'LIVE_ON_AIR' or status == 'BIG_EVENT_ON_AIR':
+ live_params = self._parse_json('"%s"' % live_params, video_id)
+ live_params = self._parse_json(live_params, video_id)
+ return self._live(video_id, webpage, live_params)
+ elif status == 'VOD_ON_AIR' or status == 'BIG_EVENT_INTRO':
+ if long_video_id and key:
+ return self._replay(video_id, webpage, long_video_id, key)
+ else:
+ status = 'COMING_SOON'
- key = self._search_regex(
- r'vlive\.tv\.video\.ajax\.request\.handler\.init\(\s*"[0-9]+"\s*,\s*"[^"]*"\s*,\s*"[^"]+"\s*,\s*"([^"]+)"',
- webpage, 'key')
+ if status == 'LIVE_END':
+ raise ExtractorError('Uploading for replay. Please wait...',
+ expected=True)
+ elif status == 'COMING_SOON':
+ raise ExtractorError('Coming soon!', expected=True)
+ elif status == 'CANCELED':
+ raise ExtractorError('We are sorry, '
+ 'but the live broadcast has been canceled.',
+ expected=True)
+ else:
+ raise ExtractorError('Unknown status %s' % status)
+ def _get_common_fields(self, webpage):
title = self._og_search_title(webpage)
+ creator = self._html_search_regex(
+ r'<div[^>]+class="info_area"[^>]*>\s*<a\s+[^>]*>([^<]+)',
+ webpage, 'creator', fatal=False)
+ thumbnail = self._og_search_thumbnail(webpage)
+ return {
+ 'title': title,
+ 'creator': creator,
+ 'thumbnail': thumbnail,
+ }
+
+ def _live(self, video_id, webpage, live_params):
+ formats = []
+ for vid in live_params.get('resolutions', []):
+ formats.extend(self._extract_m3u8_formats(
+ vid['cdnUrl'], video_id, 'mp4',
+ m3u8_id=vid.get('name'),
+ fatal=False, live=True))
+ self._sort_formats(formats)
+
+ return dict(self._get_common_fields(webpage),
+ id=video_id,
+ formats=formats,
+ is_live=True)
+ def _replay(self, video_id, webpage, long_video_id, key):
playinfo = self._download_json(
'http://global.apis.naver.com/rmcnmv/rmcnmv/vod_play_videoInfo.json?%s'
- % compat_urllib_parse.urlencode({
+ % compat_urllib_parse_urlencode({
'videoId': long_video_id,
'key': key,
'ptc': 'http',
@@ -62,11 +112,6 @@ class VLiveIE(InfoExtractor):
} for vid in playinfo.get('videos', {}).get('list', []) if vid.get('source')]
self._sort_formats(formats)
- thumbnail = self._og_search_thumbnail(webpage)
- creator = self._html_search_regex(
- r'<div[^>]+class="info_area"[^>]*>\s*<strong[^>]+class="name"[^>]*>([^<]+)</strong>',
- webpage, 'creator', fatal=False)
-
view_count = int_or_none(playinfo.get('meta', {}).get('count'))
subtitles = {}
@@ -77,12 +122,8 @@ class VLiveIE(InfoExtractor):
'ext': 'vtt',
'url': caption['source']}]
- return {
- 'id': video_id,
- 'title': title,
- 'creator': creator,
- 'thumbnail': thumbnail,
- 'view_count': view_count,
- 'formats': formats,
- 'subtitles': subtitles,
- }
+ return dict(self._get_common_fields(webpage),
+ id=video_id,
+ formats=formats,
+ view_count=view_count,
+ subtitles=subtitles)
diff --git a/youtube_dl/extractor/vodlocker.py b/youtube_dl/extractor/vodlocker.py
index a97995a..a938a40 100644
--- a/youtube_dl/extractor/vodlocker.py
+++ b/youtube_dl/extractor/vodlocker.py
@@ -2,11 +2,11 @@
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..compat import compat_urllib_parse
from ..utils import (
ExtractorError,
NO_DEFAULT,
sanitized_Request,
+ urlencode_postdata,
)
@@ -38,7 +38,7 @@ class VodlockerIE(InfoExtractor):
if fields['op'] == 'download1':
self._sleep(3, video_id) # they do detect when requests happen too fast!
- post = compat_urllib_parse.urlencode(fields)
+ post = urlencode_postdata(fields)
req = sanitized_Request(url, post)
req.add_header('Content-type', 'application/x-www-form-urlencoded')
webpage = self._download_webpage(
diff --git a/youtube_dl/extractor/voicerepublic.py b/youtube_dl/extractor/voicerepublic.py
index 93d15a5..4f1a99a 100644
--- a/youtube_dl/extractor/voicerepublic.py
+++ b/youtube_dl/extractor/voicerepublic.py
@@ -3,7 +3,10 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..compat import compat_urlparse
+from ..compat import (
+ compat_str,
+ compat_urlparse,
+)
from ..utils import (
ExtractorError,
determine_ext,
@@ -16,13 +19,13 @@ class VoiceRepublicIE(InfoExtractor):
_VALID_URL = r'https?://voicerepublic\.com/(?:talks|embed)/(?P<id>[0-9a-z-]+)'
_TESTS = [{
'url': 'http://voicerepublic.com/talks/watching-the-watchers-building-a-sousveillance-state',
- 'md5': '0554a24d1657915aa8e8f84e15dc9353',
+ 'md5': 'b9174d651323f17783000876347116e3',
'info_dict': {
'id': '2296',
'display_id': 'watching-the-watchers-building-a-sousveillance-state',
'ext': 'm4a',
'title': 'Watching the Watchers: Building a Sousveillance State',
- 'description': 'md5:715ba964958afa2398df615809cfecb1',
+ 'description': 'Secret surveillance programs have metadata too. The people and companies that operate secret surveillance programs can be surveilled.',
'thumbnail': 're:^https?://.*\.(?:png|jpg)$',
'duration': 1800,
'view_count': int,
@@ -52,7 +55,7 @@ class VoiceRepublicIE(InfoExtractor):
if data:
title = data['title']
description = data.get('teaser')
- talk_id = data.get('talk_id') or display_id
+ talk_id = compat_str(data.get('talk_id') or display_id)
talk = data['talk']
duration = int_or_none(talk.get('duration'))
formats = [{
diff --git a/youtube_dl/extractor/voxmedia.py b/youtube_dl/extractor/voxmedia.py
new file mode 100644
index 0000000..b1b32ad
--- /dev/null
+++ b/youtube_dl/extractor/voxmedia.py
@@ -0,0 +1,136 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_urllib_parse_unquote
+
+
+class VoxMediaIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?(?:theverge|vox|sbnation|eater|polygon|curbed|racked)\.com/(?:[^/]+/)*(?P<id>[^/?]+)'
+ _TESTS = [{
+ 'url': 'http://www.theverge.com/2014/6/27/5849272/material-world-how-google-discovered-what-software-is-made-of',
+ 'md5': '73856edf3e89a711e70d5cf7cb280b37',
+ 'info_dict': {
+ 'id': '11eXZobjrG8DCSTgrNjVinU-YmmdYjhe',
+ 'ext': 'mp4',
+ 'title': 'Google\'s new material design direction',
+ 'description': 'md5:2f44f74c4d14a1f800ea73e1c6832ad2',
+ },
+ 'add_ie': ['Ooyala'],
+ }, {
+ # data-ooyala-id
+ 'url': 'http://www.theverge.com/2014/10/21/7025853/google-nexus-6-hands-on-photos-video-android-phablet',
+ 'md5': 'd744484ff127884cd2ba09e3fa604e4b',
+ 'info_dict': {
+ 'id': 'RkZXU4cTphOCPDMZg5oEounJyoFI0g-B',
+ 'ext': 'mp4',
+ 'title': 'The Nexus 6: hands-on with Google\'s phablet',
+ 'description': 'md5:87a51fe95ff8cea8b5bdb9ac7ae6a6af',
+ },
+ 'add_ie': ['Ooyala'],
+ }, {
+ # volume embed
+ 'url': 'http://www.vox.com/2016/3/31/11336640/mississippi-lgbt-religious-freedom-bill',
+ 'md5': '375c483c5080ab8cd85c9c84cfc2d1e4',
+ 'info_dict': {
+ 'id': 'wydzk3dDpmRz7PQoXRsTIX6XTkPjYL0b',
+ 'ext': 'mp4',
+ 'title': 'The new frontier of LGBTQ civil rights, explained',
+ 'description': 'md5:0dc58e94a465cbe91d02950f770eb93f',
+ },
+ 'add_ie': ['Ooyala'],
+ }, {
+ # youtube embed
+ 'url': 'http://www.vox.com/2016/3/24/11291692/robot-dance',
+ 'md5': '83b3080489fb103941e549352d3e0977',
+ 'info_dict': {
+ 'id': 'FcNHTJU1ufM',
+ 'ext': 'mp4',
+ 'title': 'How "the robot" became the greatest novelty dance of all time',
+ 'description': 'md5:b081c0d588b8b2085870cda55e6da176',
+ 'upload_date': '20160324',
+ 'uploader_id': 'voxdotcom',
+ 'uploader': 'Vox',
+ },
+ 'add_ie': ['Youtube'],
+ }, {
+ # SBN.VideoLinkset.entryGroup multiple ooyala embeds
+ 'url': 'http://www.sbnation.com/college-football-recruiting/2015/2/3/7970291/national-signing-day-rationalizations-itll-be-ok-itll-be-ok',
+ 'info_dict': {
+ 'id': 'national-signing-day-rationalizations-itll-be-ok-itll-be-ok',
+ 'title': '25 lies you will tell yourself on National Signing Day',
+ 'description': 'It\'s the most self-delusional time of the year, and everyone\'s gonna tell the same lies together!',
+ },
+ 'playlist': [{
+ 'md5': '721fededf2ab74ae4176c8c8cbfe092e',
+ 'info_dict': {
+ 'id': 'p3cThlMjE61VDi_SD9JlIteSNPWVDBB9',
+ 'ext': 'mp4',
+ 'title': 'Buddy Hield vs Steph Curry (and the world)',
+ 'description': 'Let’s dissect only the most important Final Four storylines.',
+ },
+ }, {
+ 'md5': 'bf0c5cc115636af028be1bab79217ea9',
+ 'info_dict': {
+ 'id': 'BmbmVjMjE6esPHxdALGubTrouQ0jYLHj',
+ 'ext': 'mp4',
+ 'title': 'Chasing Cinderella 2016: Syracuse basketball',
+ 'description': 'md5:e02d56b026d51aa32c010676765a690d',
+ },
+ }],
+ }]
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ webpage = compat_urllib_parse_unquote(self._download_webpage(url, display_id))
+
+ def create_entry(provider_video_id, provider_video_type, title=None, description=None):
+ return {
+ '_type': 'url_transparent',
+ 'url': provider_video_id if provider_video_type == 'youtube' else '%s:%s' % (provider_video_type, provider_video_id),
+ 'title': title or self._og_search_title(webpage),
+ 'description': description or self._og_search_description(webpage),
+ }
+
+ entries = []
+ entries_data = self._search_regex([
+ r'Chorus\.VideoContext\.addVideo\((\[{.+}\])\);',
+ r'var\s+entry\s*=\s*({.+});',
+ r'SBN\.VideoLinkset\.entryGroup\(\s*(\[.+\])',
+ ], webpage, 'video data', default=None)
+ if entries_data:
+ entries_data = self._parse_json(entries_data, display_id)
+ if isinstance(entries_data, dict):
+ entries_data = [entries_data]
+ for video_data in entries_data:
+ provider_video_id = video_data.get('provider_video_id')
+ provider_video_type = video_data.get('provider_video_type')
+ if provider_video_id and provider_video_type:
+ entries.append(create_entry(
+ provider_video_id, provider_video_type,
+ video_data.get('title'), video_data.get('description')))
+
+ provider_video_id = self._search_regex(
+ r'data-ooyala-id="([^"]+)"', webpage, 'ooyala id', default=None)
+ if provider_video_id:
+ entries.append(create_entry(provider_video_id, 'ooyala'))
+
+ volume_uuid = self._search_regex(
+ r'data-volume-uuid="([^"]+)"', webpage, 'volume uuid', default=None)
+ if volume_uuid:
+ volume_webpage = self._download_webpage(
+ 'http://volume.vox-cdn.com/embed/%s' % volume_uuid, volume_uuid)
+ video_data = self._parse_json(self._search_regex(
+ r'Volume\.createVideo\(({.+})\s*,\s*{.*}\s*,\s*\[.*\]\s*,\s*{.*}\);', volume_webpage, 'video data'), volume_uuid)
+ for provider_video_type in ('ooyala', 'youtube'):
+ provider_video_id = video_data.get('%s_id' % provider_video_type)
+ if provider_video_id:
+ description = video_data.get('description_long') or video_data.get('description_short')
+ entries.append(create_entry(
+ provider_video_id, provider_video_type, video_data.get('title_short'), description))
+ break
+
+ if len(entries) == 1:
+ return entries[0]
+ else:
+ return self.playlist_result(entries, display_id, self._og_search_title(webpage), self._og_search_description(webpage))
diff --git a/youtube_dl/extractor/vporn.py b/youtube_dl/extractor/vporn.py
index 92c90e5..1557a0e 100644
--- a/youtube_dl/extractor/vporn.py
+++ b/youtube_dl/extractor/vporn.py
@@ -4,6 +4,7 @@ import re
from .common import InfoExtractor
from ..utils import (
+ ExtractorError,
parse_duration,
str_to_int,
)
@@ -27,7 +28,8 @@ class VpornIE(InfoExtractor):
'duration': 393,
'age_limit': 18,
'view_count': int,
- }
+ },
+ 'skip': 'video removed',
},
{
'url': 'http://www.vporn.com/female/hana-shower/523564/',
@@ -40,7 +42,7 @@ class VpornIE(InfoExtractor):
'description': 'Hana showers at the bathroom.',
'thumbnail': 're:^https?://.*\.jpg$',
'uploader': 'Hmmmmm',
- 'categories': ['Big Boobs', 'Erotic', 'Teen', 'Female'],
+ 'categories': ['Big Boobs', 'Erotic', 'Teen', 'Female', '720p'],
'duration': 588,
'age_limit': 18,
'view_count': int,
@@ -55,6 +57,10 @@ class VpornIE(InfoExtractor):
webpage = self._download_webpage(url, display_id)
+ errmsg = 'This video has been deleted due to Copyright Infringement or by the account owner!'
+ if errmsg in webpage:
+ raise ExtractorError('%s said: %s' % (self.IE_NAME, errmsg), expected=True)
+
title = self._html_search_regex(
r'videoname\s*=\s*\'([^\']+)\'', webpage, 'title').strip()
description = self._html_search_regex(
diff --git a/youtube_dl/extractor/vrt.py b/youtube_dl/extractor/vrt.py
index 2b6bae8..8e35f24 100644
--- a/youtube_dl/extractor/vrt.py
+++ b/youtube_dl/extractor/vrt.py
@@ -4,7 +4,10 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import float_or_none
+from ..utils import (
+ determine_ext,
+ float_or_none,
+)
class VRTIE(InfoExtractor):
@@ -53,6 +56,11 @@ class VRTIE(InfoExtractor):
}
},
{
+ # YouTube video
+ 'url': 'http://deredactie.be/cm/vrtnieuws/videozone/nieuws/cultuurenmedia/1.2622957',
+ 'only_matching': True,
+ },
+ {
'url': 'http://cobra.canvas.be/cm/cobra/videozone/rubriek/film-videozone/1.2377055',
'only_matching': True,
}
@@ -66,7 +74,17 @@ class VRTIE(InfoExtractor):
video_id = self._search_regex(
r'data-video-id="([^"]+)_[^"]+"', webpage, 'video id', fatal=False)
+ src = self._search_regex(
+ r'data-video-src="([^"]+)"', webpage, 'video src', default=None)
+
+ video_type = self._search_regex(
+ r'data-video-type="([^"]+)"', webpage, 'video type', default=None)
+
+ if video_type == 'YouTubeVideo':
+ return self.url_result(src, 'Youtube')
+
formats = []
+
mobj = re.search(
r'data-video-iphone-server="(?P<server>[^"]+)"\s+data-video-iphone-path="(?P<path>[^"]+)"',
webpage)
@@ -74,11 +92,15 @@ class VRTIE(InfoExtractor):
formats.extend(self._extract_m3u8_formats(
'%s/%s' % (mobj.group('server'), mobj.group('path')),
video_id, 'mp4', m3u8_id='hls', fatal=False))
- mobj = re.search(r'data-video-src="(?P<src>[^"]+)"', webpage)
- if mobj:
- formats.extend(self._extract_f4m_formats(
- '%s/manifest.f4m' % mobj.group('src'),
- video_id, f4m_id='hds', fatal=False))
+
+ if src:
+ if determine_ext(src) == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ src, video_id, 'mp4', entry_protocol='m3u8_native',
+ m3u8_id='hls', fatal=False))
+ else:
+ formats.extend(self._extract_f4m_formats(
+ '%s/manifest.f4m' % src, video_id, f4m_id='hds', fatal=False))
if not formats and 'data-video-geoblocking="true"' in webpage:
self.raise_geo_restricted('This video is only available in Belgium')
diff --git a/youtube_dl/extractor/vube.py b/youtube_dl/extractor/vube.py
index 149e364..10ca6ac 100644
--- a/youtube_dl/extractor/vube.py
+++ b/youtube_dl/extractor/vube.py
@@ -15,7 +15,7 @@ from ..utils import (
class VubeIE(InfoExtractor):
IE_NAME = 'vube'
IE_DESC = 'Vube.com'
- _VALID_URL = r'http://vube\.com/(?:[^/]+/)+(?P<id>[\da-zA-Z]{10})\b'
+ _VALID_URL = r'https?://vube\.com/(?:[^/]+/)+(?P<id>[\da-zA-Z]{10})\b'
_TESTS = [
{
diff --git a/youtube_dl/extractor/vuclip.py b/youtube_dl/extractor/vuclip.py
index a6d9b5f..eaa888f 100644
--- a/youtube_dl/extractor/vuclip.py
+++ b/youtube_dl/extractor/vuclip.py
@@ -14,7 +14,7 @@ from ..utils import (
class VuClipIE(InfoExtractor):
- _VALID_URL = r'http://(?:m\.)?vuclip\.com/w\?.*?cid=(?P<id>[0-9]+)'
+ _VALID_URL = r'https?://(?:m\.)?vuclip\.com/w\?.*?cid=(?P<id>[0-9]+)'
_TEST = {
'url': 'http://m.vuclip.com/w?cid=922692425&fid=70295&z=1010&nvar&frm=index.html',
diff --git a/youtube_dl/extractor/vulture.py b/youtube_dl/extractor/vulture.py
deleted file mode 100644
index faa167e..0000000
--- a/youtube_dl/extractor/vulture.py
+++ /dev/null
@@ -1,69 +0,0 @@
-from __future__ import unicode_literals
-
-import json
-import os.path
-import re
-
-from .common import InfoExtractor
-from ..utils import (
- int_or_none,
- parse_iso8601,
-)
-
-
-class VultureIE(InfoExtractor):
- IE_NAME = 'vulture.com'
- _VALID_URL = r'https?://video\.vulture\.com/video/(?P<display_id>[^/]+)/'
- _TEST = {
- 'url': 'http://video.vulture.com/video/Mindy-Kaling-s-Harvard-Speech/player?layout=compact&read_more=1',
- 'md5': '8d997845642a2b5152820f7257871bc8',
- 'info_dict': {
- 'id': '6GHRQL3RV7MSD1H4',
- 'ext': 'mp4',
- 'title': 'kaling-speech-2-MAGNIFY STANDARD CONTAINER REVISED',
- 'uploader_id': 'Sarah',
- 'thumbnail': 're:^http://.*\.jpg$',
- 'timestamp': 1401288564,
- 'upload_date': '20140528',
- 'description': 'Uplifting and witty, as predicted.',
- 'duration': 1015,
- }
- }
-
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- display_id = mobj.group('display_id')
-
- webpage = self._download_webpage(url, display_id)
- query_string = self._search_regex(
- r"queryString\s*=\s*'([^']+)'", webpage, 'query string')
- video_id = self._search_regex(
- r'content=([^&]+)', query_string, 'video ID')
- query_url = 'http://video.vulture.com/embed/player/container/1000/1000/?%s' % query_string
-
- query_webpage = self._download_webpage(
- query_url, display_id, note='Downloading query page')
- params_json = self._search_regex(
- r'(?sm)new MagnifyEmbeddablePlayer\({.*?contentItem:\s*(\{.*?\})\n?,\n',
- query_webpage,
- 'player params')
- params = json.loads(params_json)
-
- upload_timestamp = parse_iso8601(params['posted'].replace(' ', 'T'))
- uploader_id = params.get('user', {}).get('handle')
-
- media_item = params['media_item']
- title = os.path.splitext(media_item['title'])[0]
- duration = int_or_none(media_item.get('duration_seconds'))
-
- return {
- 'id': video_id,
- 'display_id': display_id,
- 'url': media_item['pipeline_xid'],
- 'title': title,
- 'timestamp': upload_timestamp,
- 'thumbnail': params.get('thumbnail_url'),
- 'uploader_id': uploader_id,
- 'description': params.get('description'),
- 'duration': duration,
- }
diff --git a/youtube_dl/extractor/walla.py b/youtube_dl/extractor/walla.py
index 24efbd6..8b94883 100644
--- a/youtube_dl/extractor/walla.py
+++ b/youtube_dl/extractor/walla.py
@@ -11,7 +11,7 @@ from ..utils import (
class WallaIE(InfoExtractor):
- _VALID_URL = r'http://vod\.walla\.co\.il/[^/]+/(?P<id>\d+)/(?P<display_id>.+)'
+ _VALID_URL = r'https?://vod\.walla\.co\.il/[^/]+/(?P<id>\d+)/(?P<display_id>.+)'
_TEST = {
'url': 'http://vod.walla.co.il/movie/2642630/one-direction-all-for-one',
'info_dict': {
diff --git a/youtube_dl/extractor/washingtonpost.py b/youtube_dl/extractor/washingtonpost.py
index ec8b999..839cad9 100644
--- a/youtube_dl/extractor/washingtonpost.py
+++ b/youtube_dl/extractor/washingtonpost.py
@@ -11,7 +11,96 @@ from ..utils import (
class WashingtonPostIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?washingtonpost\.com/.*?/(?P<id>[^/]+)/(?:$|[?#])'
+ IE_NAME = 'washingtonpost'
+ _VALID_URL = r'(?:washingtonpost:|https?://(?:www\.)?washingtonpost\.com/video/(?:[^/]+/)*)(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'
+ _TEST = {
+ 'url': 'https://www.washingtonpost.com/video/c/video/480ba4ee-1ec7-11e6-82c2-a7dcb313287d',
+ 'md5': '6f537e1334b714eb15f9563bd4b9cdfa',
+ 'info_dict': {
+ 'id': '480ba4ee-1ec7-11e6-82c2-a7dcb313287d',
+ 'ext': 'mp4',
+ 'title': 'Egypt finds belongings, debris from plane crash',
+ 'description': 'md5:a17ceee432f215a5371388c1f680bd86',
+ 'upload_date': '20160520',
+ 'uploader': 'Reuters',
+ 'timestamp': 1463778452,
+ },
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ video_data = self._download_json(
+ 'http://www.washingtonpost.com/posttv/c/videojson/%s?resType=jsonp' % video_id,
+ video_id, transform_source=strip_jsonp)[0]['contentConfig']
+ title = video_data['title']
+
+ urls = []
+ formats = []
+ for s in video_data.get('streams', []):
+ s_url = s.get('url')
+ if not s_url or s_url in urls:
+ continue
+ urls.append(s_url)
+ video_type = s.get('type')
+ if video_type == 'smil':
+ continue
+ elif video_type in ('ts', 'hls') and ('_master.m3u8' in s_url or '_mobile.m3u8' in s_url):
+ m3u8_formats = self._extract_m3u8_formats(
+ s_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)
+ for m3u8_format in m3u8_formats:
+ width = m3u8_format.get('width')
+ if not width:
+ continue
+ vbr = self._search_regex(
+ r'%d_%d_(\d+)' % (width, m3u8_format['height']), m3u8_format['url'], 'vbr', default=None)
+ if vbr:
+ m3u8_format.update({
+ 'vbr': int_or_none(vbr),
+ })
+ formats.extend(m3u8_formats)
+ else:
+ width = int_or_none(s.get('width'))
+ vbr = int_or_none(s.get('bitrate'))
+ has_width = width != 0
+ formats.append({
+ 'format_id': (
+ '%s-%d-%d' % (video_type, width, vbr)
+ if width
+ else video_type),
+ 'vbr': vbr if has_width else None,
+ 'width': width,
+ 'height': int_or_none(s.get('height')),
+ 'acodec': s.get('audioCodec'),
+ 'vcodec': s.get('videoCodec') if has_width else 'none',
+ 'filesize': int_or_none(s.get('fileSize')),
+ 'url': s_url,
+ 'ext': 'mp4',
+ 'protocol': 'm3u8_native' if video_type in ('ts', 'hls') else None,
+ })
+ source_media_url = video_data.get('sourceMediaURL')
+ if source_media_url:
+ formats.append({
+ 'format_id': 'source_media',
+ 'url': source_media_url,
+ })
+ self._sort_formats(
+ formats, ('width', 'height', 'vbr', 'filesize', 'tbr', 'format_id'))
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': video_data.get('blurb'),
+ 'uploader': video_data.get('credits', {}).get('source'),
+ 'formats': formats,
+ 'duration': int_or_none(video_data.get('videoDuration'), 100),
+ 'timestamp': int_or_none(
+ video_data.get('dateConfig', {}).get('dateFirstPublished'), 1000),
+ }
+
+
+class WashingtonPostArticleIE(InfoExtractor):
+ IE_NAME = 'washingtonpost:article'
+ _VALID_URL = r'https?://(?:www\.)?washingtonpost\.com/(?:[^/]+/)*(?P<id>[^/?#]+)'
_TESTS = [{
'url': 'http://www.washingtonpost.com/sf/national/2014/03/22/sinkhole-of-bureaucracy/',
'info_dict': {
@@ -63,6 +152,10 @@ class WashingtonPostIE(InfoExtractor):
}]
}]
+ @classmethod
+ def suitable(cls, url):
+ return False if WashingtonPostIE.suitable(url) else super(WashingtonPostArticleIE, cls).suitable(url)
+
def _real_extract(self, url):
page_id = self._match_id(url)
webpage = self._download_webpage(url, page_id)
@@ -74,54 +167,7 @@ class WashingtonPostIE(InfoExtractor):
<div\s+class="posttv-video-embed[^>]*?data-uuid=|
data-video-uuid=
)"([^"]+)"''', webpage)
- entries = []
- for i, uuid in enumerate(uuids, start=1):
- vinfo_all = self._download_json(
- 'http://www.washingtonpost.com/posttv/c/videojson/%s?resType=jsonp' % uuid,
- page_id,
- transform_source=strip_jsonp,
- note='Downloading information of video %d/%d' % (i, len(uuids))
- )
- vinfo = vinfo_all[0]['contentConfig']
- uploader = vinfo.get('credits', {}).get('source')
- timestamp = int_or_none(
- vinfo.get('dateConfig', {}).get('dateFirstPublished'), 1000)
-
- formats = [{
- 'format_id': (
- '%s-%s-%s' % (s.get('type'), s.get('width'), s.get('bitrate'))
- if s.get('width')
- else s.get('type')),
- 'vbr': s.get('bitrate') if s.get('width') != 0 else None,
- 'width': s.get('width'),
- 'height': s.get('height'),
- 'acodec': s.get('audioCodec'),
- 'vcodec': s.get('videoCodec') if s.get('width') != 0 else 'none',
- 'filesize': s.get('fileSize'),
- 'url': s.get('url'),
- 'ext': 'mp4',
- 'preference': -100 if s.get('type') == 'smil' else None,
- 'protocol': {
- 'MP4': 'http',
- 'F4F': 'f4m',
- }.get(s.get('type')),
- } for s in vinfo.get('streams', [])]
- source_media_url = vinfo.get('sourceMediaURL')
- if source_media_url:
- formats.append({
- 'format_id': 'source_media',
- 'url': source_media_url,
- })
- self._sort_formats(formats)
- entries.append({
- 'id': uuid,
- 'title': vinfo['title'],
- 'description': vinfo.get('blurb'),
- 'uploader': uploader,
- 'formats': formats,
- 'duration': int_or_none(vinfo.get('videoDuration'), 100),
- 'timestamp': timestamp,
- })
+ entries = [self.url_result('washingtonpost:%s' % uuid, 'WashingtonPost', uuid) for uuid in uuids]
return {
'_type': 'playlist',
diff --git a/youtube_dl/extractor/wat.py b/youtube_dl/extractor/wat.py
index 37cf3d3..de7d6b5 100644
--- a/youtube_dl/extractor/wat.py
+++ b/youtube_dl/extractor/wat.py
@@ -2,25 +2,26 @@
from __future__ import unicode_literals
import re
-import hashlib
from .common import InfoExtractor
+from ..compat import compat_str
from ..utils import (
ExtractorError,
unified_strdate,
+ HEADRequest,
+ float_or_none,
)
class WatIE(InfoExtractor):
- _VALID_URL = r'(?:wat:(?P<real_id>\d{8})|http://www\.wat\.tv/video/(?P<display_id>.*)-(?P<short_id>.*?)_.*?\.html)'
+ _VALID_URL = r'(?:wat:|https?://(?:www\.)?wat\.tv/video/.*-)(?P<id>[0-9a-z]+)'
IE_NAME = 'wat.tv'
_TESTS = [
{
'url': 'http://www.wat.tv/video/soupe-figues-l-orange-aux-epices-6z1uz_2hvf7_.html',
- 'md5': 'ce70e9223945ed26a8056d413ca55dc9',
+ 'md5': '83d882d9de5c9d97f0bb2c6273cde56a',
'info_dict': {
'id': '11713067',
- 'display_id': 'soupe-figues-l-orange-aux-epices',
'ext': 'mp4',
'title': 'Soupe de figues à l\'orange et aux épices',
'description': 'Retrouvez l\'émission "Petits plats en équilibre", diffusée le 18 août 2014.',
@@ -33,7 +34,6 @@ class WatIE(InfoExtractor):
'md5': 'fbc84e4378165278e743956d9c1bf16b',
'info_dict': {
'id': '11713075',
- 'display_id': 'gregory-lemarchal-voix-ange',
'ext': 'mp4',
'title': 'Grégory Lemarchal, une voix d\'ange depuis 10 ans (1/3)',
'description': 'md5:b7a849cf16a2b733d9cd10c52906dee3',
@@ -44,96 +44,85 @@ class WatIE(InfoExtractor):
},
]
- def download_video_info(self, real_id):
- # 'contentv4' is used in the website, but it also returns the related
- # videos, we don't need them
- info = self._download_json('http://www.wat.tv/interface/contentv3/' + real_id, real_id)
- return info['media']
-
def _real_extract(self, url):
- def real_id_for_chapter(chapter):
- return chapter['tc_start'].split('-')[0]
- mobj = re.match(self._VALID_URL, url)
- display_id = mobj.group('display_id')
- real_id = mobj.group('real_id')
- if not real_id:
- short_id = mobj.group('short_id')
- webpage = self._download_webpage(url, display_id or short_id)
- real_id = self._search_regex(r'xtpage = ".*-(.*?)";', webpage, 'real id')
+ video_id = self._match_id(url)
+ video_id = video_id if video_id.isdigit() and len(video_id) > 6 else compat_str(int(video_id, 36))
- video_info = self.download_video_info(real_id)
+ # 'contentv4' is used in the website, but it also returns the related
+ # videos, we don't need them
+ video_info = self._download_json(
+ 'http://www.wat.tv/interface/contentv3/' + video_id, video_id)['media']
error_desc = video_info.get('error_desc')
if error_desc:
raise ExtractorError(
'%s returned error: %s' % (self.IE_NAME, error_desc), expected=True)
- geo_list = video_info.get('geoList')
- country = geo_list[0] if geo_list else ''
-
chapters = video_info['chapters']
first_chapter = chapters[0]
- files = video_info['files']
- first_file = files[0]
- if real_id_for_chapter(first_chapter) != real_id:
- self.to_screen('Multipart video detected')
- chapter_urls = []
- for chapter in chapters:
- chapter_id = real_id_for_chapter(chapter)
- # Yes, when we this chapter is processed by WatIE,
- # it will download the info again
- chapter_info = self.download_video_info(chapter_id)
- chapter_urls.append(chapter_info['url'])
- entries = [self.url_result(chapter_url) for chapter_url in chapter_urls]
- return self.playlist_result(entries, real_id, video_info['title'])
+ def video_id_for_chapter(chapter):
+ return chapter['tc_start'].split('-')[0]
- upload_date = None
- if 'date_diffusion' in first_chapter:
- upload_date = unified_strdate(first_chapter['date_diffusion'])
+ if video_id_for_chapter(first_chapter) != video_id:
+ self.to_screen('Multipart video detected')
+ entries = [self.url_result('wat:%s' % video_id_for_chapter(chapter)) for chapter in chapters]
+ return self.playlist_result(entries, video_id, video_info['title'])
# Otherwise we can continue and extract just one part, we have to use
- # the short id for getting the video url
-
- formats = [{
- 'url': 'http://wat.tv/get/android5/%s.mp4' % real_id,
- 'format_id': 'Mobile',
- }]
-
- fmts = [('SD', 'web')]
- if first_file.get('hasHD'):
- fmts.append(('HD', 'webhd'))
-
- def compute_token(param):
- timestamp = '%08x' % int(self._download_webpage(
- 'http://www.wat.tv/servertime', real_id,
- 'Downloading server time').split('|')[0])
- magic = '9b673b13fa4682ed14c3cfa5af5310274b514c4133e9b3a81e6e3aba009l2564'
- return '%s/%s' % (hashlib.md5((magic + param + timestamp).encode('ascii')).hexdigest(), timestamp)
-
- for fmt in fmts:
- webid = '/%s/%s' % (fmt[1], real_id)
- video_url = self._download_webpage(
- 'http://www.wat.tv/get%s?token=%s&getURL=1&country=%s' % (webid, compute_token(webid), country),
- real_id,
- 'Downloading %s video URL' % fmt[0],
- 'Failed to download %s video URL' % fmt[0],
- False)
- if not video_url:
+ # the video id for getting the video url
+
+ date_diffusion = first_chapter.get('date_diffusion')
+ upload_date = unified_strdate(date_diffusion) if date_diffusion else None
+
+ def extract_url(path_template, url_type):
+ req_url = 'http://www.wat.tv/get/%s' % (path_template % video_id)
+ head = self._request_webpage(HEADRequest(req_url), video_id, 'Extracting %s url' % url_type)
+ red_url = head.geturl()
+ if req_url == red_url:
+ raise ExtractorError(
+ '%s said: Sorry, this video is not available from your country.' % self.IE_NAME,
+ expected=True)
+ return red_url
+
+ m3u8_url = extract_url('ipad/%s.m3u8', 'm3u8')
+ http_url = extract_url('android5/%s.mp4', 'http')
+
+ formats = []
+ m3u8_formats = self._extract_m3u8_formats(
+ m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls')
+ formats.extend(m3u8_formats)
+ formats.extend(self._extract_f4m_formats(
+ m3u8_url.replace('ios.', 'web.').replace('.m3u8', '.f4m'),
+ video_id, f4m_id='hds', fatal=False))
+ for m3u8_format in m3u8_formats:
+ mobj = re.search(
+ r'audio.*?%3D(\d+)(?:-video.*?%3D(\d+))?', m3u8_format['url'])
+ if not mobj:
continue
- formats.append({
- 'url': video_url,
- 'ext': 'mp4',
- 'format_id': fmt[0],
+ abr, vbr = mobj.groups()
+ abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
+ m3u8_format.update({
+ 'vbr': vbr,
+ 'abr': abr,
+ })
+ if not vbr or not abr:
+ continue
+ f = m3u8_format.copy()
+ f.update({
+ 'url': re.sub(r'%s-\d+00-\d+' % video_id, '%s-%d00-%d' % (video_id, round(vbr / 100), round(abr)), http_url),
+ 'format_id': f['format_id'].replace('hls', 'http'),
+ 'protocol': 'http',
})
+ formats.append(f)
+ self._sort_formats(formats)
return {
- 'id': real_id,
- 'display_id': display_id,
+ 'id': video_id,
'title': first_chapter['title'],
'thumbnail': first_chapter['preview'],
'description': first_chapter['description'],
'view_count': video_info['views'],
'upload_date': upload_date,
- 'duration': first_file['duration'],
+ 'duration': video_info['files'][0]['duration'],
'formats': formats,
}
diff --git a/youtube_dl/extractor/sexykarma.py b/youtube_dl/extractor/watchindianporn.py
index e334836..5d3b5bd 100644
--- a/youtube_dl/extractor/sexykarma.py
+++ b/youtube_dl/extractor/watchindianporn.py
@@ -11,61 +11,27 @@ from ..utils import (
)
-class SexyKarmaIE(InfoExtractor):
- IE_DESC = 'Sexy Karma and Watch Indian Porn'
- _VALID_URL = r'https?://(?:www\.)?(?:sexykarma\.com|watchindianporn\.net)/(?:[^/]+/)*video/(?P<display_id>[^/]+)-(?P<id>[a-zA-Z0-9]+)\.html'
- _TESTS = [{
- 'url': 'http://www.sexykarma.com/gonewild/video/taking-a-quick-pee-yHI70cOyIHt.html',
- 'md5': 'b9798e7d1ef1765116a8f516c8091dbd',
+class WatchIndianPornIE(InfoExtractor):
+ IE_DESC = 'Watch Indian Porn'
+ _VALID_URL = r'https?://(?:www\.)?watchindianporn\.net/(?:[^/]+/)*video/(?P<display_id>[^/]+)-(?P<id>[a-zA-Z0-9]+)\.html'
+ _TEST = {
+ 'url': 'http://www.watchindianporn.net/video/hot-milf-from-kerala-shows-off-her-gorgeous-large-breasts-on-camera-RZa2avywNPa.html',
+ 'md5': '249589a164dde236ec65832bfce17440',
'info_dict': {
- 'id': 'yHI70cOyIHt',
- 'display_id': 'taking-a-quick-pee',
+ 'id': 'RZa2avywNPa',
+ 'display_id': 'hot-milf-from-kerala-shows-off-her-gorgeous-large-breasts-on-camera',
'ext': 'mp4',
- 'title': 'Taking a quick pee.',
+ 'title': 'Hot milf from kerala shows off her gorgeous large breasts on camera',
'thumbnail': 're:^https?://.*\.jpg$',
- 'uploader': 'wildginger7',
- 'upload_date': '20141008',
- 'duration': 22,
+ 'uploader': 'LoveJay',
+ 'upload_date': '20160428',
+ 'duration': 226,
'view_count': int,
'comment_count': int,
'categories': list,
'age_limit': 18,
}
- }, {
- 'url': 'http://www.sexykarma.com/gonewild/video/pot-pixie-tribute-8Id6EZPbuHf.html',
- 'md5': 'dd216c68d29b49b12842b9babe762a5d',
- 'info_dict': {
- 'id': '8Id6EZPbuHf',
- 'display_id': 'pot-pixie-tribute',
- 'ext': 'mp4',
- 'title': 'pot_pixie tribute',
- 'thumbnail': 're:^https?://.*\.jpg$',
- 'uploader': 'banffite',
- 'upload_date': '20141013',
- 'duration': 16,
- 'view_count': int,
- 'comment_count': int,
- 'categories': list,
- 'age_limit': 18,
- }
- }, {
- 'url': 'http://www.watchindianporn.net/video/desi-dancer-namrata-stripping-completely-nude-and-dancing-on-a-hot-number-dW2mtctxJfs.html',
- 'md5': '9afb80675550406ed9a63ac2819ef69d',
- 'info_dict': {
- 'id': 'dW2mtctxJfs',
- 'display_id': 'desi-dancer-namrata-stripping-completely-nude-and-dancing-on-a-hot-number',
- 'ext': 'mp4',
- 'title': 'Desi dancer namrata stripping completely nude and dancing on a hot number',
- 'thumbnail': 're:^https?://.*\.jpg$',
- 'uploader': 'Don',
- 'upload_date': '20140213',
- 'duration': 83,
- 'view_count': int,
- 'comment_count': int,
- 'categories': list,
- 'age_limit': 18,
- }
- }]
+ }
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
@@ -109,6 +75,9 @@ class SexyKarmaIE(InfoExtractor):
'id': video_id,
'display_id': display_id,
'url': video_url,
+ 'http_headers': {
+ 'Referer': url,
+ },
'title': title,
'thumbnail': thumbnail,
'uploader': uploader,
diff --git a/youtube_dl/extractor/wayofthemaster.py b/youtube_dl/extractor/wayofthemaster.py
deleted file mode 100644
index af7bb8b..0000000
--- a/youtube_dl/extractor/wayofthemaster.py
+++ /dev/null
@@ -1,52 +0,0 @@
-from __future__ import unicode_literals
-
-import re
-
-from .common import InfoExtractor
-
-
-class WayOfTheMasterIE(InfoExtractor):
- _VALID_URL = r'https?://www\.wayofthemaster\.com/([^/?#]*/)*(?P<id>[^/?#]+)\.s?html(?:$|[?#])'
-
- _TEST = {
- 'url': 'http://www.wayofthemaster.com/hbks.shtml',
- 'md5': '5316b57487ada8480606a93cb3d18d24',
- 'info_dict': {
- 'id': 'hbks',
- 'ext': 'mp4',
- 'title': 'Intelligent Design vs. Evolution',
- },
- }
-
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
-
- webpage = self._download_webpage(url, video_id)
-
- title = self._search_regex(
- r'<img src="images/title_[^"]+".*?alt="([^"]+)"',
- webpage, 'title', default=None)
- if title is None:
- title = self._html_search_regex(
- r'<title>(.*?)</title>', webpage, 'page title')
-
- url_base = self._search_regex(
- r'<param\s+name="?movie"?\s+value=".*?/wotm_videoplayer_highlow[0-9]*\.swf\?vid=([^"]+)"',
- webpage, 'URL base')
- formats = [{
- 'format_id': 'low',
- 'quality': 1,
- 'url': url_base + '_low.mp4',
- }, {
- 'format_id': 'high',
- 'quality': 2,
- 'url': url_base + '_high.mp4',
- }]
- self._sort_formats(formats)
-
- return {
- 'id': video_id,
- 'title': title,
- 'formats': formats,
- }
diff --git a/youtube_dl/extractor/wdr.py b/youtube_dl/extractor/wdr.py
index a851578..390f9e8 100644
--- a/youtube_dl/extractor/wdr.py
+++ b/youtube_dl/extractor/wdr.py
@@ -1,215 +1,236 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
-import itertools
import re
from .common import InfoExtractor
-from ..compat import (
- compat_parse_qs,
- compat_urlparse,
-)
from ..utils import (
+ determine_ext,
+ ExtractorError,
+ js_to_json,
+ strip_jsonp,
unified_strdate,
- qualities,
+ update_url_query,
+ urlhandle_detect_ext,
)
-class WDRIE(InfoExtractor):
- _PLAYER_REGEX = '-(?:video|audio)player(?:_size-[LMS])?'
- _VALID_URL = r'(?P<url>https?://www\d?\.(?:wdr\d?|funkhauseuropa)\.de/)(?P<id>.+?)(?P<player>%s)?\.html' % _PLAYER_REGEX
+class WDRBaseIE(InfoExtractor):
+ def _extract_wdr_video(self, webpage, display_id):
+ # for wdr.de the data-extension is in a tag with the class "mediaLink"
+ # for wdr.de radio players, in a tag with the class "wdrrPlayerPlayBtn"
+ # for wdrmaus its in a link to the page in a multiline "videoLink"-tag
+ json_metadata = self._html_search_regex(
+ r'class=(?:"(?:mediaLink|wdrrPlayerPlayBtn)\b[^"]*"[^>]+|"videoLink\b[^"]*"[\s]*>\n[^\n]*)data-extension="([^"]+)"',
+ webpage, 'media link', default=None, flags=re.MULTILINE)
+
+ if not json_metadata:
+ return
+
+ media_link_obj = self._parse_json(json_metadata, display_id,
+ transform_source=js_to_json)
+ jsonp_url = media_link_obj['mediaObj']['url']
+
+ metadata = self._download_json(
+ jsonp_url, 'metadata', transform_source=strip_jsonp)
+
+ metadata_tracker_data = metadata['trackerData']
+ metadata_media_resource = metadata['mediaResource']
+
+ formats = []
+
+ # check if the metadata contains a direct URL to a file
+ for kind, media_resource in metadata_media_resource.items():
+ if kind not in ('dflt', 'alt'):
+ continue
+
+ for tag_name, medium_url in media_resource.items():
+ if tag_name not in ('videoURL', 'audioURL'):
+ continue
+
+ ext = determine_ext(medium_url)
+ if ext == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ medium_url, display_id, 'mp4', 'm3u8_native',
+ m3u8_id='hls'))
+ elif ext == 'f4m':
+ manifest_url = update_url_query(
+ medium_url, {'hdcore': '3.2.0', 'plugin': 'aasp-3.2.0.77.18'})
+ formats.extend(self._extract_f4m_formats(
+ manifest_url, display_id, f4m_id='hds', fatal=False))
+ elif ext == 'smil':
+ formats.extend(self._extract_smil_formats(
+ medium_url, 'stream', fatal=False))
+ else:
+ a_format = {
+ 'url': medium_url
+ }
+ if ext == 'unknown_video':
+ urlh = self._request_webpage(
+ medium_url, display_id, note='Determining extension')
+ ext = urlhandle_detect_ext(urlh)
+ a_format['ext'] = ext
+ formats.append(a_format)
+
+ self._sort_formats(formats)
+
+ subtitles = {}
+ caption_url = metadata_media_resource.get('captionURL')
+ if caption_url:
+ subtitles['de'] = [{
+ 'url': caption_url,
+ 'ext': 'ttml',
+ }]
+
+ title = metadata_tracker_data['trackerClipTitle']
+
+ return {
+ 'id': metadata_tracker_data.get('trackerClipId', display_id),
+ 'display_id': display_id,
+ 'title': title,
+ 'alt_title': metadata_tracker_data.get('trackerClipSubcategory'),
+ 'formats': formats,
+ 'subtitles': subtitles,
+ 'upload_date': unified_strdate(metadata_tracker_data.get('trackerClipAirTime')),
+ }
+
+
+class WDRIE(WDRBaseIE):
+ _CURRENT_MAUS_URL = r'https?://(?:www\.)wdrmaus.de/(?:[^/]+/){1,2}[^/?#]+\.php5'
+ _PAGE_REGEX = r'/(?:mediathek/)?[^/]+/(?P<type>[^/]+)/(?P<display_id>.+)\.html'
+ _VALID_URL = r'(?P<page_url>https?://(?:www\d\.)?wdr\d?\.de)' + _PAGE_REGEX + '|' + _CURRENT_MAUS_URL
_TESTS = [
{
- 'url': 'http://www1.wdr.de/mediathek/video/sendungen/servicezeit/videoservicezeit560-videoplayer_size-L.html',
+ 'url': 'http://www1.wdr.de/mediathek/video/sendungen/doku-am-freitag/video-geheimnis-aachener-dom-100.html',
+ # HDS download, MD5 is unstable
'info_dict': {
- 'id': 'mdb-362427',
+ 'id': 'mdb-1058683',
'ext': 'flv',
- 'title': 'Servicezeit',
- 'description': 'md5:c8f43e5e815eeb54d0b96df2fba906cb',
- 'upload_date': '20140310',
- 'is_live': False
+ 'display_id': 'doku-am-freitag/video-geheimnis-aachener-dom-100',
+ 'title': 'Geheimnis Aachener Dom',
+ 'alt_title': 'Doku am Freitag',
+ 'upload_date': '20160304',
+ 'description': 'md5:87be8ff14d8dfd7a7ee46f0299b52318',
+ 'is_live': False,
+ 'subtitles': {'de': [{
+ 'url': 'http://ondemand-ww.wdr.de/medp/fsk0/105/1058683/1058683_12220974.xml',
+ 'ext': 'ttml',
+ }]},
},
- 'params': {
- 'skip_download': True,
+ },
+ {
+ 'url': 'http://www1.wdr.de/mediathek/audio/wdr3/wdr3-gespraech-am-samstag/audio-schriftstellerin-juli-zeh-100.html',
+ 'md5': 'f4c1f96d01cf285240f53ea4309663d8',
+ 'info_dict': {
+ 'id': 'mdb-1072000',
+ 'ext': 'mp3',
+ 'display_id': 'wdr3-gespraech-am-samstag/audio-schriftstellerin-juli-zeh-100',
+ 'title': 'Schriftstellerin Juli Zeh',
+ 'alt_title': 'WDR 3 Gespräch am Samstag',
+ 'upload_date': '20160312',
+ 'description': 'md5:e127d320bc2b1f149be697ce044a3dd7',
+ 'is_live': False,
+ 'subtitles': {}
},
- 'skip': 'Page Not Found',
},
{
- 'url': 'http://www1.wdr.de/themen/av/videomargaspiegelisttot101-videoplayer.html',
+ 'url': 'http://www1.wdr.de/mediathek/video/live/index.html',
'info_dict': {
- 'id': 'mdb-363194',
- 'ext': 'flv',
- 'title': 'Marga Spiegel ist tot',
- 'description': 'md5:2309992a6716c347891c045be50992e4',
- 'upload_date': '20140311',
- 'is_live': False
+ 'id': 'mdb-103364',
+ 'ext': 'mp4',
+ 'display_id': 'index',
+ 'title': r're:^WDR Fernsehen im Livestream [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+ 'alt_title': 'WDR Fernsehen Live',
+ 'upload_date': None,
+ 'description': 'md5:ae2ff888510623bf8d4b115f95a9b7c9',
+ 'is_live': True,
+ 'subtitles': {}
},
'params': {
- 'skip_download': True,
+ 'skip_download': True, # m3u8 download
},
- 'skip': 'Page Not Found',
},
{
- 'url': 'http://www1.wdr.de/themen/kultur/audioerlebtegeschichtenmargaspiegel100-audioplayer.html',
- 'md5': '83e9e8fefad36f357278759870805898',
+ 'url': 'http://www1.wdr.de/mediathek/video/sendungen/aktuelle-stunde/aktuelle-stunde-120.html',
+ 'playlist_mincount': 8,
'info_dict': {
- 'id': 'mdb-194332',
- 'ext': 'mp3',
- 'title': 'Erlebte Geschichten: Marga Spiegel (29.11.2009)',
- 'description': 'md5:2309992a6716c347891c045be50992e4',
- 'upload_date': '20091129',
- 'is_live': False
+ 'id': 'aktuelle-stunde/aktuelle-stunde-120',
},
},
{
- 'url': 'http://www.funkhauseuropa.de/av/audioflaviacoelhoamaramar100-audioplayer.html',
- 'md5': '99a1443ff29af19f6c52cf6f4dc1f4aa',
+ 'url': 'http://www.wdrmaus.de/aktuelle-sendung/index.php5',
'info_dict': {
- 'id': 'mdb-478135',
- 'ext': 'mp3',
- 'title': 'Flavia Coelho: Amar é Amar',
- 'description': 'md5:7b29e97e10dfb6e265238b32fa35b23a',
- 'upload_date': '20140717',
- 'is_live': False
+ 'id': 'mdb-1096487',
+ 'ext': 'flv',
+ 'upload_date': 're:^[0-9]{8}$',
+ 'title': 're:^Die Sendung mit der Maus vom [0-9.]{10}$',
+ 'description': '- Die Sendung mit der Maus -',
},
- 'skip': 'Page Not Found',
+ 'skip': 'The id changes from week to week because of the new episode'
},
{
- 'url': 'http://www1.wdr.de/mediathek/video/sendungen/quarks_und_co/filterseite-quarks-und-co100.html',
- 'playlist_mincount': 146,
+ 'url': 'http://www.wdrmaus.de/sachgeschichten/sachgeschichten/achterbahn.php5',
+ 'md5': '803138901f6368ee497b4d195bb164f2',
'info_dict': {
- 'id': 'mediathek/video/sendungen/quarks_und_co/filterseite-quarks-und-co100',
- }
+ 'id': 'mdb-186083',
+ 'ext': 'mp4',
+ 'upload_date': '20130919',
+ 'title': 'Sachgeschichte - Achterbahn ',
+ 'description': '- Die Sendung mit der Maus -',
+ },
},
{
- 'url': 'http://www1.wdr.de/mediathek/video/livestream/index.html',
+ 'url': 'http://www1.wdr.de/radio/player/radioplayer116~_layout-popupVersion.html',
+ # Live stream, MD5 unstable
'info_dict': {
- 'id': 'mdb-103364',
- 'title': 're:^WDR Fernsehen Live [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
- 'description': 'md5:ae2ff888510623bf8d4b115f95a9b7c9',
+ 'id': 'mdb-869971',
'ext': 'flv',
- 'upload_date': '20150101',
- 'is_live': True
- },
- 'params': {
- 'skip_download': True,
+ 'title': 'Funkhaus Europa Livestream',
+ 'description': 'md5:2309992a6716c347891c045be50992e4',
+ 'upload_date': '20160101',
},
}
]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- page_url = mobj.group('url')
- page_id = mobj.group('id')
+ url_type = mobj.group('type')
+ page_url = mobj.group('page_url')
+ display_id = mobj.group('display_id')
+ webpage = self._download_webpage(url, display_id)
- webpage = self._download_webpage(url, page_id)
+ info_dict = self._extract_wdr_video(webpage, display_id)
- if mobj.group('player') is None:
+ if not info_dict:
entries = [
- self.url_result(page_url + href, 'WDR')
+ self.url_result(page_url + href[0], 'WDR')
for href in re.findall(
- r'<a href="/?(.+?%s\.html)" rel="nofollow"' % self._PLAYER_REGEX,
+ r'<a href="(%s)"[^>]+data-extension=' % self._PAGE_REGEX,
webpage)
]
if entries: # Playlist page
- return self.playlist_result(entries, page_id)
+ return self.playlist_result(entries, playlist_id=display_id)
- # Overview page
- entries = []
- for page_num in itertools.count(2):
- hrefs = re.findall(
- r'<li class="mediathekvideo"\s*>\s*<img[^>]*>\s*<a href="(/mediathek/video/[^"]+)"',
- webpage)
- entries.extend(
- self.url_result(page_url + href, 'WDR')
- for href in hrefs)
- next_url_m = re.search(
- r'<li class="nextToLast">\s*<a href="([^"]+)"', webpage)
- if not next_url_m:
- break
- next_url = page_url + next_url_m.group(1)
- webpage = self._download_webpage(
- next_url, page_id,
- note='Downloading playlist page %d' % page_num)
- return self.playlist_result(entries, page_id)
-
- flashvars = compat_parse_qs(self._html_search_regex(
- r'<param name="flashvars" value="([^"]+)"', webpage, 'flashvars'))
+ raise ExtractorError('No downloadable streams found', expected=True)
- page_id = flashvars['trackerClipId'][0]
- video_url = flashvars['dslSrc'][0]
- title = flashvars['trackerClipTitle'][0]
- thumbnail = flashvars['startPicture'][0] if 'startPicture' in flashvars else None
- is_live = flashvars.get('isLive', ['0'])[0] == '1'
+ is_live = url_type == 'live'
if is_live:
- title = self._live_title(title)
-
- if 'trackerClipAirTime' in flashvars:
- upload_date = flashvars['trackerClipAirTime'][0]
- else:
- upload_date = self._html_search_meta(
- 'DC.Date', webpage, 'upload date')
-
- if upload_date:
- upload_date = unified_strdate(upload_date)
-
- formats = []
- preference = qualities(['S', 'M', 'L', 'XL'])
-
- if video_url.endswith('.f4m'):
- formats.extend(self._extract_f4m_formats(
- video_url + '?hdcore=3.2.0&plugin=aasp-3.2.0.77.18', page_id,
- f4m_id='hds', fatal=False))
- elif video_url.endswith('.smil'):
- formats.extend(self._extract_smil_formats(
- video_url, page_id, False, {
- 'hdcore': '3.3.0',
- 'plugin': 'aasp-3.3.0.99.43',
- }))
- else:
- formats.append({
- 'url': video_url,
- 'http_headers': {
- 'User-Agent': 'mobile',
- },
+ info_dict.update({
+ 'title': self._live_title(info_dict['title']),
+ 'upload_date': None,
})
+ elif 'upload_date' not in info_dict:
+ info_dict['upload_date'] = unified_strdate(self._html_search_meta('DC.Date', webpage, 'upload date'))
- m3u8_url = self._search_regex(
- r'rel="adaptiv"[^>]+href="([^"]+)"',
- webpage, 'm3u8 url', default=None)
- if m3u8_url:
- formats.extend(self._extract_m3u8_formats(
- m3u8_url, page_id, 'mp4', 'm3u8_native',
- m3u8_id='hls', fatal=False))
+ info_dict.update({
+ 'description': self._html_search_meta('Description', webpage),
+ 'is_live': is_live,
+ })
- direct_urls = re.findall(
- r'rel="web(S|M|L|XL)"[^>]+href="([^"]+)"', webpage)
- if direct_urls:
- for quality, video_url in direct_urls:
- formats.append({
- 'url': video_url,
- 'preference': preference(quality),
- 'http_headers': {
- 'User-Agent': 'mobile',
- },
- })
-
- self._sort_formats(formats)
-
- description = self._html_search_meta('Description', webpage, 'description')
-
- return {
- 'id': page_id,
- 'formats': formats,
- 'title': title,
- 'description': description,
- 'thumbnail': thumbnail,
- 'upload_date': upload_date,
- 'is_live': is_live
- }
+ return info_dict
class WDRMobileIE(InfoExtractor):
@@ -241,81 +262,3 @@ class WDRMobileIE(InfoExtractor):
'User-Agent': 'mobile',
},
}
-
-
-class WDRMausIE(InfoExtractor):
- _VALID_URL = 'http://(?:www\.)?wdrmaus\.de/(?:[^/]+/){,2}(?P<id>[^/?#]+)(?:/index\.php5|(?<!index)\.php5|/(?:$|[?#]))'
- IE_DESC = 'Sendung mit der Maus'
- _TESTS = [{
- 'url': 'http://www.wdrmaus.de/aktuelle-sendung/index.php5',
- 'info_dict': {
- 'id': 'aktuelle-sendung',
- 'ext': 'mp4',
- 'thumbnail': 're:^http://.+\.jpg',
- 'upload_date': 're:^[0-9]{8}$',
- 'title': 're:^[0-9.]{10} - Aktuelle Sendung$',
- }
- }, {
- 'url': 'http://www.wdrmaus.de/sachgeschichten/sachgeschichten/40_jahre_maus.php5',
- 'md5': '3b1227ca3ed28d73ec5737c65743b2a3',
- 'info_dict': {
- 'id': '40_jahre_maus',
- 'ext': 'mp4',
- 'thumbnail': 're:^http://.+\.jpg',
- 'upload_date': '20131007',
- 'title': '12.03.2011 - 40 Jahre Maus',
- }
- }]
-
- def _real_extract(self, url):
- video_id = self._match_id(url)
-
- webpage = self._download_webpage(url, video_id)
- param_code = self._html_search_regex(
- r'<a href="\?startVideo=1&amp;([^"]+)"', webpage, 'parameters')
-
- title_date = self._search_regex(
- r'<div class="sendedatum"><p>Sendedatum:\s*([0-9\.]+)</p>',
- webpage, 'air date')
- title_str = self._html_search_regex(
- r'<h1>(.*?)</h1>', webpage, 'title')
- title = '%s - %s' % (title_date, title_str)
- upload_date = unified_strdate(
- self._html_search_meta('dc.date', webpage))
-
- fields = compat_parse_qs(param_code)
- video_url = fields['firstVideo'][0]
- thumbnail = compat_urlparse.urljoin(url, fields['startPicture'][0])
-
- formats = [{
- 'format_id': 'rtmp',
- 'url': video_url,
- }]
-
- jscode = self._download_webpage(
- 'http://www.wdrmaus.de/codebase/js/extended-medien.min.js',
- video_id, fatal=False,
- note='Downloading URL translation table',
- errnote='Could not download URL translation table')
- if jscode:
- for m in re.finditer(
- r"stream:\s*'dslSrc=(?P<stream>[^']+)',\s*download:\s*'(?P<dl>[^']+)'\s*\}",
- jscode):
- if video_url.startswith(m.group('stream')):
- http_url = video_url.replace(
- m.group('stream'), m.group('dl'))
- formats.append({
- 'format_id': 'http',
- 'url': http_url,
- })
- break
-
- self._sort_formats(formats)
-
- return {
- 'id': video_id,
- 'title': title,
- 'formats': formats,
- 'thumbnail': thumbnail,
- 'upload_date': upload_date,
- }
diff --git a/youtube_dl/extractor/webofstories.py b/youtube_dl/extractor/webofstories.py
index 2037d9b..7aea47e 100644
--- a/youtube_dl/extractor/webofstories.py
+++ b/youtube_dl/extractor/webofstories.py
@@ -12,38 +12,52 @@ class WebOfStoriesIE(InfoExtractor):
_VIDEO_DOMAIN = 'http://eu-mobile.webofstories.com/'
_GREAT_LIFE_STREAMER = 'rtmp://eu-cdn1.webofstories.com/cfx/st/'
_USER_STREAMER = 'rtmp://eu-users.webofstories.com/cfx/st/'
- _TESTS = [
- {
- 'url': 'http://www.webofstories.com/play/hans.bethe/71',
- 'md5': '373e4dd915f60cfe3116322642ddf364',
- 'info_dict': {
- 'id': '4536',
- 'ext': 'mp4',
- 'title': 'The temperature of the sun',
- 'thumbnail': 're:^https?://.*\.jpg$',
- 'description': 'Hans Bethe talks about calculating the temperature of the sun',
- 'duration': 238,
- }
+ _TESTS = [{
+ 'url': 'http://www.webofstories.com/play/hans.bethe/71',
+ 'md5': '373e4dd915f60cfe3116322642ddf364',
+ 'info_dict': {
+ 'id': '4536',
+ 'ext': 'mp4',
+ 'title': 'The temperature of the sun',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'description': 'Hans Bethe talks about calculating the temperature of the sun',
+ 'duration': 238,
+ }
+ }, {
+ 'url': 'http://www.webofstories.com/play/55908',
+ 'md5': '2985a698e1fe3211022422c4b5ed962c',
+ 'info_dict': {
+ 'id': '55908',
+ 'ext': 'mp4',
+ 'title': 'The story of Gemmata obscuriglobus',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'description': 'Planctomycete talks about The story of Gemmata obscuriglobus',
+ 'duration': 169,
+ },
+ 'skip': 'notfound',
+ }, {
+ # malformed og:title meta
+ 'url': 'http://www.webofstories.com/play/54215?o=MS',
+ 'info_dict': {
+ 'id': '54215',
+ 'ext': 'mp4',
+ 'title': '"A Leg to Stand On"',
+ 'thumbnail': 're:^https?://.*\.jpg$',
+ 'description': 'Oliver Sacks talks about the death and resurrection of a limb',
+ 'duration': 97,
},
- {
- 'url': 'http://www.webofstories.com/play/55908',
- 'md5': '2985a698e1fe3211022422c4b5ed962c',
- 'info_dict': {
- 'id': '55908',
- 'ext': 'mp4',
- 'title': 'The story of Gemmata obscuriglobus',
- 'thumbnail': 're:^https?://.*\.jpg$',
- 'description': 'Planctomycete talks about The story of Gemmata obscuriglobus',
- 'duration': 169,
- }
+ 'params': {
+ 'skip_download': True,
},
- ]
+ }]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- title = self._og_search_title(webpage)
+ # Sometimes og:title meta is malformed
+ title = self._og_search_title(webpage, default=None) or self._html_search_regex(
+ r'(?s)<strong>Title:\s*</strong>(.+?)<', webpage, 'title')
description = self._html_search_meta('description', webpage)
thumbnail = self._og_search_thumbnail(webpage)
diff --git a/youtube_dl/extractor/weibo.py b/youtube_dl/extractor/weibo.py
deleted file mode 100644
index 20bb039..0000000
--- a/youtube_dl/extractor/weibo.py
+++ /dev/null
@@ -1,49 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import re
-
-from .common import InfoExtractor
-
-
-class WeiboIE(InfoExtractor):
- """
- The videos in Weibo come from different sites, this IE just finds the link
- to the external video and returns it.
- """
- _VALID_URL = r'https?://video\.weibo\.com/v/weishipin/t_(?P<id>.+?)\.htm'
-
- _TEST = {
- 'url': 'http://video.weibo.com/v/weishipin/t_zjUw2kZ.htm',
- 'info_dict': {
- 'id': '98322879',
- 'ext': 'flv',
- 'title': '魔声耳机最新广告“All Eyes On Us”',
- },
- 'params': {
- 'skip_download': True,
- },
- 'add_ie': ['Sina'],
- }
-
- # Additional example videos from different sites
- # Youku: http://video.weibo.com/v/weishipin/t_zQGDWQ8.htm
- # 56.com: http://video.weibo.com/v/weishipin/t_zQ44HxN.htm
-
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url, flags=re.VERBOSE)
- video_id = mobj.group('id')
- info_url = 'http://video.weibo.com/?s=v&a=play_list&format=json&mix_video_id=t_%s' % video_id
- info = self._download_json(info_url, video_id)
-
- videos_urls = map(lambda v: v['play_page_url'], info['result']['data'])
- # Prefer sina video since they have thumbnails
- videos_urls = sorted(videos_urls, key=lambda u: 'video.sina.com' in u)
- player_url = videos_urls[-1]
- m_sina = re.match(r'https?://video\.sina\.com\.cn/v/b/(\d+)-\d+\.html',
- player_url)
- if m_sina is not None:
- self.to_screen('Sina video detected')
- sina_id = m_sina.group(1)
- player_url = 'http://you.video.sina.com.cn/swf/quotePlayer.swf?vid=%s' % sina_id
- return self.url_result(player_url)
diff --git a/youtube_dl/extractor/weiqitv.py b/youtube_dl/extractor/weiqitv.py
index e333ae3..3dafbee 100644
--- a/youtube_dl/extractor/weiqitv.py
+++ b/youtube_dl/extractor/weiqitv.py
@@ -6,7 +6,7 @@ from .common import InfoExtractor
class WeiqiTVIE(InfoExtractor):
IE_DESC = 'WQTV'
- _VALID_URL = r'http://www\.weiqitv\.com/index/video_play\?videoId=(?P<id>[A-Za-z0-9]+)'
+ _VALID_URL = r'https?://www\.weiqitv\.com/index/video_play\?videoId=(?P<id>[A-Za-z0-9]+)'
_TESTS = [{
'url': 'http://www.weiqitv.com/index/video_play?videoId=53c744f09874f0e76a8b46f3',
diff --git a/youtube_dl/extractor/wimp.py b/youtube_dl/extractor/wimp.py
index 041ff6c..54eb514 100644
--- a/youtube_dl/extractor/wimp.py
+++ b/youtube_dl/extractor/wimp.py
@@ -1,29 +1,33 @@
from __future__ import unicode_literals
-from .common import InfoExtractor
from .youtube import YoutubeIE
+from .jwplatform import JWPlatformBaseIE
-class WimpIE(InfoExtractor):
- _VALID_URL = r'http://(?:www\.)?wimp\.com/(?P<id>[^/]+)'
+class WimpIE(JWPlatformBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?wimp\.com/(?P<id>[^/]+)'
_TESTS = [{
- 'url': 'http://www.wimp.com/maruexhausted/',
+ 'url': 'http://www.wimp.com/maru-is-exhausted/',
'md5': 'ee21217ffd66d058e8b16be340b74883',
'info_dict': {
- 'id': 'maruexhausted',
+ 'id': 'maru-is-exhausted',
'ext': 'mp4',
'title': 'Maru is exhausted.',
'description': 'md5:57e099e857c0a4ea312542b684a869b8',
}
}, {
'url': 'http://www.wimp.com/clowncar/',
- 'md5': '4e2986c793694b55b37cf92521d12bb4',
+ 'md5': '5c31ad862a90dc5b1f023956faec13fe',
'info_dict': {
- 'id': 'clowncar',
- 'ext': 'mp4',
- 'title': 'It\'s like a clown car.',
- 'description': 'md5:0e56db1370a6e49c5c1d19124c0d2fb2',
+ 'id': 'cG4CEr2aiSg',
+ 'ext': 'webm',
+ 'title': 'Basset hound clown car...incredible!',
+ 'description': '5 of my Bassets crawled in this dog loo! www.bellinghambassets.com\n\nFor licensing/usage please contact: licensing(at)jukinmediadotcom',
+ 'upload_date': '20140303',
+ 'uploader': 'Gretchen Hoey',
+ 'uploader_id': 'gretchenandjeff1',
},
+ 'add_ie': ['Youtube'],
}]
def _real_extract(self, url):
@@ -41,14 +45,13 @@ class WimpIE(InfoExtractor):
'ie_key': YoutubeIE.ie_key(),
}
- video_url = self._search_regex(
- r'<video[^>]+>\s*<source[^>]+src=(["\'])(?P<url>.+?)\1',
- webpage, 'video URL', group='url')
+ info_dict = self._extract_jwplayer_data(
+ webpage, video_id, require_title=False)
- return {
+ info_dict.update({
'id': video_id,
- 'url': video_url,
'title': self._og_search_title(webpage),
- 'thumbnail': self._og_search_thumbnail(webpage),
'description': self._og_search_description(webpage),
- }
+ })
+
+ return info_dict
diff --git a/youtube_dl/extractor/wistia.py b/youtube_dl/extractor/wistia.py
index fdb16d9..c634b8d 100644
--- a/youtube_dl/extractor/wistia.py
+++ b/youtube_dl/extractor/wistia.py
@@ -3,63 +3,96 @@ from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
ExtractorError,
- sanitized_Request,
+ int_or_none,
+ float_or_none,
)
class WistiaIE(InfoExtractor):
- _VALID_URL = r'https?://(?:fast\.)?wistia\.net/embed/iframe/(?P<id>[a-z0-9]+)'
- _API_URL = 'http://fast.wistia.com/embed/medias/{0:}.json'
+ _VALID_URL = r'(?:wistia:|https?://(?:fast\.)?wistia\.net/embed/iframe/)(?P<id>[a-z0-9]+)'
+ _API_URL = 'http://fast.wistia.com/embed/medias/%s.json'
+ _IFRAME_URL = 'http://fast.wistia.net/embed/iframe/%s'
- _TEST = {
+ _TESTS = [{
'url': 'http://fast.wistia.net/embed/iframe/sh7fpupwlt',
'md5': 'cafeb56ec0c53c18c97405eecb3133df',
'info_dict': {
'id': 'sh7fpupwlt',
'ext': 'mov',
'title': 'Being Resourceful',
+ 'description': 'a Clients From Hell Video Series video from worldwidewebhosting',
+ 'upload_date': '20131204',
+ 'timestamp': 1386185018,
'duration': 117,
},
- }
+ }, {
+ 'url': 'wistia:sh7fpupwlt',
+ 'only_matching': True,
+ }, {
+ # with hls video
+ 'url': 'wistia:807fafadvk',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
video_id = self._match_id(url)
- request = sanitized_Request(self._API_URL.format(video_id))
- request.add_header('Referer', url) # Some videos require this.
- data_json = self._download_json(request, video_id)
+ data_json = self._download_json(
+ self._API_URL % video_id, video_id,
+ # Some videos require this.
+ headers={
+ 'Referer': url if url.startswith('http') else self._IFRAME_URL % video_id,
+ })
+
if data_json.get('error'):
- raise ExtractorError('Error while getting the playlist',
- expected=True)
+ raise ExtractorError(
+ 'Error while getting the playlist', expected=True)
+
data = data_json['media']
+ title = data['name']
formats = []
thumbnails = []
- for atype, a in data['assets'].items():
- if atype == 'still':
- thumbnails.append({
- 'url': a['url'],
- 'resolution': '%dx%d' % (a['width'], a['height']),
- })
+ for a in data['assets']:
+ aurl = a.get('url')
+ if not aurl:
continue
- if atype == 'preview':
+ astatus = a.get('status')
+ atype = a.get('type')
+ if (astatus is not None and astatus != 2) or atype in ('preview', 'storyboard'):
continue
- formats.append({
- 'format_id': atype,
- 'url': a['url'],
- 'width': a['width'],
- 'height': a['height'],
- 'filesize': a['size'],
- 'ext': a['ext'],
- 'preference': 1 if atype == 'original' else None,
- })
+ elif atype in ('still', 'still_image'):
+ thumbnails.append({
+ 'url': aurl,
+ 'width': int_or_none(a.get('width')),
+ 'height': int_or_none(a.get('height')),
+ })
+ else:
+ aext = a.get('ext')
+ is_m3u8 = a.get('container') == 'm3u8' or aext == 'm3u8'
+ formats.append({
+ 'format_id': atype,
+ 'url': aurl,
+ 'tbr': int_or_none(a.get('bitrate')),
+ 'vbr': int_or_none(a.get('opt_vbitrate')),
+ 'width': int_or_none(a.get('width')),
+ 'height': int_or_none(a.get('height')),
+ 'filesize': int_or_none(a.get('size')),
+ 'vcodec': a.get('codec'),
+ 'container': a.get('container'),
+ 'ext': 'mp4' if is_m3u8 else aext,
+ 'protocol': 'm3u8' if is_m3u8 else None,
+ 'preference': 1 if atype == 'original' else None,
+ })
self._sort_formats(formats)
return {
'id': video_id,
- 'title': data['name'],
+ 'title': title,
+ 'description': data.get('seoDescription'),
'formats': formats,
'thumbnails': thumbnails,
- 'duration': data.get('duration'),
+ 'duration': float_or_none(data.get('duration')),
+ 'timestamp': int_or_none(data.get('createdAt')),
}
diff --git a/youtube_dl/extractor/wrzuta.py b/youtube_dl/extractor/wrzuta.py
index c427649..bdd7097 100644
--- a/youtube_dl/extractor/wrzuta.py
+++ b/youtube_dl/extractor/wrzuta.py
@@ -5,8 +5,10 @@ import re
from .common import InfoExtractor
from ..utils import (
+ ExtractorError,
int_or_none,
qualities,
+ remove_start,
)
@@ -26,16 +28,17 @@ class WrzutaIE(InfoExtractor):
'uploader_id': 'laboratoriumdextera',
'description': 'md5:7fb5ef3c21c5893375fda51d9b15d9cd',
},
+ 'skip': 'Redirected to wrzuta.pl',
}, {
- 'url': 'http://jolka85.wrzuta.pl/audio/063jOPX5ue2/liber_natalia_szroeder_-_teraz_ty',
- 'md5': 'bc78077859bea7bcfe4295d7d7fc9025',
+ 'url': 'http://vexling.wrzuta.pl/audio/01xBFabGXu6/james_horner_-_into_the_na_39_vi_world_bonus',
+ 'md5': 'f80564fb5a2ec6ec59705ae2bf2ba56d',
'info_dict': {
- 'id': '063jOPX5ue2',
- 'ext': 'ogg',
- 'title': 'Liber & Natalia Szroeder - Teraz Ty',
- 'duration': 203,
- 'uploader_id': 'jolka85',
- 'description': 'md5:2d2b6340f9188c8c4cd891580e481096',
+ 'id': '01xBFabGXu6',
+ 'ext': 'mp3',
+ 'title': 'James Horner - Into The Na\'vi World [Bonus]',
+ 'description': 'md5:30a70718b2cd9df3120fce4445b0263b',
+ 'duration': 95,
+ 'uploader_id': 'vexling',
},
}]
@@ -45,7 +48,10 @@ class WrzutaIE(InfoExtractor):
typ = mobj.group('typ')
uploader = mobj.group('uploader')
- webpage = self._download_webpage(url, video_id)
+ webpage, urlh = self._download_webpage_handle(url, video_id)
+
+ if urlh.geturl() == 'http://www.wrzuta.pl/':
+ raise ExtractorError('Video removed', expected=True)
quality = qualities(['SD', 'MQ', 'HQ', 'HD'])
@@ -80,3 +86,73 @@ class WrzutaIE(InfoExtractor):
'description': self._og_search_description(webpage),
'age_limit': embedpage.get('minimalAge', 0),
}
+
+
+class WrzutaPlaylistIE(InfoExtractor):
+ """
+ this class covers extraction of wrzuta playlist entries
+ the extraction process bases on following steps:
+ * collect information of playlist size
+ * download all entries provided on
+ the playlist webpage (the playlist is split
+ on two pages: first directly reached from webpage
+ second: downloaded on demand by ajax call and rendered
+ using the ajax call response)
+ * in case size of extracted entries not reached total number of entries
+ use the ajax call to collect the remaining entries
+ """
+
+ IE_NAME = 'wrzuta.pl:playlist'
+ _VALID_URL = r'https?://(?P<uploader>[0-9a-zA-Z]+)\.wrzuta\.pl/playlista/(?P<id>[0-9a-zA-Z]+)'
+ _TESTS = [{
+ 'url': 'http://miromak71.wrzuta.pl/playlista/7XfO4vE84iR/moja_muza',
+ 'playlist_mincount': 14,
+ 'info_dict': {
+ 'id': '7XfO4vE84iR',
+ 'title': 'Moja muza',
+ },
+ }, {
+ 'url': 'http://heroesf70.wrzuta.pl/playlista/6Nj3wQHx756/lipiec_-_lato_2015_muzyka_swiata',
+ 'playlist_mincount': 144,
+ 'info_dict': {
+ 'id': '6Nj3wQHx756',
+ 'title': 'Lipiec - Lato 2015 Muzyka Świata',
+ },
+ }, {
+ 'url': 'http://miromak71.wrzuta.pl/playlista/7XfO4vE84iR',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ playlist_id = mobj.group('id')
+ uploader = mobj.group('uploader')
+
+ webpage = self._download_webpage(url, playlist_id)
+
+ playlist_size = int_or_none(self._html_search_regex(
+ (r'<div[^>]+class=["\']playlist-counter["\'][^>]*>\d+/(\d+)',
+ r'<div[^>]+class=["\']all-counter["\'][^>]*>(.+?)</div>'),
+ webpage, 'playlist size', default=None))
+
+ playlist_title = remove_start(
+ self._og_search_title(webpage), 'Playlista: ')
+
+ entries = []
+ if playlist_size:
+ entries = [
+ self.url_result(entry_url)
+ for _, entry_url in re.findall(
+ r'<a[^>]+href=(["\'])(http.+?)\1[^>]+class=["\']playlist-file-page',
+ webpage)]
+ if playlist_size > len(entries):
+ playlist_content = self._download_json(
+ 'http://%s.wrzuta.pl/xhr/get_playlist_offset/%s' % (uploader, playlist_id),
+ playlist_id,
+ 'Downloading playlist JSON',
+ 'Unable to download playlist JSON')
+ entries.extend([
+ self.url_result(entry['filelink'])
+ for entry in playlist_content.get('files', []) if entry.get('filelink')])
+
+ return self.playlist_result(entries, playlist_id, playlist_title)
diff --git a/youtube_dl/extractor/wsj.py b/youtube_dl/extractor/wsj.py
index 5a89737..a83e68b 100644
--- a/youtube_dl/extractor/wsj.py
+++ b/youtube_dl/extractor/wsj.py
@@ -4,16 +4,22 @@ from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
int_or_none,
+ float_or_none,
unified_strdate,
)
class WSJIE(InfoExtractor):
- _VALID_URL = r'https?://video-api\.wsj\.com/api-video/player/iframe\.html\?guid=(?P<id>[a-zA-Z0-9-]+)'
+ _VALID_URL = r'''(?x)https?://
+ (?:
+ video-api\.wsj\.com/api-video/player/iframe\.html\?guid=|
+ (?:www\.)?wsj\.com/video/[^/]+/
+ )
+ (?P<id>[a-zA-Z0-9-]+)'''
IE_DESC = 'Wall Street Journal'
- _TEST = {
+ _TESTS = [{
'url': 'http://video-api.wsj.com/api-video/player/iframe.html?guid=1BD01A4C-BFE8-40A5-A42F-8A8AF9898B1A',
- 'md5': '9747d7a6ebc2f4df64b981e1dde9efa9',
+ 'md5': 'e230a5bb249075e40793b655a54a02e4',
'info_dict': {
'id': '1BD01A4C-BFE8-40A5-A42F-8A8AF9898B1A',
'ext': 'mp4',
@@ -24,65 +30,60 @@ class WSJIE(InfoExtractor):
'duration': 90,
'title': 'Bills Coach Rex Ryan Updates His Old Jets Tattoo',
},
- }
+ }, {
+ 'url': 'http://www.wsj.com/video/can-alphabet-build-a-smarter-city/359DDAA8-9AC1-489C-82E6-0429C1E430E0.html',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
video_id = self._match_id(url)
- bitrates = [128, 174, 264, 320, 464, 664, 1264]
api_url = (
'http://video-api.wsj.com/api-video/find_all_videos.asp?'
- 'type=guid&count=1&query=%s&'
- 'fields=hls,adZone,thumbnailList,guid,state,secondsUntilStartTime,'
- 'author,description,name,linkURL,videoStillURL,duration,videoURL,'
- 'adCategory,catastrophic,linkShortURL,doctypeID,youtubeID,'
- 'titletag,rssURL,wsj-section,wsj-subsection,allthingsd-section,'
- 'allthingsd-subsection,sm-section,sm-subsection,provider,'
- 'formattedCreationDate,keywords,keywordsOmniture,column,editor,'
- 'emailURL,emailPartnerID,showName,omnitureProgramName,'
- 'omnitureVideoFormat,linkRelativeURL,touchCastID,'
- 'omniturePublishDate,%s') % (
- video_id, ','.join('video%dkMP4Url' % br for br in bitrates))
+ 'type=guid&count=1&query=%s&fields=type,hls,videoMP4List,'
+ 'thumbnailList,author,description,name,duration,videoURL,'
+ 'titletag,formattedCreationDate,keywords,editor' % video_id)
info = self._download_json(api_url, video_id)['items'][0]
-
- # Thumbnails are conveniently in the correct format already
- thumbnails = info.get('thumbnailList')
- creator = info.get('author')
- uploader_id = info.get('editor')
- categories = info.get('keywords')
- duration = int_or_none(info.get('duration'))
- upload_date = unified_strdate(
- info.get('formattedCreationDate'), day_first=False)
title = info.get('name', info.get('titletag'))
- formats = [{
- 'format_id': 'f4m',
- 'format_note': 'f4m (meta URL)',
- 'url': info['videoURL'],
- }]
- if info.get('hls'):
+ formats = []
+
+ f4m_url = info.get('videoURL')
+ if f4m_url:
+ formats.extend(self._extract_f4m_formats(
+ f4m_url, video_id, f4m_id='hds', fatal=False))
+
+ m3u8_url = info.get('hls')
+ if m3u8_url:
formats.extend(self._extract_m3u8_formats(
info['hls'], video_id, ext='mp4',
- preference=0, entry_protocol='m3u8_native'))
- for br in bitrates:
- field = 'video%dkMP4Url' % br
- if info.get(field):
- formats.append({
- 'format_id': 'mp4-%d' % br,
- 'container': 'mp4',
- 'tbr': br,
- 'url': info[field],
- })
+ entry_protocol='m3u8_native', m3u8_id='hls', fatal=False))
+
+ for v in info.get('videoMP4List', []):
+ mp4_url = v.get('url')
+ if not mp4_url:
+ continue
+ tbr = int_or_none(v.get('bitrate'))
+ formats.append({
+ 'url': mp4_url,
+ 'format_id': 'http' + ('-%d' % tbr if tbr else ''),
+ 'tbr': tbr,
+ 'width': int_or_none(v.get('width')),
+ 'height': int_or_none(v.get('height')),
+ 'fps': float_or_none(v.get('fps')),
+ })
self._sort_formats(formats)
return {
'id': video_id,
'formats': formats,
- 'thumbnails': thumbnails,
- 'creator': creator,
- 'uploader_id': uploader_id,
- 'duration': duration,
- 'upload_date': upload_date,
+ # Thumbnails are conveniently in the correct format already
+ 'thumbnails': info.get('thumbnailList'),
+ 'creator': info.get('author'),
+ 'uploader_id': info.get('editor'),
+ 'duration': int_or_none(info.get('duration')),
+ 'upload_date': unified_strdate(info.get(
+ 'formattedCreationDate'), day_first=False),
'title': title,
- 'categories': categories,
+ 'categories': info.get('keywords'),
}
diff --git a/youtube_dl/extractor/xbef.py b/youtube_dl/extractor/xbef.py
index 4ff99e5..e4a2baa 100644
--- a/youtube_dl/extractor/xbef.py
+++ b/youtube_dl/extractor/xbef.py
@@ -5,7 +5,7 @@ from ..compat import compat_urllib_parse_unquote
class XBefIE(InfoExtractor):
- _VALID_URL = r'http://(?:www\.)?xbef\.com/video/(?P<id>[0-9]+)'
+ _VALID_URL = r'https?://(?:www\.)?xbef\.com/video/(?P<id>[0-9]+)'
_TEST = {
'url': 'http://xbef.com/video/5119-glamourous-lesbians-smoking-drinking-and-fucking',
'md5': 'a478b565baff61634a98f5e5338be995',
diff --git a/youtube_dl/extractor/xboxclips.py b/youtube_dl/extractor/xboxclips.py
index 236ff40..b113ab1 100644
--- a/youtube_dl/extractor/xboxclips.py
+++ b/youtube_dl/extractor/xboxclips.py
@@ -12,7 +12,7 @@ from ..utils import (
class XboxClipsIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?xboxclips\.com/(?:video\.php\?.*vid=|[^/]+/)(?P<id>[\w-]{36})'
_TEST = {
- 'url': 'https://xboxclips.com/video.php?uid=2533274823424419&gamertag=Iabdulelah&vid=074a69a9-5faf-46aa-b93b-9909c1720325',
+ 'url': 'http://xboxclips.com/video.php?uid=2533274823424419&gamertag=Iabdulelah&vid=074a69a9-5faf-46aa-b93b-9909c1720325',
'md5': 'fbe1ec805e920aeb8eced3c3e657df5d',
'info_dict': {
'id': '074a69a9-5faf-46aa-b93b-9909c1720325',
diff --git a/youtube_dl/extractor/xfileshare.py b/youtube_dl/extractor/xfileshare.py
index a3236e6..995aada 100644
--- a/youtube_dl/extractor/xfileshare.py
+++ b/youtube_dl/extractor/xfileshare.py
@@ -4,31 +4,45 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..compat import compat_urllib_parse
from ..utils import (
+ decode_packed_codes,
ExtractorError,
- encode_dict,
int_or_none,
+ NO_DEFAULT,
sanitized_Request,
+ urlencode_postdata,
)
class XFileShareIE(InfoExtractor):
- IE_DESC = 'XFileShare based sites: GorillaVid.in, daclips.in, movpod.in, fastvideo.in, realvid.net, filehoot.com and vidto.me'
- _VALID_URL = r'''(?x)
- https?://(?P<host>(?:www\.)?
- (?:daclips\.in|gorillavid\.in|movpod\.in|fastvideo\.in|realvid\.net|filehoot\.com|vidto\.me))/
- (?:embed-)?(?P<id>[0-9a-zA-Z]+)(?:-[0-9]+x[0-9]+\.html)?
- '''
-
- _FILE_NOT_FOUND_REGEX = r'>(?:404 - )?File Not Found<'
+ _SITES = (
+ ('daclips.in', 'DaClips'),
+ ('filehoot.com', 'FileHoot'),
+ ('gorillavid.in', 'GorillaVid'),
+ ('movpod.in', 'MovPod'),
+ ('powerwatch.pw', 'PowerWatch'),
+ ('rapidvideo.ws', 'Rapidvideo.ws'),
+ ('thevideobee.to', 'TheVideoBee'),
+ ('vidto.me', 'Vidto'),
+ ('streamin.to', 'Streamin.To'),
+ ('xvidstage.com', 'XVIDSTAGE'),
+ )
+
+ IE_DESC = 'XFileShare based sites: %s' % ', '.join(list(zip(*_SITES))[1])
+ _VALID_URL = (r'https?://(?P<host>(?:www\.)?(?:%s))/(?:embed-)?(?P<id>[0-9a-zA-Z]+)'
+ % '|'.join(re.escape(site) for site in list(zip(*_SITES))[0]))
+
+ _FILE_NOT_FOUND_REGEXES = (
+ r'>(?:404 - )?File Not Found<',
+ r'>The file was removed by administrator<',
+ )
_TESTS = [{
'url': 'http://gorillavid.in/06y9juieqpmi',
'md5': '5ae4a3580620380619678ee4875893ba',
'info_dict': {
'id': '06y9juieqpmi',
- 'ext': 'flv',
+ 'ext': 'mp4',
'title': 'Rebecca Black My Moment Official Music Video Reaction-6GK87Rc8bzQ',
'thumbnail': 're:http://.*\.jpg',
},
@@ -45,25 +59,6 @@ class XFileShareIE(InfoExtractor):
'thumbnail': 're:http://.*\.jpg',
}
}, {
- # video with countdown timeout
- 'url': 'http://fastvideo.in/1qmdn1lmsmbw',
- 'md5': '8b87ec3f6564a3108a0e8e66594842ba',
- 'info_dict': {
- 'id': '1qmdn1lmsmbw',
- 'ext': 'mp4',
- 'title': 'Man of Steel - Trailer',
- 'thumbnail': 're:http://.*\.jpg',
- },
- }, {
- 'url': 'http://realvid.net/ctn2y6p2eviw',
- 'md5': 'b2166d2cf192efd6b6d764c18fd3710e',
- 'info_dict': {
- 'id': 'ctn2y6p2eviw',
- 'ext': 'flv',
- 'title': 'rdx 1955',
- 'thumbnail': 're:http://.*\.jpg',
- },
- }, {
'url': 'http://movpod.in/0wguyyxi1yca',
'only_matching': True,
}, {
@@ -73,7 +68,8 @@ class XFileShareIE(InfoExtractor):
'ext': 'mp4',
'title': 'youtube-dl test video \'äBaW_jenozKc.mp4.mp4',
'thumbnail': 're:http://.*\.jpg',
- }
+ },
+ 'skip': 'Video removed',
}, {
'url': 'http://vidto.me/ku5glz52nqe1.html',
'info_dict': {
@@ -81,6 +77,24 @@ class XFileShareIE(InfoExtractor):
'ext': 'mp4',
'title': 'test'
}
+ }, {
+ 'url': 'http://powerwatch.pw/duecjibvicbu',
+ 'info_dict': {
+ 'id': 'duecjibvicbu',
+ 'ext': 'mp4',
+ 'title': 'Big Buck Bunny trailer',
+ },
+ }, {
+ 'url': 'http://xvidstage.com/e0qcnl03co6z',
+ 'info_dict': {
+ 'id': 'e0qcnl03co6z',
+ 'ext': 'mp4',
+ 'title': 'Chucky Prank 2015.mp4',
+ },
+ }, {
+ # removed by administrator
+ 'url': 'http://xvidstage.com/amfy7atlkx25',
+ 'only_matching': True,
}]
def _real_extract(self, url):
@@ -90,7 +104,7 @@ class XFileShareIE(InfoExtractor):
url = 'http://%s/%s' % (mobj.group('host'), video_id)
webpage = self._download_webpage(url, video_id)
- if re.search(self._FILE_NOT_FOUND_REGEX, webpage) is not None:
+ if any(re.search(p, webpage) for p in self._FILE_NOT_FOUND_REGEXES):
raise ExtractorError('Video %s does not exist' % video_id, expected=True)
fields = self._hidden_inputs(webpage)
@@ -102,7 +116,7 @@ class XFileShareIE(InfoExtractor):
if countdown:
self._sleep(countdown, video_id)
- post = compat_urllib_parse.urlencode(encode_dict(fields))
+ post = urlencode_postdata(fields)
req = sanitized_Request(url, post)
req.add_header('Content-type', 'application/x-www-form-urlencoded')
@@ -112,13 +126,27 @@ class XFileShareIE(InfoExtractor):
title = (self._search_regex(
[r'style="z-index: [0-9]+;">([^<]+)</span>',
r'<td nowrap>([^<]+)</td>',
+ r'h4-fine[^>]*>([^<]+)<',
r'>Watch (.+) ',
r'<h2 class="video-page-head">([^<]+)</h2>'],
webpage, 'title', default=None) or self._og_search_title(webpage)).strip()
- video_url = self._search_regex(
- [r'file\s*:\s*["\'](http[^"\']+)["\'],',
- r'file_link\s*=\s*\'(https?:\/\/[0-9a-zA-z.\/\-_]+)'],
- webpage, 'file url')
+
+ def extract_video_url(default=NO_DEFAULT):
+ return self._search_regex(
+ (r'file\s*:\s*(["\'])(?P<url>http.+?)\1,',
+ r'file_link\s*=\s*(["\'])(?P<url>http.+?)\1',
+ r'addVariable\((\\?["\'])file\1\s*,\s*(\\?["\'])(?P<url>http.+?)\2\)',
+ r'<embed[^>]+src=(["\'])(?P<url>http.+?)\1'),
+ webpage, 'file url', default=default, group='url')
+
+ video_url = extract_video_url(default=None)
+
+ if not video_url:
+ webpage = decode_packed_codes(self._search_regex(
+ r"(}\('(.+)',(\d+),(\d+),'[^']*\b(?:file|embed)\b[^']*'\.split\('\|'\))",
+ webpage, 'packed code'))
+ video_url = extract_video_url()
+
thumbnail = self._search_regex(
r'image\s*:\s*["\'](http[^"\']+)["\'],', webpage, 'thumbnail', default=None)
diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py
index fd43e88..bd8e1af 100644
--- a/youtube_dl/extractor/xhamster.py
+++ b/youtube_dl/extractor/xhamster.py
@@ -4,6 +4,7 @@ import re
from .common import InfoExtractor
from ..utils import (
+ dict_get,
float_or_none,
int_or_none,
unified_strdate,
@@ -11,37 +12,52 @@ from ..utils import (
class XHamsterIE(InfoExtractor):
- _VALID_URL = r'(?P<proto>https?)://(?:.+?\.)?xhamster\.com/movies/(?P<id>[0-9]+)/(?P<seo>.+?)\.html(?:\?.*)?'
- _TESTS = [
- {
- 'url': 'http://xhamster.com/movies/1509445/femaleagent_shy_beauty_takes_the_bait.html',
- 'info_dict': {
- 'id': '1509445',
- 'ext': 'mp4',
- 'title': 'FemaleAgent Shy beauty takes the bait',
- 'upload_date': '20121014',
- 'uploader': 'Ruseful2011',
- 'duration': 893.52,
- 'age_limit': 18,
- }
+ _VALID_URL = r'(?P<proto>https?)://(?:.+?\.)?xhamster\.com/movies/(?P<id>[0-9]+)/(?P<seo>.*?)\.html(?:\?.*)?'
+ _TESTS = [{
+ 'url': 'http://xhamster.com/movies/1509445/femaleagent_shy_beauty_takes_the_bait.html',
+ 'md5': '8281348b8d3c53d39fffb377d24eac4e',
+ 'info_dict': {
+ 'id': '1509445',
+ 'ext': 'mp4',
+ 'title': 'FemaleAgent Shy beauty takes the bait',
+ 'upload_date': '20121014',
+ 'uploader': 'Ruseful2011',
+ 'duration': 893.52,
+ 'age_limit': 18,
},
- {
- 'url': 'http://xhamster.com/movies/2221348/britney_spears_sexy_booty.html?hd',
- 'info_dict': {
- 'id': '2221348',
- 'ext': 'mp4',
- 'title': 'Britney Spears Sexy Booty',
- 'upload_date': '20130914',
- 'uploader': 'jojo747400',
- 'duration': 200.48,
- 'age_limit': 18,
- }
+ }, {
+ 'url': 'http://xhamster.com/movies/2221348/britney_spears_sexy_booty.html?hd',
+ 'info_dict': {
+ 'id': '2221348',
+ 'ext': 'mp4',
+ 'title': 'Britney Spears Sexy Booty',
+ 'upload_date': '20130914',
+ 'uploader': 'jojo747400',
+ 'duration': 200.48,
+ 'age_limit': 18,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ # empty seo
+ 'url': 'http://xhamster.com/movies/5667973/.html',
+ 'info_dict': {
+ 'id': '5667973',
+ 'ext': 'mp4',
+ 'title': '....',
+ 'upload_date': '20160208',
+ 'uploader': 'parejafree',
+ 'duration': 72.0,
+ 'age_limit': 18,
},
- {
- 'url': 'https://xhamster.com/movies/2272726/amber_slayed_by_the_knight.html',
- 'only_matching': True,
+ 'params': {
+ 'skip_download': True,
},
- ]
+ }, {
+ 'url': 'https://xhamster.com/movies/2272726/amber_slayed_by_the_knight.html',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
def extract_video_url(webpage, name):
@@ -169,7 +185,13 @@ class XHamsterEmbedIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
video_url = self._search_regex(
- r'href="(https?://xhamster\.com/movies/%s/[^"]+\.html[^"]*)"' % video_id,
- webpage, 'xhamster url')
+ r'href="(https?://xhamster\.com/movies/%s/[^"]*\.html[^"]*)"' % video_id,
+ webpage, 'xhamster url', default=None)
+
+ if not video_url:
+ vars = self._parse_json(
+ self._search_regex(r'vars\s*:\s*({.+?})\s*,\s*\n', webpage, 'vars'),
+ video_id)
+ video_url = dict_get(vars, ('downloadLink', 'homepageLink', 'commentsLink', 'shareUrl'))
return self.url_result(video_url, 'XHamster')
diff --git a/youtube_dl/extractor/xiami.py b/youtube_dl/extractor/xiami.py
new file mode 100644
index 0000000..a6dfc4a
--- /dev/null
+++ b/youtube_dl/extractor/xiami.py
@@ -0,0 +1,168 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_urllib_parse_unquote
+from ..utils import int_or_none
+
+
+class XiamiBaseIE(InfoExtractor):
+ _API_BASE_URL = 'http://www.xiami.com/song/playlist/cat/json/id'
+
+ def _download_webpage(self, *args, **kwargs):
+ webpage = super(XiamiBaseIE, self)._download_webpage(*args, **kwargs)
+ if '>Xiami is currently not available in your country.<' in webpage:
+ self.raise_geo_restricted('Xiami is currently not available in your country')
+
+ def _extract_track(self, track, track_id=None):
+ title = track['title']
+ track_url = self._decrypt(track['location'])
+
+ subtitles = {}
+ lyrics_url = track.get('lyric_url') or track.get('lyric')
+ if lyrics_url and lyrics_url.startswith('http'):
+ subtitles['origin'] = [{'url': lyrics_url}]
+
+ return {
+ 'id': track.get('song_id') or track_id,
+ 'url': track_url,
+ 'title': title,
+ 'thumbnail': track.get('pic') or track.get('album_pic'),
+ 'duration': int_or_none(track.get('length')),
+ 'creator': track.get('artist', '').split(';')[0],
+ 'track': title,
+ 'album': track.get('album_name'),
+ 'artist': track.get('artist'),
+ 'subtitles': subtitles,
+ }
+
+ def _extract_tracks(self, item_id, typ=None):
+ playlist = self._download_json(
+ '%s/%s%s' % (self._API_BASE_URL, item_id, '/type/%s' % typ if typ else ''), item_id)
+ return [
+ self._extract_track(track, item_id)
+ for track in playlist['data']['trackList']]
+
+ @staticmethod
+ def _decrypt(origin):
+ n = int(origin[0])
+ origin = origin[1:]
+ short_lenth = len(origin) // n
+ long_num = len(origin) - short_lenth * n
+ l = tuple()
+ for i in range(0, n):
+ length = short_lenth
+ if i < long_num:
+ length += 1
+ l += (origin[0:length], )
+ origin = origin[length:]
+ ans = ''
+ for i in range(0, short_lenth + 1):
+ for j in range(0, n):
+ if len(l[j]) > i:
+ ans += l[j][i]
+ return compat_urllib_parse_unquote(ans).replace('^', '0')
+
+
+class XiamiSongIE(XiamiBaseIE):
+ IE_NAME = 'xiami:song'
+ IE_DESC = '虾米音乐'
+ _VALID_URL = r'https?://(?:www\.)?xiami\.com/song/(?P<id>[0-9]+)'
+ _TESTS = [{
+ 'url': 'http://www.xiami.com/song/1775610518',
+ 'md5': '521dd6bea40fd5c9c69f913c232cb57e',
+ 'info_dict': {
+ 'id': '1775610518',
+ 'ext': 'mp3',
+ 'title': 'Woman',
+ 'thumbnail': r're:http://img\.xiami\.net/images/album/.*\.jpg',
+ 'duration': 265,
+ 'creator': 'HONNE',
+ 'track': 'Woman',
+ 'album': 'Woman',
+ 'artist': 'HONNE',
+ 'subtitles': {
+ 'origin': [{
+ 'ext': 'lrc',
+ }],
+ },
+ },
+ 'skip': 'Georestricted',
+ }, {
+ 'url': 'http://www.xiami.com/song/1775256504',
+ 'md5': '932a3abd45c6aa2b1fdbe028fcb4c4fc',
+ 'info_dict': {
+ 'id': '1775256504',
+ 'ext': 'mp3',
+ 'title': '悟空',
+ 'thumbnail': r're:http://img\.xiami\.net/images/album/.*\.jpg',
+ 'duration': 200,
+ 'creator': '戴荃',
+ 'track': '悟空',
+ 'album': '悟空',
+ 'artist': '戴荃',
+ 'subtitles': {
+ 'origin': [{
+ 'ext': 'lrc',
+ }],
+ },
+ },
+ 'skip': 'Georestricted',
+ }]
+
+ def _real_extract(self, url):
+ return self._extract_tracks(self._match_id(url))[0]
+
+
+class XiamiPlaylistBaseIE(XiamiBaseIE):
+ def _real_extract(self, url):
+ item_id = self._match_id(url)
+ return self.playlist_result(self._extract_tracks(item_id, self._TYPE), item_id)
+
+
+class XiamiAlbumIE(XiamiPlaylistBaseIE):
+ IE_NAME = 'xiami:album'
+ IE_DESC = '虾米音乐 - 专辑'
+ _VALID_URL = r'https?://(?:www\.)?xiami\.com/album/(?P<id>[0-9]+)'
+ _TYPE = '1'
+ _TESTS = [{
+ 'url': 'http://www.xiami.com/album/2100300444',
+ 'info_dict': {
+ 'id': '2100300444',
+ },
+ 'playlist_count': 10,
+ 'skip': 'Georestricted',
+ }, {
+ 'url': 'http://www.xiami.com/album/512288?spm=a1z1s.6843761.1110925389.6.hhE9p9',
+ 'only_matching': True,
+ }]
+
+
+class XiamiArtistIE(XiamiPlaylistBaseIE):
+ IE_NAME = 'xiami:artist'
+ IE_DESC = '虾米音乐 - 歌手'
+ _VALID_URL = r'https?://(?:www\.)?xiami\.com/artist/(?P<id>[0-9]+)'
+ _TYPE = '2'
+ _TEST = {
+ 'url': 'http://www.xiami.com/artist/2132?spm=0.0.0.0.dKaScp',
+ 'info_dict': {
+ 'id': '2132',
+ },
+ 'playlist_count': 20,
+ 'skip': 'Georestricted',
+ }
+
+
+class XiamiCollectionIE(XiamiPlaylistBaseIE):
+ IE_NAME = 'xiami:collection'
+ IE_DESC = '虾米音乐 - 精选集'
+ _VALID_URL = r'https?://(?:www\.)?xiami\.com/collect/(?P<id>[0-9]+)'
+ _TYPE = '3'
+ _TEST = {
+ 'url': 'http://www.xiami.com/collect/156527391?spm=a1z1s.2943601.6856193.12.4jpBnr',
+ 'info_dict': {
+ 'id': '156527391',
+ },
+ 'playlist_mincount': 29,
+ 'skip': 'Georestricted',
+ }
diff --git a/youtube_dl/extractor/xminus.py b/youtube_dl/extractor/xminus.py
index 7c9d8af..36e5ead 100644
--- a/youtube_dl/extractor/xminus.py
+++ b/youtube_dl/extractor/xminus.py
@@ -2,15 +2,15 @@
from __future__ import unicode_literals
import re
+import time
from .common import InfoExtractor
from ..compat import (
- compat_chr,
compat_ord,
)
from ..utils import (
int_or_none,
- parse_filesize,
+ parse_duration,
)
@@ -22,7 +22,7 @@ class XMinusIE(InfoExtractor):
'info_dict': {
'id': '4542',
'ext': 'mp3',
- 'title': 'Леонид Агутин-Песенка шофера',
+ 'title': 'Леонид Агутин-Песенка шофёра',
'duration': 156,
'tbr': 320,
'filesize_approx': 5900000,
@@ -36,38 +36,41 @@ class XMinusIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
artist = self._html_search_regex(
- r'minus_track\.artist="(.+?)"', webpage, 'artist')
+ r'<a[^>]+href="/artist/\d+">([^<]+)</a>', webpage, 'artist')
title = artist + '-' + self._html_search_regex(
- r'minus_track\.title="(.+?)"', webpage, 'title')
- duration = int_or_none(self._html_search_regex(
- r'minus_track\.dur_sec=\'([0-9]*?)\'',
+ r'<span[^>]+class="minustrack-full-title(?:\s+[^"]+)?"[^>]*>([^<]+)', webpage, 'title')
+ duration = parse_duration(self._html_search_regex(
+ r'<span[^>]+class="player-duration(?:\s+[^"]+)?"[^>]*>([^<]+)',
webpage, 'duration', fatal=False))
- filesize_approx = parse_filesize(self._html_search_regex(
- r'<div id="finfo"[^>]*>\s*↓\s*([0-9.]+\s*[a-zA-Z][bB])',
- webpage, 'approximate filesize', fatal=False))
- tbr = int_or_none(self._html_search_regex(
- r'<div class="quality[^"]*"></div>\s*([0-9]+)\s*kbps',
- webpage, 'bitrate', fatal=False))
+ mobj = re.search(
+ r'<div[^>]+class="dw-info(?:\s+[^"]+)?"[^>]*>(?P<tbr>\d+)\s*кбит/c\s+(?P<filesize>[0-9.]+)\s*мб</div>',
+ webpage)
+ tbr = filesize_approx = None
+ if mobj:
+ filesize_approx = float(mobj.group('filesize')) * 1000000
+ tbr = float(mobj.group('tbr'))
view_count = int_or_none(self._html_search_regex(
- r'<div class="quality.*?► ([0-9]+)',
+ r'<span><[^>]+class="icon-chart-bar".*?>(\d+)</span>',
webpage, 'view count', fatal=False))
description = self._html_search_regex(
- r'(?s)<div id="song_texts">(.*?)</div><br',
+ r'(?s)<pre[^>]+id="lyrics-original"[^>]*>(.*?)</pre>',
webpage, 'song lyrics', fatal=False)
if description:
description = re.sub(' *\r *', '\n', description)
- enc_token = self._html_search_regex(
- r'minus_track\.s?tkn="(.+?)"', webpage, 'enc_token')
- token = ''.join(
- c if pos == 3 else compat_chr(compat_ord(c) - 1)
- for pos, c in enumerate(reversed(enc_token)))
- video_url = 'http://x-minus.org/dwlf/%s/%s.mp3' % (video_id, token)
+ k = self._search_regex(
+ r'<div[^>]+id="player-bottom"[^>]+data-k="([^"]+)">', webpage,
+ 'encoded data')
+ h = time.time() / 3600
+ a = sum(map(int, [compat_ord(c) for c in k])) + int(video_id) + h
+ video_url = 'http://x-minus.me/dl/minus?id=%s&tkn2=%df%d' % (video_id, a, h)
return {
'id': video_id,
'title': title,
'url': video_url,
+ # The extension is unknown until actual downloading
+ 'ext': 'mp3',
'duration': duration,
'filesize_approx': filesize_approx,
'tbr': tbr,
diff --git a/youtube_dl/extractor/xnxx.py b/youtube_dl/extractor/xnxx.py
index 5a41f8f..bcb1403 100644
--- a/youtube_dl/extractor/xnxx.py
+++ b/youtube_dl/extractor/xnxx.py
@@ -6,17 +6,23 @@ from ..compat import compat_urllib_parse_unquote
class XNXXIE(InfoExtractor):
- _VALID_URL = r'^https?://(?:video|www)\.xnxx\.com/video(?P<id>[0-9]+)/(.*)'
- _TEST = {
- 'url': 'http://video.xnxx.com/video1135332/lida_naked_funny_actress_5_',
- 'md5': '0831677e2b4761795f68d417e0b7b445',
+ _VALID_URL = r'https?://(?:video|www)\.xnxx\.com/video-?(?P<id>[0-9a-z]+)/'
+ _TESTS = [{
+ 'url': 'http://www.xnxx.com/video-55awb78/skyrim_test_video',
+ 'md5': 'ef7ecee5af78f8b03dca2cf31341d3a0',
'info_dict': {
- 'id': '1135332',
+ 'id': '55awb78',
'ext': 'flv',
- 'title': 'lida » Naked Funny Actress (5)',
+ 'title': 'Skyrim Test Video',
'age_limit': 18,
- }
- }
+ },
+ }, {
+ 'url': 'http://video.xnxx.com/video1135332/lida_naked_funny_actress_5_',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.xnxx.com/video-55awb78/',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
video_id = self._match_id(url)
diff --git a/youtube_dl/extractor/xuite.py b/youtube_dl/extractor/xuite.py
index 2466410..0be8932 100644
--- a/youtube_dl/extractor/xuite.py
+++ b/youtube_dl/extractor/xuite.py
@@ -66,6 +66,7 @@ class XuiteIE(InfoExtractor):
'uploader_id': '242127761',
'categories': ['電玩動漫'],
},
+ 'skip': 'Video removed',
}, {
'url': 'http://vlog.xuite.net/play/S1dDUjdyLTMyOTc3NjcuZmx2/%E5%AD%AB%E7%87%95%E5%A7%BF-%E7%9C%BC%E6%B7%9A%E6%88%90%E8%A9%A9',
'only_matching': True,
diff --git a/youtube_dl/extractor/xvideos.py b/youtube_dl/extractor/xvideos.py
index 710ad50..1dfe031 100644
--- a/youtube_dl/extractor/xvideos.py
+++ b/youtube_dl/extractor/xvideos.py
@@ -8,7 +8,6 @@ from ..utils import (
clean_html,
ExtractorError,
determine_ext,
- sanitized_Request,
)
@@ -25,8 +24,6 @@ class XVideosIE(InfoExtractor):
}
}
- _ANDROID_USER_AGENT = 'Mozilla/5.0 (Linux; Android 4.0.4; Galaxy Nexus Build/IMM76B) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.133 Mobile Safari/535.19'
-
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
@@ -35,31 +32,34 @@ class XVideosIE(InfoExtractor):
if mobj:
raise ExtractorError('%s said: %s' % (self.IE_NAME, clean_html(mobj.group(1))), expected=True)
- video_url = compat_urllib_parse_unquote(
- self._search_regex(r'flv_url=(.+?)&', webpage, 'video URL'))
video_title = self._html_search_regex(
r'<title>(.*?)\s+-\s+XVID', webpage, 'title')
video_thumbnail = self._search_regex(
r'url_bigthumb=(.+?)&amp', webpage, 'thumbnail', fatal=False)
- formats = [{
- 'url': video_url,
- }]
+ formats = []
- android_req = sanitized_Request(url)
- android_req.add_header('User-Agent', self._ANDROID_USER_AGENT)
- android_webpage = self._download_webpage(android_req, video_id, fatal=False)
+ video_url = compat_urllib_parse_unquote(self._search_regex(
+ r'flv_url=(.+?)&', webpage, 'video URL', default=''))
+ if video_url:
+ formats.append({'url': video_url})
- if android_webpage is not None:
- player_params_str = self._search_regex(
- 'mobileReplacePlayerDivTwoQual\(([^)]+)\)',
- android_webpage, 'player parameters', default='')
- player_params = list(map(lambda s: s.strip(' \''), player_params_str.split(',')))
- if player_params:
- formats.extend([{
- 'url': param,
- 'preference': -10,
- } for param in player_params if determine_ext(param) == 'mp4'])
+ player_args = self._search_regex(
+ r'(?s)new\s+HTML5Player\((.+?)\)', webpage, ' html5 player', default=None)
+ if player_args:
+ for arg in player_args.split(','):
+ format_url = self._search_regex(
+ r'(["\'])(?P<url>https?://.+?)\1', arg, 'url',
+ default=None, group='url')
+ if not format_url:
+ continue
+ ext = determine_ext(format_url)
+ if ext == 'mp4':
+ formats.append({'url': format_url})
+ elif ext == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ format_url, video_id, 'mp4',
+ entry_protocol='m3u8_native', m3u8_id='hls', fatal=False))
self._sort_formats(formats)
@@ -67,7 +67,6 @@ class XVideosIE(InfoExtractor):
'id': video_id,
'formats': formats,
'title': video_title,
- 'ext': 'flv',
'thumbnail': video_thumbnail,
'age_limit': 18,
}
diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py
index 4c61429..927a964 100644
--- a/youtube_dl/extractor/yahoo.py
+++ b/youtube_dl/extractor/yahoo.py
@@ -8,6 +8,7 @@ import re
from .common import InfoExtractor, SearchInfoExtractor
from ..compat import (
compat_urllib_parse,
+ compat_urllib_parse_urlencode,
compat_urlparse,
)
from ..utils import (
@@ -23,7 +24,7 @@ from .nbc import NBCSportsVPlayerIE
class YahooIE(InfoExtractor):
IE_DESC = 'Yahoo screen and movies'
- _VALID_URL = r'(?P<url>(?P<host>https?://(?:[a-zA-Z]{2}\.)?[\da-zA-Z_-]+\.yahoo\.com)/(?:[^/]+/)*(?P<display_id>.+)?-(?P<id>[0-9]+)(?:-[a-z]+)?\.html)'
+ _VALID_URL = r'(?P<url>(?P<host>https?://(?:[a-zA-Z]{2}\.)?[\da-zA-Z_-]+\.yahoo\.com)/(?:[^/]+/)*(?P<display_id>.+)?-(?P<id>[0-9]+)(?:-[a-z]+)?(?:\.html)?)'
_TESTS = [
{
'url': 'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html',
@@ -37,7 +38,7 @@ class YahooIE(InfoExtractor):
},
{
'url': 'http://screen.yahoo.com/wired/codefellas-s1-ep12-cougar-lies-103000935.html',
- 'md5': 'd6e6fc6e1313c608f316ddad7b82b306',
+ 'md5': 'c3466d2b6d5dd6b9f41ba9ed04c24b23',
'info_dict': {
'id': 'd1dedf8c-d58c-38c3-8963-e899929ae0a9',
'ext': 'mp4',
@@ -48,7 +49,7 @@ class YahooIE(InfoExtractor):
},
{
'url': 'https://screen.yahoo.com/community/community-sizzle-reel-203225340.html?format=embed',
- 'md5': '60e8ac193d8fb71997caa8fce54c6460',
+ 'md5': '75ffabdb87c16d4ffe8c036dc4d1c136',
'info_dict': {
'id': '4fe78544-8d48-39d8-97cd-13f205d9fcdb',
'ext': 'mp4',
@@ -58,15 +59,15 @@ class YahooIE(InfoExtractor):
}
},
{
- 'url': 'https://tw.screen.yahoo.com/election-2014-askmayor/敢問市長-黃秀霜批賴清德-非常高傲-033009720.html',
- 'md5': '3a09cf59349cfaddae1797acc3c087fc',
+ 'url': 'https://tw.news.yahoo.com/%E6%95%A2%E5%95%8F%E5%B8%82%E9%95%B7%20%E9%BB%83%E7%A7%80%E9%9C%9C%E6%89%B9%E8%B3%B4%E6%B8%85%E5%BE%B7%20%E9%9D%9E%E5%B8%B8%E9%AB%98%E5%82%B2-034024051.html',
+ 'md5': '9035d38f88b1782682a3e89f985be5bb',
'info_dict': {
'id': 'cac903b3-fcf4-3c14-b632-643ab541712f',
'ext': 'mp4',
'title': '敢問市長/黃秀霜批賴清德「非常高傲」',
'description': '直言台南沒捷運 交通居五都之末',
'duration': 396,
- }
+ },
},
{
'url': 'https://uk.screen.yahoo.com/editor-picks/cute-raccoon-freed-drain-using-091756545.html',
@@ -88,17 +89,32 @@ class YahooIE(InfoExtractor):
'title': 'Program that makes hockey more affordable not offered in Manitoba',
'description': 'md5:c54a609f4c078d92b74ffb9bf1f496f4',
'duration': 121,
- }
+ },
+ 'skip': 'Video gone',
}, {
'url': 'https://ca.finance.yahoo.com/news/hackers-sony-more-trouble-well-154609075.html',
- 'md5': '226a895aae7e21b0129e2a2006fe9690',
'info_dict': {
- 'id': 'e624c4bc-3389-34de-9dfc-025f74943409',
- 'ext': 'mp4',
- 'title': '\'The Interview\' TV Spot: War',
- 'description': 'The Interview',
- 'duration': 30,
- }
+ 'id': '154609075',
+ },
+ 'playlist': [{
+ 'md5': 'f8e336c6b66f503282e5f719641d6565',
+ 'info_dict': {
+ 'id': 'e624c4bc-3389-34de-9dfc-025f74943409',
+ 'ext': 'mp4',
+ 'title': '\'The Interview\' TV Spot: War',
+ 'description': 'The Interview',
+ 'duration': 30,
+ },
+ }, {
+ 'md5': '958bcb90b4d6df71c56312137ee1cd5a',
+ 'info_dict': {
+ 'id': '1fc8ada0-718e-3abe-a450-bf31f246d1a9',
+ 'ext': 'mp4',
+ 'title': '\'The Interview\' TV Spot: Guys',
+ 'description': 'The Interview',
+ 'duration': 30,
+ },
+ }],
}, {
'url': 'http://news.yahoo.com/video/china-moses-crazy-blues-104538833.html',
'md5': '88e209b417f173d86186bef6e4d1f160',
@@ -118,10 +134,11 @@ class YahooIE(InfoExtractor):
'title': 'Connect the Dots: Dark Side of Virgo',
'description': 'md5:1428185051cfd1949807ad4ff6d3686a',
'duration': 201,
- }
+ },
+ 'skip': 'Domain name in.lifestyle.yahoo.com gone',
}, {
'url': 'https://www.yahoo.com/movies/v/true-story-trailer-173000497.html',
- 'md5': '989396ae73d20c6f057746fb226aa215',
+ 'md5': 'b17ac378b1134fa44370fb27db09a744',
'info_dict': {
'id': '071c4013-ce30-3a93-a5b2-e0413cd4a9d1',
'ext': 'mp4',
@@ -140,6 +157,9 @@ class YahooIE(InfoExtractor):
'ext': 'flv',
'description': 'md5:df390f70a9ba7c95ff1daace988f0d8d',
'title': 'Tyler Kalinoski hits buzzer-beater to lift Davidson',
+ 'upload_date': '20150313',
+ 'uploader': 'NBCU-SPORTS',
+ 'timestamp': 1426270238,
}
}, {
'url': 'https://tw.news.yahoo.com/-100120367.html',
@@ -147,7 +167,7 @@ class YahooIE(InfoExtractor):
}, {
# Query result is embedded in webpage, but explicit request to video API fails with geo restriction
'url': 'https://screen.yahoo.com/community/communitary-community-episode-1-ladders-154501237.html',
- 'md5': '4fbafb9c9b6f07aa8f870629f6671b35',
+ 'md5': '1ddbf7c850777548438e5c4f147c7b8c',
'info_dict': {
'id': '1f32853c-a271-3eef-8cb6-f6d6872cb504',
'ext': 'mp4',
@@ -165,6 +185,17 @@ class YahooIE(InfoExtractor):
'description': 'While they play feuding fathers in \'Daddy\'s Home,\' star Will Ferrell & Mark Wahlberg share their true feelings on parenthood.',
},
},
+ {
+ # config['models']['applet_model']['data']['sapi'] has no query
+ 'url': 'https://www.yahoo.com/music/livenation/event/galactic-2016',
+ 'md5': 'dac0c72d502bc5facda80c9e6d5c98db',
+ 'info_dict': {
+ 'id': 'a6015640-e9e5-3efb-bb60-05589a183919',
+ 'ext': 'mp4',
+ 'description': 'Galactic',
+ 'title': 'Dolla Diva (feat. Maggie Koerner)',
+ },
+ },
]
def _real_extract(self, url):
@@ -173,19 +204,26 @@ class YahooIE(InfoExtractor):
page_id = mobj.group('id')
url = mobj.group('url')
host = mobj.group('host')
- webpage = self._download_webpage(url, display_id)
+ webpage, urlh = self._download_webpage_handle(url, display_id)
+ if 'err=404' in urlh.geturl():
+ raise ExtractorError('Video gone', expected=True)
# Look for iframed media first
- iframe_m = re.search(r'<iframe[^>]+src="(/video/.+?-\d+\.html\?format=embed.*?)"', webpage)
- if iframe_m:
+ entries = []
+ iframe_urls = re.findall(r'<iframe[^>]+src="(/video/.+?-\d+\.html\?format=embed.*?)"', webpage)
+ for idx, iframe_url in enumerate(iframe_urls):
iframepage = self._download_webpage(
- host + iframe_m.group(1), display_id, 'Downloading iframe webpage')
+ host + iframe_url, display_id,
+ note='Downloading iframe webpage for video #%d' % idx)
items_json = self._search_regex(
r'mediaItems: (\[.+?\])$', iframepage, 'items', flags=re.MULTILINE, default=None)
if items_json:
items = json.loads(items_json)
video_id = items[0]['id']
- return self._get_info(video_id, display_id, webpage)
+ entries.append(self._get_info(video_id, display_id, webpage))
+ if entries:
+ return self.playlist_result(entries, page_id)
+
# Look for NBCSports iframes
nbc_sports_url = NBCSportsVPlayerIE._extract_url(webpage)
if nbc_sports_url:
@@ -201,7 +239,7 @@ class YahooIE(InfoExtractor):
config = self._parse_json(config_json, display_id, fatal=False)
if config:
sapi = config.get('models', {}).get('applet_model', {}).get('data', {}).get('sapi')
- if sapi:
+ if sapi and 'query' in sapi:
return self._extract_info(display_id, sapi, webpage)
items_json = self._search_regex(
@@ -303,9 +341,9 @@ class YahooIE(InfoExtractor):
region = self._search_regex(
r'\\?"region\\?"\s*:\s*\\?"([^"]+?)\\?"',
webpage, 'region', fatal=False, default='US')
- data = compat_urllib_parse.urlencode({
+ data = compat_urllib_parse_urlencode({
'protocol': 'http',
- 'region': region,
+ 'region': region.upper(),
})
query_url = (
'https://video.media.yql.yahoo.com/v1/video/sapi/streams/'
diff --git a/youtube_dl/extractor/yam.py b/youtube_dl/extractor/yam.py
index 001ee17..63bbc06 100644
--- a/youtube_dl/extractor/yam.py
+++ b/youtube_dl/extractor/yam.py
@@ -15,7 +15,7 @@ from ..utils import (
class YamIE(InfoExtractor):
IE_DESC = '蕃薯藤yam天空部落'
- _VALID_URL = r'http://mymedia.yam.com/m/(?P<id>\d+)'
+ _VALID_URL = r'https?://mymedia.yam.com/m/(?P<id>\d+)'
_TESTS = [{
# An audio hosted on Yam
diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py
index d3cc1a2..b37d0ea 100644
--- a/youtube_dl/extractor/yandexmusic.py
+++ b/youtube_dl/extractor/yandexmusic.py
@@ -5,18 +5,48 @@ import re
import hashlib
from .common import InfoExtractor
-from ..compat import (
- compat_str,
- compat_urllib_parse,
-)
+from ..compat import compat_str
from ..utils import (
+ ExtractorError,
int_or_none,
float_or_none,
- sanitized_Request,
)
-class YandexMusicTrackIE(InfoExtractor):
+class YandexMusicBaseIE(InfoExtractor):
+ @staticmethod
+ def _handle_error(response):
+ if isinstance(response, dict):
+ error = response.get('error')
+ if error:
+ raise ExtractorError(error, expected=True)
+ if response.get('type') == 'captcha' or 'captcha' in response:
+ YandexMusicBaseIE._raise_captcha()
+
+ @staticmethod
+ def _raise_captcha():
+ raise ExtractorError(
+ 'YandexMusic has considered youtube-dl requests automated and '
+ 'asks you to solve a CAPTCHA. You can either wait for some '
+ 'time until unblocked and optionally use --sleep-interval '
+ 'in future or alternatively you can go to https://music.yandex.ru/ '
+ 'solve CAPTCHA, then export cookies and pass cookie file to '
+ 'youtube-dl with --cookies',
+ expected=True)
+
+ def _download_webpage(self, *args, **kwargs):
+ webpage = super(YandexMusicBaseIE, self)._download_webpage(*args, **kwargs)
+ if 'Нам очень жаль, но&nbsp;запросы, поступившие с&nbsp;вашего IP-адреса, похожи на&nbsp;автоматические.' in webpage:
+ self._raise_captcha()
+ return webpage
+
+ def _download_json(self, *args, **kwargs):
+ response = super(YandexMusicBaseIE, self)._download_json(*args, **kwargs)
+ self._handle_error(response)
+ return response
+
+
+class YandexMusicTrackIE(YandexMusicBaseIE):
IE_NAME = 'yandexmusic:track'
IE_DESC = 'Яндекс.Музыка - Трек'
_VALID_URL = r'https?://music\.yandex\.(?:ru|kz|ua|by)/album/(?P<album_id>\d+)/track/(?P<id>\d+)'
@@ -27,10 +57,16 @@ class YandexMusicTrackIE(InfoExtractor):
'info_dict': {
'id': '4878838',
'ext': 'mp3',
- 'title': 'Carlo Ambrosio - Gypsy Eyes 1',
+ 'title': 'Carlo Ambrosio & Fabio Di Bari, Carlo Ambrosio - Gypsy Eyes 1',
'filesize': 4628061,
'duration': 193.04,
- }
+ 'track': 'Gypsy Eyes 1',
+ 'album': 'Gypsy Soul',
+ 'album_artist': 'Carlo Ambrosio',
+ 'artist': 'Carlo Ambrosio & Fabio Di Bari, Carlo Ambrosio',
+ 'release_year': '2009',
+ },
+ 'skip': 'Travis CI servers blocked by YandexMusic',
}
def _get_track_url(self, storage_dir, track_id):
@@ -52,16 +88,45 @@ class YandexMusicTrackIE(InfoExtractor):
thumbnail = cover_uri.replace('%%', 'orig')
if not thumbnail.startswith('http'):
thumbnail = 'http://' + thumbnail
- return {
+
+ track_title = track['title']
+ track_info = {
'id': track['id'],
'ext': 'mp3',
'url': self._get_track_url(track['storageDir'], track['id']),
- 'title': '%s - %s' % (track['artists'][0]['name'], track['title']),
'filesize': int_or_none(track.get('fileSize')),
'duration': float_or_none(track.get('durationMs'), 1000),
'thumbnail': thumbnail,
+ 'track': track_title,
}
+ def extract_artist(artist_list):
+ if artist_list and isinstance(artist_list, list):
+ artists_names = [a['name'] for a in artist_list if a.get('name')]
+ if artists_names:
+ return ', '.join(artists_names)
+
+ albums = track.get('albums')
+ if albums and isinstance(albums, list):
+ album = albums[0]
+ if isinstance(album, dict):
+ year = album.get('year')
+ track_info.update({
+ 'album': album.get('title'),
+ 'album_artist': extract_artist(album.get('artists')),
+ 'release_year': compat_str(year) if year else None,
+ })
+
+ track_artist = extract_artist(track.get('artists'))
+ if track_artist:
+ track_info.update({
+ 'artist': track_artist,
+ 'title': '%s - %s' % (track_artist, track_title),
+ })
+ else:
+ track_info['title'] = track_title
+ return track_info
+
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
album_id, track_id = mobj.group('album_id'), mobj.group('id')
@@ -73,7 +138,7 @@ class YandexMusicTrackIE(InfoExtractor):
return self._get_track_info(track)
-class YandexMusicPlaylistBaseIE(InfoExtractor):
+class YandexMusicPlaylistBaseIE(YandexMusicBaseIE):
def _build_playlist(self, tracks):
return [
self.url_result(
@@ -93,6 +158,7 @@ class YandexMusicAlbumIE(YandexMusicPlaylistBaseIE):
'title': 'Carlo Ambrosio - Gypsy Soul (2009)',
},
'playlist_count': 50,
+ 'skip': 'Travis CI servers blocked by YandexMusic',
}
def _real_extract(self, url):
@@ -115,7 +181,7 @@ class YandexMusicAlbumIE(YandexMusicPlaylistBaseIE):
class YandexMusicPlaylistIE(YandexMusicPlaylistBaseIE):
IE_NAME = 'yandexmusic:playlist'
IE_DESC = 'Яндекс.Музыка - Плейлист'
- _VALID_URL = r'https?://music\.yandex\.(?:ru|kz|ua|by)/users/[^/]+/playlists/(?P<id>\d+)'
+ _VALID_URL = r'https?://music\.yandex\.(?P<tld>ru|kz|ua|by)/users/(?P<user>[^/]+)/playlists/(?P<id>\d+)'
_TESTS = [{
'url': 'http://music.yandex.ru/users/music.partners/playlists/1245',
@@ -125,6 +191,7 @@ class YandexMusicPlaylistIE(YandexMusicPlaylistBaseIE):
'description': 'md5:3b9f27b0efbe53f2ee1e844d07155cc9',
},
'playlist_count': 6,
+ 'skip': 'Travis CI servers blocked by YandexMusic',
}, {
# playlist exceeding the limit of 150 tracks shipped with webpage (see
# https://github.com/rg3/youtube-dl/issues/6666)
@@ -133,46 +200,64 @@ class YandexMusicPlaylistIE(YandexMusicPlaylistBaseIE):
'id': '1036',
'title': 'Музыка 90-х',
},
- 'playlist_count': 310,
+ 'playlist_mincount': 300,
+ 'skip': 'Travis CI servers blocked by YandexMusic',
}]
def _real_extract(self, url):
- playlist_id = self._match_id(url)
-
- webpage = self._download_webpage(url, playlist_id)
-
- mu = self._parse_json(
- self._search_regex(
- r'var\s+Mu\s*=\s*({.+?});\s*</script>', webpage, 'player'),
- playlist_id)
-
- playlist = mu['pageData']['playlist']
- tracks, track_ids = playlist['tracks'], playlist['trackIds']
-
- # tracks dictionary shipped with webpage is limited to 150 tracks,
+ mobj = re.match(self._VALID_URL, url)
+ tld = mobj.group('tld')
+ user = mobj.group('user')
+ playlist_id = mobj.group('id')
+
+ playlist = self._download_json(
+ 'https://music.yandex.%s/handlers/playlist.jsx' % tld,
+ playlist_id, 'Downloading missing tracks JSON',
+ fatal=False,
+ headers={
+ 'Referer': url,
+ 'X-Requested-With': 'XMLHttpRequest',
+ 'X-Retpath-Y': url,
+ },
+ query={
+ 'owner': user,
+ 'kinds': playlist_id,
+ 'light': 'true',
+ 'lang': tld,
+ 'external-domain': 'music.yandex.%s' % tld,
+ 'overembed': 'false',
+ })['playlist']
+
+ tracks, track_ids = playlist['tracks'], map(compat_str, playlist['trackIds'])
+
+ # tracks dictionary shipped with playlist.jsx API is limited to 150 tracks,
# missing tracks should be retrieved manually.
if len(tracks) < len(track_ids):
- present_track_ids = set([compat_str(track['id']) for track in tracks if track.get('id')])
- missing_track_ids = set(map(compat_str, track_ids)) - set(present_track_ids)
- request = sanitized_Request(
- 'https://music.yandex.ru/handlers/track-entries.jsx',
- compat_urllib_parse.urlencode({
+ present_track_ids = set([
+ compat_str(track['id'])
+ for track in tracks if track.get('id')])
+ missing_track_ids = [
+ track_id for track_id in track_ids
+ if track_id not in present_track_ids]
+ missing_tracks = self._download_json(
+ 'https://music.yandex.%s/handlers/track-entries.jsx' % tld,
+ playlist_id, 'Downloading missing tracks JSON',
+ fatal=False,
+ headers={
+ 'Referer': url,
+ 'X-Requested-With': 'XMLHttpRequest',
+ },
+ query={
'entries': ','.join(missing_track_ids),
- 'lang': mu.get('settings', {}).get('lang', 'en'),
- 'external-domain': 'music.yandex.ru',
+ 'lang': tld,
+ 'external-domain': 'music.yandex.%s' % tld,
'overembed': 'false',
- 'sign': mu.get('authData', {}).get('user', {}).get('sign'),
'strict': 'true',
- }).encode('utf-8'))
- request.add_header('Referer', url)
- request.add_header('X-Requested-With', 'XMLHttpRequest')
-
- missing_tracks = self._download_json(
- request, playlist_id, 'Downloading missing tracks JSON', fatal=False)
+ })
if missing_tracks:
tracks.extend(missing_tracks)
return self.playlist_result(
self._build_playlist(tracks),
compat_str(playlist_id),
- playlist['title'], playlist.get('description'))
+ playlist.get('title'), playlist.get('description'))
diff --git a/youtube_dl/extractor/ynet.py b/youtube_dl/extractor/ynet.py
index 869f3e8..0d943c3 100644
--- a/youtube_dl/extractor/ynet.py
+++ b/youtube_dl/extractor/ynet.py
@@ -9,7 +9,7 @@ from ..compat import compat_urllib_parse_unquote_plus
class YnetIE(InfoExtractor):
- _VALID_URL = r'http://(?:.+?\.)?ynet\.co\.il/(?:.+?/)?0,7340,(?P<id>L(?:-[0-9]+)+),00\.html'
+ _VALID_URL = r'https?://(?:.+?\.)?ynet\.co\.il/(?:.+?/)?0,7340,(?P<id>L(?:-[0-9]+)+),00\.html'
_TESTS = [
{
'url': 'http://hot.ynet.co.il/home/0,7340,L-11659-99244,00.html',
@@ -41,10 +41,12 @@ class YnetIE(InfoExtractor):
m = re.search(r'ynet - HOT -- (["\']+)(?P<title>.+?)\1', title)
if m:
title = m.group('title')
+ formats = self._extract_f4m_formats(f4m_url, video_id)
+ self._sort_formats(formats)
return {
'id': video_id,
'title': title,
- 'formats': self._extract_f4m_formats(f4m_url, video_id),
+ 'formats': formats,
'thumbnail': self._og_search_thumbnail(webpage),
}
diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py
index 900eb2a..147608e 100644
--- a/youtube_dl/extractor/youku.py
+++ b/youtube_dl/extractor/youku.py
@@ -2,17 +2,20 @@
from __future__ import unicode_literals
import base64
+import itertools
import random
+import re
import string
import time
from .common import InfoExtractor
from ..compat import (
- compat_urllib_parse,
+ compat_urllib_parse_urlencode,
compat_ord,
)
from ..utils import (
ExtractorError,
+ get_element_by_attribute,
sanitized_Request,
)
@@ -64,6 +67,14 @@ class YoukuIE(InfoExtractor):
'params': {
'videopassword': '100600',
},
+ }, {
+ # /play/get.json contains streams with "channel_type":"tail"
+ 'url': 'http://v.youku.com/v_show/id_XOTUxMzg4NDMy.html',
+ 'info_dict': {
+ 'id': 'XOTUxMzg4NDMy',
+ 'title': '我的世界☆明月庄主☆车震猎杀☆杀人艺术Minecraft',
+ },
+ 'playlist_count': 6,
}]
def construct_video_urls(self, data):
@@ -92,6 +103,8 @@ class YoukuIE(InfoExtractor):
fileid_dict = {}
for stream in data['stream']:
+ if stream.get('channel_type') == 'tail':
+ continue
format = stream.get('stream_type')
fileid = stream['stream_fileid']
fileid_dict[format] = fileid
@@ -117,6 +130,8 @@ class YoukuIE(InfoExtractor):
# generate video_urls
video_urls_dict = {}
for stream in data['stream']:
+ if stream.get('channel_type') == 'tail':
+ continue
format = stream.get('stream_type')
video_urls = []
for dt in stream['segs']:
@@ -138,7 +153,7 @@ class YoukuIE(InfoExtractor):
'_00' + \
'/st/' + self.parse_ext_l(format) + \
'/fileid/' + get_fileid(format, n) + '?' + \
- compat_urllib_parse.urlencode(param)
+ compat_urllib_parse_urlencode(param)
video_urls.append(video_url)
video_urls_dict[format] = video_urls
@@ -253,6 +268,8 @@ class YoukuIE(InfoExtractor):
# which one has all
} for i in range(max(len(v.get('segs')) for v in data['stream']))]
for stream in data['stream']:
+ if stream.get('channel_type') == 'tail':
+ continue
fm = stream.get('stream_type')
video_urls = video_urls_dict[fm]
for video_url, seg, entry in zip(video_urls, stream['segs'], entries):
@@ -261,6 +278,8 @@ class YoukuIE(InfoExtractor):
'format_id': self.get_format_name(fm),
'ext': self.parse_ext_l(fm),
'filesize': int(seg['size']),
+ 'width': stream.get('width'),
+ 'height': stream.get('height'),
})
return {
@@ -269,3 +288,52 @@ class YoukuIE(InfoExtractor):
'title': title,
'entries': entries,
}
+
+
+class YoukuShowIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?youku\.com/show_page/id_(?P<id>[0-9a-z]+)\.html'
+ IE_NAME = 'youku:show'
+
+ _TEST = {
+ 'url': 'http://www.youku.com/show_page/id_zc7c670be07ff11e48b3f.html',
+ 'info_dict': {
+ 'id': 'zc7c670be07ff11e48b3f',
+ 'title': '花千骨 未删减版',
+ 'description': 'md5:578d4f2145ae3f9128d9d4d863312910',
+ },
+ 'playlist_count': 50,
+ }
+
+ _PAGE_SIZE = 40
+
+ def _find_videos_in_page(self, webpage):
+ videos = re.findall(
+ r'<li><a[^>]+href="(?P<url>https?://v\.youku\.com/[^"]+)"[^>]+title="(?P<title>[^"]+)"', webpage)
+ return [
+ self.url_result(video_url, YoukuIE.ie_key(), title)
+ for video_url, title in videos]
+
+ def _real_extract(self, url):
+ show_id = self._match_id(url)
+ webpage = self._download_webpage(url, show_id)
+
+ entries = self._find_videos_in_page(webpage)
+
+ playlist_title = self._html_search_regex(
+ r'<span[^>]+class="name">([^<]+)</span>', webpage, 'playlist title', fatal=False)
+ detail_div = get_element_by_attribute('class', 'detail', webpage) or ''
+ playlist_description = self._html_search_regex(
+ r'<span[^>]+style="display:none"[^>]*>([^<]+)</span>',
+ detail_div, 'playlist description', fatal=False)
+
+ for idx in itertools.count(1):
+ episodes_page = self._download_webpage(
+ 'http://www.youku.com/show_episode/id_%s.html' % show_id,
+ show_id, query={'divid': 'reload_%d' % (idx * self._PAGE_SIZE + 1)},
+ note='Downloading episodes page %d' % idx)
+ new_entries = self._find_videos_in_page(episodes_page)
+ entries.extend(new_entries)
+ if len(new_entries) < self._PAGE_SIZE:
+ break
+
+ return self.playlist_result(entries, show_id, playlist_title, playlist_description)
diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py
index b29baaf..0df2d76 100644
--- a/youtube_dl/extractor/youporn.py
+++ b/youtube_dl/extractor/youporn.py
@@ -17,7 +17,7 @@ class YouPornIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?youporn\.com/watch/(?P<id>\d+)/(?P<display_id>[^/?#&]+)'
_TESTS = [{
'url': 'http://www.youporn.com/watch/505835/sex-ed-is-it-safe-to-masturbate-daily/',
- 'md5': '71ec5fcfddacf80f495efa8b6a8d9a89',
+ 'md5': '3744d24c50438cf5b6f6d59feb5055c2',
'info_dict': {
'id': '505835',
'display_id': 'sex-ed-is-it-safe-to-masturbate-daily',
@@ -75,7 +75,7 @@ class YouPornIE(InfoExtractor):
links = []
sources = self._search_regex(
- r'sources\s*:\s*({.+?})', webpage, 'sources', default=None)
+ r'(?s)sources\s*:\s*({.+?})', webpage, 'sources', default=None)
if sources:
for _, link in re.findall(r'[^:]+\s*:\s*(["\'])(http.+?)\1', sources):
links.append(link)
@@ -101,8 +101,9 @@ class YouPornIE(InfoExtractor):
}
# Video URL's path looks like this:
# /201012/17/505835/720p_1500k_505835/YouPorn%20-%20Sex%20Ed%20Is%20It%20Safe%20To%20Masturbate%20Daily.mp4
+ # /201012/17/505835/vl_240p_240k_505835/YouPorn%20-%20Sex%20Ed%20Is%20It%20Safe%20To%20Masturbate%20Daily.mp4
# We will benefit from it by extracting some metadata
- mobj = re.search(r'/(?P<height>\d{3,4})[pP]_(?P<bitrate>\d+)[kK]_\d+/', video_url)
+ mobj = re.search(r'(?P<height>\d{3,4})[pP]_(?P<bitrate>\d+)[kK]_\d+/', video_url)
if mobj:
height = int(mobj.group('height'))
bitrate = int(mobj.group('bitrate'))
@@ -120,21 +121,21 @@ class YouPornIE(InfoExtractor):
webpage, 'thumbnail', fatal=False, group='thumbnail')
uploader = self._html_search_regex(
- r'(?s)<div[^>]+class=["\']videoInfoBy(?:\s+[^"\']+)?["\'][^>]*>\s*By:\s*</div>(.+?)</(?:a|div)>',
+ r'(?s)<div[^>]+class=["\']submitByLink["\'][^>]*>(.+?)</div>',
webpage, 'uploader', fatal=False)
upload_date = unified_strdate(self._html_search_regex(
- r'(?s)<div[^>]+class=["\']videoInfoTime["\'][^>]*>(.+?)</div>',
+ r'(?s)<div[^>]+class=["\']videoInfo(?:Date|Time)["\'][^>]*>(.+?)</div>',
webpage, 'upload date', fatal=False))
age_limit = self._rta_search(webpage)
average_rating = int_or_none(self._search_regex(
- r'<div[^>]+class=["\']videoInfoRating["\'][^>]*>\s*<div[^>]+class=["\']videoRatingPercentage["\'][^>]*>(\d+)%</div>',
+ r'<div[^>]+class=["\']videoRatingPercentage["\'][^>]*>(\d+)%</div>',
webpage, 'average rating', fatal=False))
view_count = str_to_int(self._search_regex(
- r'(?s)<div[^>]+class=["\']videoInfoViews["\'][^>]*>.*?([\d,.]+)\s*</div>',
- webpage, 'view count', fatal=False))
+ r'(?s)<div[^>]+class=(["\']).*?\bvideoInfoViews\b.*?\1[^>]*>.*?(?P<count>[\d,.]+)<',
+ webpage, 'view count', fatal=False, group='count'))
comment_count = str_to_int(self._search_regex(
r'>All [Cc]omments? \(([\d,.]+)\)',
webpage, 'comment count', fatal=False))
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index e24dd3e..c8d54f2 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -6,6 +6,7 @@ from __future__ import unicode_literals
import itertools
import json
import os.path
+import random
import re
import time
import traceback
@@ -16,16 +17,15 @@ from ..swfinterp import SWFInterpreter
from ..compat import (
compat_chr,
compat_parse_qs,
- compat_urllib_parse,
compat_urllib_parse_unquote,
compat_urllib_parse_unquote_plus,
+ compat_urllib_parse_urlencode,
compat_urllib_parse_urlparse,
compat_urlparse,
compat_str,
)
from ..utils import (
clean_html,
- encode_dict,
error_to_compat_str,
ExtractorError,
float_or_none,
@@ -44,6 +44,7 @@ from ..utils import (
unified_strdate,
unsmuggle_url,
uppercase_escape,
+ urlencode_postdata,
ISO3166Utils,
)
@@ -115,7 +116,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
'hl': 'en_US',
}
- login_data = compat_urllib_parse.urlencode(encode_dict(login_form_strs)).encode('ascii')
+ login_data = urlencode_postdata(login_form_strs)
req = sanitized_Request(self._LOGIN_URL, login_data)
login_results = self._download_webpage(
@@ -124,6 +125,12 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
if login_results is False:
return False
+ error_msg = self._html_search_regex(
+ r'<[^>]+id="errormsg_0_Passwd"[^>]*>([^<]+)<',
+ login_results, 'error message', default=None)
+ if error_msg:
+ raise ExtractorError('Unable to login: %s' % error_msg, expected=True)
+
if re.search(r'id="errormsg_0_Passwd"', login_results) is not None:
raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True)
@@ -148,7 +155,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
'TrustDevice': 'on',
})
- tfa_data = compat_urllib_parse.urlencode(encode_dict(tfa_form_strs)).encode('ascii')
+ tfa_data = urlencode_postdata(tfa_form_strs)
tfa_req = sanitized_Request(self._TWOFACTOR_URL, tfa_data)
tfa_results = self._download_webpage(
@@ -233,7 +240,9 @@ class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
def _process_page(self, content):
- for playlist_id in orderedSet(re.findall(r'href="/?playlist\?list=([0-9A-Za-z-_]{10,})"', content)):
+ for playlist_id in orderedSet(re.findall(
+ r'<h3[^>]+class="[^"]*yt-lockup-title[^"]*"[^>]*><a[^>]+href="/?playlist\?list=([0-9A-Za-z-_]{10,})"',
+ content)):
yield self.url_result(
'https://www.youtube.com/playlist?list=%s' % playlist_id, 'YoutubePlaylist')
@@ -267,7 +276,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
))
|(?:
youtu\.be| # just youtu.be/xxxx
- vid\.plus # or vid.plus/xxxx
+ vid\.plus| # or vid.plus/xxxx
+ zwearz\.com/watch| # or zwearz.com/watch/xxxx
)/
|(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
)
@@ -308,6 +318,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
# Apple HTTP Live Streaming
+ '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
'92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
'93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
'94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
@@ -333,6 +344,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'preference': -50, 'container': 'm4a_dash'},
'140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'preference': -50, 'container': 'm4a_dash'},
'141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'preference': -50, 'container': 'm4a_dash'},
+ '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'preference': -50, 'container': 'm4a_dash'},
+ '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'preference': -50, 'container': 'm4a_dash'},
# Dash webm
'167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
@@ -382,7 +395,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'title': 'youtube-dl test video "\'/\\ä↭𝕐',
'uploader': 'Philipp Hagemeister',
'uploader_id': 'phihag',
+ 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/phihag',
'upload_date': '20121002',
+ 'license': 'Standard YouTube License',
'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
'categories': ['Science & Technology'],
'tags': ['youtube-dl'],
@@ -401,12 +416,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'upload_date': '20120506',
'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
'alt_title': 'I Love It (feat. Charli XCX)',
- 'description': 'md5:782e8651347686cba06e58f71ab51773',
+ 'description': 'md5:f3ceb5ef83a08d95b9d146f973157cc8',
'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli',
'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop',
'iconic ep', 'iconic', 'love', 'it'],
'uploader': 'Icona Pop',
'uploader_id': 'IconaPop',
+ 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/IconaPop',
+ 'license': 'Standard YouTube License',
'creator': 'Icona Pop',
}
},
@@ -422,6 +439,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'description': 'md5:64249768eec3bc4276236606ea996373',
'uploader': 'justintimberlakeVEVO',
'uploader_id': 'justintimberlakeVEVO',
+ 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/justintimberlakeVEVO',
+ 'license': 'Standard YouTube License',
'creator': 'Justin Timberlake',
'age_limit': 18,
}
@@ -437,6 +456,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
'uploader': 'SET India',
'uploader_id': 'setindia',
+ 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/setindia',
+ 'license': 'Standard YouTube License',
'age_limit': 18,
}
},
@@ -449,7 +470,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'title': 'youtube-dl test video "\'/\\ä↭𝕐',
'uploader': 'Philipp Hagemeister',
'uploader_id': 'phihag',
+ 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/phihag',
'upload_date': '20121002',
+ 'license': 'Standard YouTube License',
'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
'categories': ['Science & Technology'],
'tags': ['youtube-dl'],
@@ -468,14 +491,17 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'ext': 'm4a',
'upload_date': '20121002',
'uploader_id': '8KVIDEO',
+ 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/8KVIDEO',
'description': '',
'uploader': '8KVIDEO',
+ 'license': 'Standard YouTube License',
'title': 'UHDTV TEST 8K VIDEO.mp4'
},
'params': {
'youtube_include_dash_manifest': True,
'format': '141',
},
+ 'skip': 'format 141 not served anymore',
},
# DASH manifest with encrypted signature
{
@@ -488,10 +514,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader': 'AfrojackVEVO',
'uploader_id': 'AfrojackVEVO',
'upload_date': '20131011',
+ 'license': 'Standard YouTube License',
},
'params': {
'youtube_include_dash_manifest': True,
- 'format': '141',
+ 'format': '141/bestaudio[ext=m4a]',
},
},
# JS player signature function name containing $
@@ -506,11 +533,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader': 'TaylorSwiftVEVO',
'uploader_id': 'TaylorSwiftVEVO',
'upload_date': '20140818',
+ 'license': 'Standard YouTube License',
'creator': 'Taylor Swift',
},
'params': {
'youtube_include_dash_manifest': True,
- 'format': '141',
+ 'format': '141/bestaudio[ext=m4a]',
},
},
# Controversy video
@@ -522,6 +550,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'upload_date': '20100909',
'uploader': 'The Amazing Atheist',
'uploader_id': 'TheAmazingAtheist',
+ 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist',
+ 'license': 'Standard YouTube License',
'title': 'Burning Everyone\'s Koran',
'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
}
@@ -536,7 +566,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'description': 're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',
'uploader': 'The Witcher',
'uploader_id': 'WitcherGame',
+ 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/WitcherGame',
'upload_date': '20140605',
+ 'license': 'Standard YouTube License',
'age_limit': 18,
},
},
@@ -550,7 +582,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'description': 'md5:33765bb339e1b47e7e72b5490139bb41',
'uploader': 'LloydVEVO',
'uploader_id': 'LloydVEVO',
+ 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/LloydVEVO',
'upload_date': '20110629',
+ 'license': 'Standard YouTube License',
'age_limit': 18,
},
},
@@ -562,9 +596,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'ext': 'mp4',
'upload_date': '20100430',
'uploader_id': 'deadmau5',
+ 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/deadmau5',
'creator': 'deadmau5',
'description': 'md5:12c56784b8032162bb936a5f76d55360',
'uploader': 'deadmau5',
+ 'license': 'Standard YouTube License',
'title': 'Deadmau5 - Some Chords (HD)',
'alt_title': 'Some Chords',
},
@@ -580,8 +616,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'ext': 'mp4',
'upload_date': '20150827',
'uploader_id': 'olympic',
+ 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/olympic',
+ 'license': 'Standard YouTube License',
'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
- 'uploader': 'Olympics',
+ 'uploader': 'Olympic',
'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
},
'params': {
@@ -597,8 +635,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'stretched_ratio': 16 / 9.,
'upload_date': '20110310',
'uploader_id': 'AllenMeow',
+ 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/AllenMeow',
'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
'uploader': '孫艾倫',
+ 'license': 'Standard YouTube License',
'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
},
},
@@ -629,8 +669,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'description': 'md5:116377fd2963b81ec4ce64b542173306',
'upload_date': '20150625',
'uploader_id': 'dorappi2000',
+ 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/dorappi2000',
'uploader': 'dorappi2000',
- 'formats': 'mincount:33',
+ 'license': 'Standard YouTube License',
+ 'formats': 'mincount:32',
},
},
# DASH manifest with segment_list
@@ -644,12 +686,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader': 'Airtek',
'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',
'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',
+ 'license': 'Standard YouTube License',
'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',
},
'params': {
'youtube_include_dash_manifest': True,
'format': '135', # bestvideo
- }
+ },
+ 'skip': 'This live event has ended.',
},
{
# Multifeed videos (multiple cameras), URL is for Main Camera
@@ -668,6 +712,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'upload_date': '20150721',
'uploader': 'Beer Games Beer',
'uploader_id': 'beergamesbeer',
+ 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
+ 'license': 'Standard YouTube License',
},
}, {
'info_dict': {
@@ -678,6 +724,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'upload_date': '20150721',
'uploader': 'Beer Games Beer',
'uploader_id': 'beergamesbeer',
+ 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
+ 'license': 'Standard YouTube License',
},
}, {
'info_dict': {
@@ -688,6 +736,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'upload_date': '20150721',
'uploader': 'Beer Games Beer',
'uploader_id': 'beergamesbeer',
+ 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
+ 'license': 'Standard YouTube License',
},
}, {
'info_dict': {
@@ -698,6 +748,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'upload_date': '20150721',
'uploader': 'Beer Games Beer',
'uploader_id': 'beergamesbeer',
+ 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
+ 'license': 'Standard YouTube License',
},
}],
'params': {
@@ -712,12 +764,17 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'title': 'DevConf.cz 2016 Day 2 Workshops 1 14:00 - 15:30',
},
'playlist_count': 2,
+ 'skip': 'Not multifeed anymore',
},
{
'url': 'http://vid.plus/FlRa-iH7PGw',
'only_matching': True,
},
{
+ 'url': 'http://zwearz.com/watch/9lWxNJF-ufM/electra-woman-dyna-girl-official-trailer-grace-helbig.html',
+ 'only_matching': True,
+ },
+ {
# Title with JS-like syntax "};" (see https://github.com/rg3/youtube-dl/issues/7468)
# Also tests cut-off URL expansion in video description (see
# https://github.com/rg3/youtube-dl/issues/1892,
@@ -731,7 +788,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a',
'upload_date': '20151119',
'uploader_id': 'IronSoulElf',
+ 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/IronSoulElf',
'uploader': 'IronSoulElf',
+ 'license': 'Standard YouTube License',
'creator': 'Todd Haberman, Daniel Law Heath & Aaron Kaplan',
},
'params': {
@@ -758,6 +817,43 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'params': {
'skip_download': True,
},
+ 'skip': 'This video does not exist.',
+ },
+ {
+ # Video licensed under Creative Commons
+ 'url': 'https://www.youtube.com/watch?v=M4gD1WSo5mA',
+ 'info_dict': {
+ 'id': 'M4gD1WSo5mA',
+ 'ext': 'mp4',
+ 'title': 'md5:e41008789470fc2533a3252216f1c1d1',
+ 'description': 'md5:a677553cf0840649b731a3024aeff4cc',
+ 'upload_date': '20150127',
+ 'uploader_id': 'BerkmanCenter',
+ 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter',
+ 'uploader': 'BerkmanCenter',
+ 'license': 'Creative Commons Attribution license (reuse allowed)',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ },
+ {
+ # Channel-like uploader_url
+ 'url': 'https://www.youtube.com/watch?v=eQcmzGIKrzg',
+ 'info_dict': {
+ 'id': 'eQcmzGIKrzg',
+ 'ext': 'mp4',
+ 'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders',
+ 'description': 'md5:dda0d780d5a6e120758d1711d062a867',
+ 'upload_date': '20151119',
+ 'uploader': 'Bernie 2016',
+ 'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg',
+ 'uploader_url': 're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg',
+ 'license': 'Creative Commons Attribution license (reuse allowed)',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
},
{
'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;amp;v=V36LpHqtcDY',
@@ -930,7 +1026,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
continue
sub_formats = []
for ext in self._SUBTITLE_FORMATS:
- params = compat_urllib_parse.urlencode({
+ params = compat_urllib_parse_urlencode({
'lang': lang,
'v': video_id,
'fmt': ext,
@@ -975,40 +1071,67 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
return {}
try:
args = player_config['args']
- caption_url = args['ttsurl']
- if not caption_url:
- self._downloader.report_warning(err_msg)
- return {}
- timestamp = args['timestamp']
- # We get the available subtitles
- list_params = compat_urllib_parse.urlencode({
- 'type': 'list',
- 'tlangs': 1,
- 'asrs': 1,
- })
- list_url = caption_url + '&' + list_params
- caption_list = self._download_xml(list_url, video_id)
- original_lang_node = caption_list.find('track')
- if original_lang_node is None:
- self._downloader.report_warning('Video doesn\'t have automatic captions')
- return {}
- original_lang = original_lang_node.attrib['lang_code']
- caption_kind = original_lang_node.attrib.get('kind', '')
+ caption_url = args.get('ttsurl')
+ if caption_url:
+ timestamp = args['timestamp']
+ # We get the available subtitles
+ list_params = compat_urllib_parse_urlencode({
+ 'type': 'list',
+ 'tlangs': 1,
+ 'asrs': 1,
+ })
+ list_url = caption_url + '&' + list_params
+ caption_list = self._download_xml(list_url, video_id)
+ original_lang_node = caption_list.find('track')
+ if original_lang_node is None:
+ self._downloader.report_warning('Video doesn\'t have automatic captions')
+ return {}
+ original_lang = original_lang_node.attrib['lang_code']
+ caption_kind = original_lang_node.attrib.get('kind', '')
+
+ sub_lang_list = {}
+ for lang_node in caption_list.findall('target'):
+ sub_lang = lang_node.attrib['lang_code']
+ sub_formats = []
+ for ext in self._SUBTITLE_FORMATS:
+ params = compat_urllib_parse_urlencode({
+ 'lang': original_lang,
+ 'tlang': sub_lang,
+ 'fmt': ext,
+ 'ts': timestamp,
+ 'kind': caption_kind,
+ })
+ sub_formats.append({
+ 'url': caption_url + '&' + params,
+ 'ext': ext,
+ })
+ sub_lang_list[sub_lang] = sub_formats
+ return sub_lang_list
+
+ # Some videos don't provide ttsurl but rather caption_tracks and
+ # caption_translation_languages (e.g. 20LmZk1hakA)
+ caption_tracks = args['caption_tracks']
+ caption_translation_languages = args['caption_translation_languages']
+ caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0]
+ parsed_caption_url = compat_urllib_parse_urlparse(caption_url)
+ caption_qs = compat_parse_qs(parsed_caption_url.query)
sub_lang_list = {}
- for lang_node in caption_list.findall('target'):
- sub_lang = lang_node.attrib['lang_code']
+ for lang in caption_translation_languages.split(','):
+ lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang))
+ sub_lang = lang_qs.get('lc', [None])[0]
+ if not sub_lang:
+ continue
sub_formats = []
for ext in self._SUBTITLE_FORMATS:
- params = compat_urllib_parse.urlencode({
- 'lang': original_lang,
- 'tlang': sub_lang,
- 'fmt': ext,
- 'ts': timestamp,
- 'kind': caption_kind,
+ caption_qs.update({
+ 'tlang': [sub_lang],
+ 'fmt': [ext],
})
+ sub_url = compat_urlparse.urlunparse(parsed_caption_url._replace(
+ query=compat_urllib_parse_urlencode(caption_qs, True)))
sub_formats.append({
- 'url': caption_url + '&' + params,
+ 'url': sub_url,
'ext': ext,
})
sub_lang_list[sub_lang] = sub_formats
@@ -1019,6 +1142,29 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
self._downloader.report_warning(err_msg)
return {}
+ def _mark_watched(self, video_id, video_info):
+ playback_url = video_info.get('videostats_playback_base_url', [None])[0]
+ if not playback_url:
+ return
+ parsed_playback_url = compat_urlparse.urlparse(playback_url)
+ qs = compat_urlparse.parse_qs(parsed_playback_url.query)
+
+ # cpn generation algorithm is reverse engineered from base.js.
+ # In fact it works even with dummy cpn.
+ CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_'
+ cpn = ''.join((CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16)))
+
+ qs.update({
+ 'ver': ['2'],
+ 'cpn': [cpn],
+ })
+ playback_url = compat_urlparse.urlunparse(
+ parsed_playback_url._replace(query=compat_urllib_parse_urlencode(qs, True)))
+
+ self._download_webpage(
+ playback_url, video_id, 'Marking watched',
+ 'Unable to mark watched', fatal=False)
+
@classmethod
def extract_id(cls, url):
mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
@@ -1098,7 +1244,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
# this can be viewed without login into Youtube
url = proto + '://www.youtube.com/embed/%s' % video_id
embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')
- data = compat_urllib_parse.urlencode({
+ data = compat_urllib_parse_urlencode({
'video_id': video_id,
'eurl': 'https://youtube.googleapis.com/v/' + video_id,
'sts': self._search_regex(
@@ -1186,10 +1332,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
if video_description:
video_description = re.sub(r'''(?x)
<a\s+
- (?:[a-zA-Z-]+="[^"]+"\s+)*?
+ (?:[a-zA-Z-]+="[^"]*"\s+)*?
(?:title|href)="([^"]+)"\s+
- (?:[a-zA-Z-]+="[^"]+"\s+)*?
- class="(?:yt-uix-redirect-link|yt-uix-sessionlink[^"]*)"[^>]*>
+ (?:[a-zA-Z-]+="[^"]*"\s+)*?
+ class="[^"]*"[^>]*>
[^<]+\.{3}\s*
</a>
''', r'\1', video_description)
@@ -1245,9 +1391,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
# uploader_id
video_uploader_id = None
- mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
+ video_uploader_url = None
+ mobj = re.search(
+ r'<link itemprop="url" href="(?P<uploader_url>https?://www.youtube.com/(?:user|channel)/(?P<uploader_id>[^"]+))">',
+ video_webpage)
if mobj is not None:
- video_uploader_id = mobj.group(1)
+ video_uploader_id = mobj.group('uploader_id')
+ video_uploader_url = mobj.group('uploader_url')
else:
self._downloader.report_warning('unable to extract uploader nickname')
@@ -1275,6 +1425,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
upload_date = unified_strdate(upload_date)
+ video_license = self._html_search_regex(
+ r'<h4[^>]+class="title"[^>]*>\s*License\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li',
+ video_webpage, 'license', default=None)
+
m_music = re.search(
r'<h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s*<ul[^>]*>\s*<li>(?P<title>.+?) by (?P<creator>.+?)(?:\(.+?\))?</li',
video_webpage)
@@ -1348,6 +1502,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
if 'rtmpe%3Dyes' in encoded_url_map:
raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
+ formats_spec = {}
+ fmt_list = video_info.get('fmt_list', [''])[0]
+ if fmt_list:
+ for fmt in fmt_list.split(','):
+ spec = fmt.split('/')
+ if len(spec) > 1:
+ width_height = spec[1].split('x')
+ if len(width_height) == 2:
+ formats_spec[spec[0]] = {
+ 'resolution': spec[1],
+ 'width': int_or_none(width_height[0]),
+ 'height': int_or_none(width_height[1]),
+ }
formats = []
for url_data_str in encoded_url_map.split(','):
url_data = compat_parse_qs(url_data_str)
@@ -1416,6 +1583,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
}
if format_id in self._formats:
dct.update(self._formats[format_id])
+ if format_id in formats_spec:
+ dct.update(formats_spec[format_id])
# Some itags are not included in DASH manifest thus corresponding formats will
# lack metadata (see https://github.com/rg3/youtube-dl/pull/5993).
@@ -1528,11 +1697,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
self._sort_formats(formats)
+ self.mark_watched(video_id, video_info)
+
return {
'id': video_id,
'uploader': video_uploader,
'uploader_id': video_uploader_id,
+ 'uploader_url': video_uploader_url,
'upload_date': upload_date,
+ 'license': video_license,
'creator': video_creator,
'title': video_title,
'alt_title': video_alt_title,
@@ -1657,20 +1830,32 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
def _extract_mix(self, playlist_id):
# The mixes are generated from a single video
# the id of the playlist is just 'RD' + video_id
- url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
- webpage = self._download_webpage(
- url, playlist_id, 'Downloading Youtube mix')
+ ids = []
+ last_id = playlist_id[-11:]
+ for n in itertools.count(1):
+ url = 'https://youtube.com/watch?v=%s&list=%s' % (last_id, playlist_id)
+ webpage = self._download_webpage(
+ url, playlist_id, 'Downloading page {0} of Youtube mix'.format(n))
+ new_ids = orderedSet(re.findall(
+ r'''(?xs)data-video-username=".*?".*?
+ href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id),
+ webpage))
+ # Fetch new pages until all the videos are repeated, it seems that
+ # there are always 51 unique videos.
+ new_ids = [_id for _id in new_ids if _id not in ids]
+ if not new_ids:
+ break
+ ids.extend(new_ids)
+ last_id = ids[-1]
+
+ url_results = self._ids_to_results(ids)
+
search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
title_span = (
search_title('playlist-title') or
search_title('title long-title') or
search_title('title'))
title = clean_html(title_span)
- ids = orderedSet(re.findall(
- r'''(?xs)data-video-username=".*?".*?
- href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id),
- webpage))
- url_results = self._ids_to_results(ids)
return self.playlist_result(url_results, playlist_id, title)
@@ -1723,7 +1908,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
if video:
return video
- if playlist_id.startswith('RD') or playlist_id.startswith('UL'):
+ if playlist_id.startswith(('RD', 'UL', 'PU')):
# Mixes require a custom extraction process
return self._extract_mix(playlist_id)
@@ -1757,7 +1942,8 @@ class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):
@classmethod
def suitable(cls, url):
- return False if YoutubePlaylistsIE.suitable(url) else super(YoutubeChannelIE, cls).suitable(url)
+ return (False if YoutubePlaylistsIE.suitable(url) or YoutubeLiveIE.suitable(url)
+ else super(YoutubeChannelIE, cls).suitable(url))
def _real_extract(self, url):
channel_id = self._match_id(url)
@@ -1806,7 +1992,7 @@ class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):
class YoutubeUserIE(YoutubeChannelIE):
IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
- _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)'
+ _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:user/|c/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)'
_TEMPLATE_URL = 'https://www.youtube.com/user/%s/videos'
IE_NAME = 'youtube:user'
@@ -1819,19 +2005,67 @@ class YoutubeUserIE(YoutubeChannelIE):
}, {
'url': 'ytuser:phihag',
'only_matching': True,
+ }, {
+ 'url': 'https://www.youtube.com/c/gametrailers',
+ 'only_matching': True,
}]
@classmethod
def suitable(cls, url):
# Don't return True if the url can be extracted with other youtube
# extractor, the regex would is too permissive and it would match.
- other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
- if any(ie.suitable(url) for ie in other_ies):
+ other_yt_ies = iter(klass for (name, klass) in globals().items() if name.startswith('Youtube') and name.endswith('IE') and klass is not cls)
+ if any(ie.suitable(url) for ie in other_yt_ies):
return False
else:
return super(YoutubeUserIE, cls).suitable(url)
+class YoutubeLiveIE(YoutubeBaseInfoExtractor):
+ IE_DESC = 'YouTube.com live streams'
+ _VALID_URL = r'(?P<base_url>https?://(?:\w+\.)?youtube\.com/(?:user|channel)/(?P<id>[^/]+))/live'
+ IE_NAME = 'youtube:live'
+
+ _TESTS = [{
+ 'url': 'http://www.youtube.com/user/TheYoungTurks/live',
+ 'info_dict': {
+ 'id': 'a48o2S1cPoo',
+ 'ext': 'mp4',
+ 'title': 'The Young Turks - Live Main Show',
+ 'uploader': 'The Young Turks',
+ 'uploader_id': 'TheYoungTurks',
+ 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks',
+ 'upload_date': '20150715',
+ 'license': 'Standard YouTube License',
+ 'description': 'md5:438179573adcdff3c97ebb1ee632b891',
+ 'categories': ['News & Politics'],
+ 'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'],
+ 'like_count': int,
+ 'dislike_count': int,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'http://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ channel_id = mobj.group('id')
+ base_url = mobj.group('base_url')
+ webpage = self._download_webpage(url, channel_id, fatal=False)
+ if webpage:
+ page_type = self._og_search_property(
+ 'type', webpage, 'page type', default=None)
+ video_id = self._html_search_meta(
+ 'videoId', webpage, 'video id', default=None)
+ if page_type == 'video' and video_id and re.match(r'^[0-9A-Za-z_-]{11}$', video_id):
+ return self.url_result(video_id, YoutubeIE.ie_key())
+ return self.url_result(base_url)
+
+
class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor):
IE_DESC = 'YouTube.com user/channel playlists'
_VALID_URL = r'https?://(?:\w+\.)?youtube\.com/(?:user|channel)/(?P<id>[^/]+)/playlists'
@@ -1885,7 +2119,7 @@ class YoutubeSearchIE(SearchInfoExtractor, YoutubePlaylistIE):
'spf': 'navigate',
}
url_query.update(self._EXTRA_QUERY_ARGS)
- result_url = 'https://www.youtube.com/results?' + compat_urllib_parse.urlencode(url_query)
+ result_url = 'https://www.youtube.com/results?' + compat_urllib_parse_urlencode(url_query)
data = self._download_json(
result_url, video_id='query "%s"' % query,
note='Downloading page %s' % pagenum,
@@ -1914,10 +2148,11 @@ class YoutubeSearchDateIE(YoutubeSearchIE):
_EXTRA_QUERY_ARGS = {'search_sort': 'video_date_uploaded'}
-class YoutubeSearchURLIE(InfoExtractor):
+class YoutubeSearchURLIE(YoutubePlaylistBaseInfoExtractor):
IE_DESC = 'YouTube.com search URLs'
IE_NAME = 'youtube:search_url'
_VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P<query>[^&]+)(?:[&]|$)'
+ _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(?:[^"]*"[^>]+\btitle="(?P<title>[^"]+))?'
_TESTS = [{
'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
'playlist_mincount': 5,
@@ -1932,32 +2167,8 @@ class YoutubeSearchURLIE(InfoExtractor):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
query = compat_urllib_parse_unquote_plus(mobj.group('query'))
-
webpage = self._download_webpage(url, query)
- result_code = self._search_regex(
- r'(?s)<ol[^>]+class="item-section"(.*?)</ol>', webpage, 'result HTML')
-
- part_codes = re.findall(
- r'(?s)<h3[^>]+class="[^"]*yt-lockup-title[^"]*"[^>]*>(.*?)</h3>', result_code)
- entries = []
- for part_code in part_codes:
- part_title = self._html_search_regex(
- [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], part_code, 'item title', fatal=False)
- part_url_snippet = self._html_search_regex(
- r'(?s)href="([^"]+)"', part_code, 'item URL')
- part_url = compat_urlparse.urljoin(
- 'https://www.youtube.com/', part_url_snippet)
- entries.append({
- '_type': 'url',
- 'url': part_url,
- 'title': part_title,
- })
-
- return {
- '_type': 'playlist',
- 'entries': entries,
- 'title': query,
- }
+ return self.playlist_result(self._process_page(webpage), playlist_title=query)
class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor):
diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py
index c619a75..2ef1772 100644
--- a/youtube_dl/extractor/zdf.py
+++ b/youtube_dl/extractor/zdf.py
@@ -85,6 +85,13 @@ class ZDFIE(InfoExtractor):
uploader = xpath_text(doc, './/details/originChannelTitle', 'uploader')
uploader_id = xpath_text(doc, './/details/originChannelId', 'uploader id')
upload_date = unified_strdate(xpath_text(doc, './/details/airtime', 'upload date'))
+ subtitles = {}
+ captions_url = doc.find('.//caption/url')
+ if captions_url is not None:
+ subtitles['de'] = [{
+ 'url': captions_url.text,
+ 'ext': 'ttml',
+ }]
def xml_to_thumbnails(fnode):
thumbnails = []
@@ -137,6 +144,10 @@ class ZDFIE(InfoExtractor):
formats.extend(self._extract_smil_formats(
video_url, video_id, fatal=False))
elif ext == 'm3u8':
+ # the certificates are misconfigured (see
+ # https://github.com/rg3/youtube-dl/issues/8665)
+ if video_url.startswith('https://'):
+ continue
formats.extend(self._extract_m3u8_formats(
video_url, video_id, 'mp4', m3u8_id=format_id, fatal=False))
elif ext == 'f4m':
@@ -186,6 +197,7 @@ class ZDFIE(InfoExtractor):
'uploader_id': uploader_id,
'upload_date': upload_date,
'formats': formats,
+ 'subtitles': subtitles,
}
def _real_extract(self, url):
diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py
index a7440c5..9737f70 100644
--- a/youtube_dl/jsinterp.py
+++ b/youtube_dl/jsinterp.py
@@ -232,7 +232,7 @@ class JSInterpreter(object):
def extract_function(self, funcname):
func_m = re.search(
r'''(?x)
- (?:function\s+%s|[{;,]%s\s*=\s*function|var\s+%s\s*=\s*function)\s*
+ (?:function\s+%s|[{;,]\s*%s\s*=\s*function|var\s+%s\s*=\s*function)\s*
\((?P<args>[^)]*)\)\s*
\{(?P<code>[^}]+)\}''' % (
re.escape(funcname), re.escape(funcname), re.escape(funcname)),
diff --git a/youtube_dl/options.py b/youtube_dl/options.py
index 3afa8bb..99ce413 100644
--- a/youtube_dl/options.py
+++ b/youtube_dl/options.py
@@ -171,6 +171,14 @@ def parseOpts(overrideArguments=None):
default=False,
help='Do not extract the videos of a playlist, only list them.')
general.add_option(
+ '--mark-watched',
+ action='store_true', dest='mark_watched', default=False,
+ help='Mark videos watched (YouTube only)')
+ general.add_option(
+ '--no-mark-watched',
+ action='store_false', dest='mark_watched', default=False,
+ help='Do not mark videos watched (YouTube only)')
+ general.add_option(
'--no-color', '--no-colors',
action='store_true', dest='no_color',
default=False,
@@ -180,7 +188,10 @@ def parseOpts(overrideArguments=None):
network.add_option(
'--proxy', dest='proxy',
default=None, metavar='URL',
- help='Use the specified HTTP/HTTPS proxy. Pass in an empty string (--proxy "") for direct connection')
+ help='Use the specified HTTP/HTTPS/SOCKS proxy. To enable experimental '
+ 'SOCKS proxy, specify a proper scheme. For example '
+ 'socks5://127.0.0.1:1080/. Pass in an empty string (--proxy "") '
+ 'for direct connection')
network.add_option(
'--socket-timeout',
dest='socket_timeout', type=float, default=None, metavar='SECONDS',
@@ -384,14 +395,18 @@ def parseOpts(overrideArguments=None):
downloader = optparse.OptionGroup(parser, 'Download Options')
downloader.add_option(
- '-r', '--rate-limit',
- dest='ratelimit', metavar='LIMIT',
+ '-r', '--limit-rate', '--rate-limit',
+ dest='ratelimit', metavar='RATE',
help='Maximum download rate in bytes per second (e.g. 50K or 4.2M)')
downloader.add_option(
'-R', '--retries',
dest='retries', metavar='RETRIES', default=10,
help='Number of retries (default is %default), or "infinite".')
downloader.add_option(
+ '--fragment-retries',
+ dest='fragment_retries', metavar='RETRIES', default=10,
+ help='Number of retries for a fragment (default is %default), or "infinite" (DASH only)')
+ downloader.add_option(
'--buffer-size',
dest='buffersize', metavar='SIZE', default='1024',
help='Size of download buffer (e.g. 1024 or 16K) (default is %default)')
@@ -413,8 +428,12 @@ def parseOpts(overrideArguments=None):
help='Set file xattribute ytdl.filesize with expected filesize (experimental)')
downloader.add_option(
'--hls-prefer-native',
- dest='hls_prefer_native', action='store_true',
- help='Use the native HLS downloader instead of ffmpeg (experimental)')
+ dest='hls_prefer_native', action='store_true', default=None,
+ help='Use the native HLS downloader instead of ffmpeg')
+ downloader.add_option(
+ '--hls-prefer-ffmpeg',
+ dest='hls_prefer_native', action='store_false', default=None,
+ help='Use ffmpeg instead of the native HLS downloader')
downloader.add_option(
'--hls-use-mpegts',
dest='hls_use_mpegts', action='store_true',
@@ -649,7 +668,7 @@ def parseOpts(overrideArguments=None):
action='store_true', dest='writeannotations', default=False,
help='Write video annotations to a .annotations.xml file')
filesystem.add_option(
- '--load-info',
+ '--load-info-json', '--load-info',
dest='load_info_filename', metavar='FILE',
help='JSON file containing the video information (created with the "--write-info-json" option)')
filesystem.add_option(
@@ -712,7 +731,7 @@ def parseOpts(overrideArguments=None):
postproc.add_option(
'--embed-subs',
action='store_true', dest='embedsubtitles', default=False,
- help='Embed subtitles in the video (only for mkv and mp4 videos)')
+ help='Embed subtitles in the video (only for mp4, webm and mkv videos)')
postproc.add_option(
'--embed-thumbnail',
action='store_true', dest='embedthumbnail', default=False,
diff --git a/youtube_dl/postprocessor/__init__.py b/youtube_dl/postprocessor/__init__.py
index 0d8ef6c..3ea5183 100644
--- a/youtube_dl/postprocessor/__init__.py
+++ b/youtube_dl/postprocessor/__init__.py
@@ -6,6 +6,7 @@ from .ffmpeg import (
FFmpegEmbedSubtitlePP,
FFmpegExtractAudioPP,
FFmpegFixupStretchedPP,
+ FFmpegFixupM3u8PP,
FFmpegFixupM4aPP,
FFmpegMergerPP,
FFmpegMetadataPP,
@@ -26,6 +27,7 @@ __all__ = [
'ExecAfterDownloadPP',
'FFmpegEmbedSubtitlePP',
'FFmpegExtractAudioPP',
+ 'FFmpegFixupM3u8PP',
'FFmpegFixupM4aPP',
'FFmpegFixupStretchedPP',
'FFmpegMergerPP',
diff --git a/youtube_dl/postprocessor/execafterdownload.py b/youtube_dl/postprocessor/execafterdownload.py
index 74f66d6..90630c2 100644
--- a/youtube_dl/postprocessor/execafterdownload.py
+++ b/youtube_dl/postprocessor/execafterdownload.py
@@ -3,7 +3,7 @@ from __future__ import unicode_literals
import subprocess
from .common import PostProcessor
-from ..compat import shlex_quote
+from ..compat import compat_shlex_quote
from ..utils import PostProcessingError
@@ -17,7 +17,7 @@ class ExecAfterDownloadPP(PostProcessor):
if '{}' not in cmd:
cmd += ' {}'
- cmd = cmd.replace('{}', shlex_quote(information['filepath']))
+ cmd = cmd.replace('{}', compat_shlex_quote(information['filepath']))
self._downloader.to_screen('[exec] Executing command: %s' % cmd)
retCode = subprocess.call(cmd, shell=True)
diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py
index 380bc6f..fa99b0c 100644
--- a/youtube_dl/postprocessor/ffmpeg.py
+++ b/youtube_dl/postprocessor/ffmpeg.py
@@ -25,6 +25,19 @@ from ..utils import (
)
+EXT_TO_OUT_FORMATS = {
+ "aac": "adts",
+ "m4a": "ipod",
+ "mka": "matroska",
+ "mkv": "matroska",
+ "mpg": "mpeg",
+ "ogv": "ogg",
+ "ts": "mpegts",
+ "wma": "asf",
+ "wmv": "asf",
+}
+
+
class FFmpegPostProcessorError(PostProcessingError):
pass
@@ -162,7 +175,8 @@ class FFmpegPostProcessor(PostProcessor):
# Always use 'file:' because the filename may contain ':' (ffmpeg
# interprets that as a protocol) or can start with '-' (-- is broken in
# ffmpeg, see https://ffmpeg.org/trac/ffmpeg/ticket/2127 for details)
- return 'file:' + fn
+ # Also leave '-' intact in order not to break streaming to stdout.
+ return 'file:' + fn if fn != '-' else fn
class FFmpegExtractAudioPP(FFmpegPostProcessor):
@@ -318,17 +332,34 @@ class FFmpegVideoConvertorPP(FFmpegPostProcessor):
class FFmpegEmbedSubtitlePP(FFmpegPostProcessor):
def run(self, information):
- if information['ext'] not in ['mp4', 'mkv']:
- self._downloader.to_screen('[ffmpeg] Subtitles can only be embedded in mp4 or mkv files')
+ if information['ext'] not in ('mp4', 'webm', 'mkv'):
+ self._downloader.to_screen('[ffmpeg] Subtitles can only be embedded in mp4, webm or mkv files')
return [], information
subtitles = information.get('requested_subtitles')
if not subtitles:
self._downloader.to_screen('[ffmpeg] There aren\'t any subtitles to embed')
return [], information
- sub_langs = list(subtitles.keys())
filename = information['filepath']
- sub_filenames = [subtitles_filename(filename, lang, sub_info['ext']) for lang, sub_info in subtitles.items()]
+
+ ext = information['ext']
+ sub_langs = []
+ sub_filenames = []
+ webm_vtt_warn = False
+
+ for lang, sub_info in subtitles.items():
+ sub_ext = sub_info['ext']
+ if ext != 'webm' or ext == 'webm' and sub_ext == 'vtt':
+ sub_langs.append(lang)
+ sub_filenames.append(subtitles_filename(filename, lang, sub_ext))
+ else:
+ if not webm_vtt_warn and ext == 'webm' and sub_ext != 'vtt':
+ webm_vtt_warn = True
+ self._downloader.to_screen('[ffmpeg] Only WebVTT subtitles can be embedded in webm files')
+
+ if not sub_langs:
+ return [], information
+
input_files = [filename] + sub_filenames
opts = [
@@ -358,23 +389,30 @@ class FFmpegEmbedSubtitlePP(FFmpegPostProcessor):
class FFmpegMetadataPP(FFmpegPostProcessor):
def run(self, info):
metadata = {}
- if info.get('title') is not None:
- metadata['title'] = info['title']
- if info.get('upload_date') is not None:
- metadata['date'] = info['upload_date']
- if info.get('artist') is not None:
- metadata['artist'] = info['artist']
- elif info.get('uploader') is not None:
- metadata['artist'] = info['uploader']
- elif info.get('uploader_id') is not None:
- metadata['artist'] = info['uploader_id']
- if info.get('description') is not None:
- metadata['description'] = info['description']
- metadata['comment'] = info['description']
- if info.get('webpage_url') is not None:
- metadata['purl'] = info['webpage_url']
- if info.get('album') is not None:
- metadata['album'] = info['album']
+
+ def add(meta_list, info_list=None):
+ if not info_list:
+ info_list = meta_list
+ if not isinstance(meta_list, (list, tuple)):
+ meta_list = (meta_list,)
+ if not isinstance(info_list, (list, tuple)):
+ info_list = (info_list,)
+ for info_f in info_list:
+ if info.get(info_f) is not None:
+ for meta_f in meta_list:
+ metadata[meta_f] = info[info_f]
+ break
+
+ add('title', ('track', 'title'))
+ add('date', 'upload_date')
+ add(('description', 'comment'), 'description')
+ add('purl', 'webpage_url')
+ add('track', 'track_number')
+ add('artist', ('artist', 'creator', 'uploader', 'uploader_id'))
+ add('genre')
+ add('album')
+ add('album_artist')
+ add('disc', 'disc_number')
if not metadata:
self._downloader.to_screen('[ffmpeg] There isn\'t any metadata to add')
@@ -391,10 +429,6 @@ class FFmpegMetadataPP(FFmpegPostProcessor):
for (name, value) in metadata.items():
options.extend(['-metadata', '%s=%s' % (name, value)])
- # https://github.com/rg3/youtube-dl/issues/8350
- if info.get('protocol') == 'm3u8_native' or info.get('protocol') == 'm3u8' and self._downloader.params.get('hls_prefer_native', False):
- options.extend(['-bsf:a', 'aac_adtstoasc'])
-
self._downloader.to_screen('[ffmpeg] Adding metadata to \'%s\'' % filename)
self.run_ffmpeg(filename, temp_filename, options)
os.remove(encodeFilename(filename))
@@ -467,6 +501,21 @@ class FFmpegFixupM4aPP(FFmpegPostProcessor):
return [], info
+class FFmpegFixupM3u8PP(FFmpegPostProcessor):
+ def run(self, info):
+ filename = info['filepath']
+ temp_filename = prepend_extension(filename, 'temp')
+
+ options = ['-c', 'copy', '-f', 'mp4', '-bsf:a', 'aac_adtstoasc']
+ self._downloader.to_screen('[ffmpeg] Fixing malformated aac bitstream in "%s"' % filename)
+ self.run_ffmpeg(filename, temp_filename, options)
+
+ os.remove(encodeFilename(filename))
+ os.rename(encodeFilename(temp_filename), encodeFilename(filename))
+
+ return [], info
+
+
class FFmpegSubtitlesConvertorPP(FFmpegPostProcessor):
def __init__(self, downloader=None, format=None):
super(FFmpegSubtitlesConvertorPP, self).__init__(downloader)
@@ -495,7 +544,7 @@ class FFmpegSubtitlesConvertorPP(FFmpegPostProcessor):
sub_filenames.append(old_file)
new_file = subtitles_filename(filename, lang, new_ext)
- if ext == 'dfxp' or ext == 'ttml':
+ if ext == 'dfxp' or ext == 'ttml' or ext == 'tt':
self._downloader.report_warning(
'You have requested to convert dfxp (TTML) subtitles into another format, '
'which results in style information loss')
diff --git a/youtube_dl/postprocessor/xattrpp.py b/youtube_dl/postprocessor/xattrpp.py
index 480d48d..e39ca60 100644
--- a/youtube_dl/postprocessor/xattrpp.py
+++ b/youtube_dl/postprocessor/xattrpp.py
@@ -6,6 +6,7 @@ import sys
import errno
from .common import PostProcessor
+from ..compat import compat_os_name
from ..utils import (
check_executable,
hyphenate_date,
@@ -73,7 +74,7 @@ class XAttrMetadataPP(PostProcessor):
raise XAttrMetadataError(e.errno, e.strerror)
except ImportError:
- if os.name == 'nt':
+ if compat_os_name == 'nt':
# Write xattrs to NTFS Alternate Data Streams:
# http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
def write_xattr(path, key, value):
@@ -168,7 +169,7 @@ class XAttrMetadataPP(PostProcessor):
'Unable to write extended attributes due to too long values.')
else:
msg = 'This filesystem doesn\'t support extended attributes. '
- if os.name == 'nt':
+ if compat_os_name == 'nt':
msg += 'You need to use NTFS.'
else:
msg += '(You may have to enable them in your /etc/fstab)'
diff --git a/youtube_dl/socks.py b/youtube_dl/socks.py
new file mode 100644
index 0000000..fd49d74
--- /dev/null
+++ b/youtube_dl/socks.py
@@ -0,0 +1,271 @@
+# Public Domain SOCKS proxy protocol implementation
+# Adapted from https://gist.github.com/bluec0re/cafd3764412967417fd3
+
+from __future__ import unicode_literals
+
+# References:
+# SOCKS4 protocol http://www.openssh.com/txt/socks4.protocol
+# SOCKS4A protocol http://www.openssh.com/txt/socks4a.protocol
+# SOCKS5 protocol https://tools.ietf.org/html/rfc1928
+# SOCKS5 username/password authentication https://tools.ietf.org/html/rfc1929
+
+import collections
+import socket
+
+from .compat import (
+ compat_ord,
+ compat_struct_pack,
+ compat_struct_unpack,
+)
+
+__author__ = 'Timo Schmid <coding@timoschmid.de>'
+
+SOCKS4_VERSION = 4
+SOCKS4_REPLY_VERSION = 0x00
+# Excerpt from SOCKS4A protocol:
+# if the client cannot resolve the destination host's domain name to find its
+# IP address, it should set the first three bytes of DSTIP to NULL and the last
+# byte to a non-zero value.
+SOCKS4_DEFAULT_DSTIP = compat_struct_pack('!BBBB', 0, 0, 0, 0xFF)
+
+SOCKS5_VERSION = 5
+SOCKS5_USER_AUTH_VERSION = 0x01
+SOCKS5_USER_AUTH_SUCCESS = 0x00
+
+
+class Socks4Command(object):
+ CMD_CONNECT = 0x01
+ CMD_BIND = 0x02
+
+
+class Socks5Command(Socks4Command):
+ CMD_UDP_ASSOCIATE = 0x03
+
+
+class Socks5Auth(object):
+ AUTH_NONE = 0x00
+ AUTH_GSSAPI = 0x01
+ AUTH_USER_PASS = 0x02
+ AUTH_NO_ACCEPTABLE = 0xFF # For server response
+
+
+class Socks5AddressType(object):
+ ATYP_IPV4 = 0x01
+ ATYP_DOMAINNAME = 0x03
+ ATYP_IPV6 = 0x04
+
+
+class ProxyError(IOError):
+ ERR_SUCCESS = 0x00
+
+ def __init__(self, code=None, msg=None):
+ if code is not None and msg is None:
+ msg = self.CODES.get(code) and 'unknown error'
+ super(ProxyError, self).__init__(code, msg)
+
+
+class InvalidVersionError(ProxyError):
+ def __init__(self, expected_version, got_version):
+ msg = ('Invalid response version from server. Expected {0:02x} got '
+ '{1:02x}'.format(expected_version, got_version))
+ super(InvalidVersionError, self).__init__(0, msg)
+
+
+class Socks4Error(ProxyError):
+ ERR_SUCCESS = 90
+
+ CODES = {
+ 91: 'request rejected or failed',
+ 92: 'request rejected becasue SOCKS server cannot connect to identd on the client',
+ 93: 'request rejected because the client program and identd report different user-ids'
+ }
+
+
+class Socks5Error(ProxyError):
+ ERR_GENERAL_FAILURE = 0x01
+
+ CODES = {
+ 0x01: 'general SOCKS server failure',
+ 0x02: 'connection not allowed by ruleset',
+ 0x03: 'Network unreachable',
+ 0x04: 'Host unreachable',
+ 0x05: 'Connection refused',
+ 0x06: 'TTL expired',
+ 0x07: 'Command not supported',
+ 0x08: 'Address type not supported',
+ 0xFE: 'unknown username or invalid password',
+ 0xFF: 'all offered authentication methods were rejected'
+ }
+
+
+class ProxyType(object):
+ SOCKS4 = 0
+ SOCKS4A = 1
+ SOCKS5 = 2
+
+Proxy = collections.namedtuple('Proxy', (
+ 'type', 'host', 'port', 'username', 'password', 'remote_dns'))
+
+
+class sockssocket(socket.socket):
+ def __init__(self, *args, **kwargs):
+ self._proxy = None
+ super(sockssocket, self).__init__(*args, **kwargs)
+
+ def setproxy(self, proxytype, addr, port, rdns=True, username=None, password=None):
+ assert proxytype in (ProxyType.SOCKS4, ProxyType.SOCKS4A, ProxyType.SOCKS5)
+
+ self._proxy = Proxy(proxytype, addr, port, username, password, rdns)
+
+ def recvall(self, cnt):
+ data = b''
+ while len(data) < cnt:
+ cur = self.recv(cnt - len(data))
+ if not cur:
+ raise IOError('{0} bytes missing'.format(cnt - len(data)))
+ data += cur
+ return data
+
+ def _recv_bytes(self, cnt):
+ data = self.recvall(cnt)
+ return compat_struct_unpack('!{0}B'.format(cnt), data)
+
+ @staticmethod
+ def _len_and_data(data):
+ return compat_struct_pack('!B', len(data)) + data
+
+ def _check_response_version(self, expected_version, got_version):
+ if got_version != expected_version:
+ self.close()
+ raise InvalidVersionError(expected_version, got_version)
+
+ def _resolve_address(self, destaddr, default, use_remote_dns):
+ try:
+ return socket.inet_aton(destaddr)
+ except socket.error:
+ if use_remote_dns and self._proxy.remote_dns:
+ return default
+ else:
+ return socket.inet_aton(socket.gethostbyname(destaddr))
+
+ def _setup_socks4(self, address, is_4a=False):
+ destaddr, port = address
+
+ ipaddr = self._resolve_address(destaddr, SOCKS4_DEFAULT_DSTIP, use_remote_dns=is_4a)
+
+ packet = compat_struct_pack('!BBH', SOCKS4_VERSION, Socks4Command.CMD_CONNECT, port) + ipaddr
+
+ username = (self._proxy.username or '').encode('utf-8')
+ packet += username + b'\x00'
+
+ if is_4a and self._proxy.remote_dns:
+ packet += destaddr.encode('utf-8') + b'\x00'
+
+ self.sendall(packet)
+
+ version, resp_code, dstport, dsthost = compat_struct_unpack('!BBHI', self.recvall(8))
+
+ self._check_response_version(SOCKS4_REPLY_VERSION, version)
+
+ if resp_code != Socks4Error.ERR_SUCCESS:
+ self.close()
+ raise Socks4Error(resp_code)
+
+ return (dsthost, dstport)
+
+ def _setup_socks4a(self, address):
+ self._setup_socks4(address, is_4a=True)
+
+ def _socks5_auth(self):
+ packet = compat_struct_pack('!B', SOCKS5_VERSION)
+
+ auth_methods = [Socks5Auth.AUTH_NONE]
+ if self._proxy.username and self._proxy.password:
+ auth_methods.append(Socks5Auth.AUTH_USER_PASS)
+
+ packet += compat_struct_pack('!B', len(auth_methods))
+ packet += compat_struct_pack('!{0}B'.format(len(auth_methods)), *auth_methods)
+
+ self.sendall(packet)
+
+ version, method = self._recv_bytes(2)
+
+ self._check_response_version(SOCKS5_VERSION, version)
+
+ if method == Socks5Auth.AUTH_NO_ACCEPTABLE:
+ self.close()
+ raise Socks5Error(method)
+
+ if method == Socks5Auth.AUTH_USER_PASS:
+ username = self._proxy.username.encode('utf-8')
+ password = self._proxy.password.encode('utf-8')
+ packet = compat_struct_pack('!B', SOCKS5_USER_AUTH_VERSION)
+ packet += self._len_and_data(username) + self._len_and_data(password)
+ self.sendall(packet)
+
+ version, status = self._recv_bytes(2)
+
+ self._check_response_version(SOCKS5_USER_AUTH_VERSION, version)
+
+ if status != SOCKS5_USER_AUTH_SUCCESS:
+ self.close()
+ raise Socks5Error(Socks5Error.ERR_GENERAL_FAILURE)
+
+ def _setup_socks5(self, address):
+ destaddr, port = address
+
+ ipaddr = self._resolve_address(destaddr, None, use_remote_dns=True)
+
+ self._socks5_auth()
+
+ reserved = 0
+ packet = compat_struct_pack('!BBB', SOCKS5_VERSION, Socks5Command.CMD_CONNECT, reserved)
+ if ipaddr is None:
+ destaddr = destaddr.encode('utf-8')
+ packet += compat_struct_pack('!B', Socks5AddressType.ATYP_DOMAINNAME)
+ packet += self._len_and_data(destaddr)
+ else:
+ packet += compat_struct_pack('!B', Socks5AddressType.ATYP_IPV4) + ipaddr
+ packet += compat_struct_pack('!H', port)
+
+ self.sendall(packet)
+
+ version, status, reserved, atype = self._recv_bytes(4)
+
+ self._check_response_version(SOCKS5_VERSION, version)
+
+ if status != Socks5Error.ERR_SUCCESS:
+ self.close()
+ raise Socks5Error(status)
+
+ if atype == Socks5AddressType.ATYP_IPV4:
+ destaddr = self.recvall(4)
+ elif atype == Socks5AddressType.ATYP_DOMAINNAME:
+ alen = compat_ord(self.recv(1))
+ destaddr = self.recvall(alen)
+ elif atype == Socks5AddressType.ATYP_IPV6:
+ destaddr = self.recvall(16)
+ destport = compat_struct_unpack('!H', self.recvall(2))[0]
+
+ return (destaddr, destport)
+
+ def _make_proxy(self, connect_func, address):
+ if not self._proxy:
+ return connect_func(self, address)
+
+ result = connect_func(self, (self._proxy.host, self._proxy.port))
+ if result != 0 and result is not None:
+ return result
+ setup_funcs = {
+ ProxyType.SOCKS4: self._setup_socks4,
+ ProxyType.SOCKS4A: self._setup_socks4a,
+ ProxyType.SOCKS5: self._setup_socks5,
+ }
+ setup_funcs[self._proxy.type](address)
+ return result
+
+ def connect(self, address):
+ self._make_proxy(socket.socket.connect, address)
+
+ def connect_ex(self, address):
+ return self._make_proxy(socket.socket.connect_ex, address)
diff --git a/youtube_dl/swfinterp.py b/youtube_dl/swfinterp.py
index 06c1d6c..7cf490a 100644
--- a/youtube_dl/swfinterp.py
+++ b/youtube_dl/swfinterp.py
@@ -4,10 +4,12 @@ import collections
import io
import zlib
-from .compat import compat_str
+from .compat import (
+ compat_str,
+ compat_struct_unpack,
+)
from .utils import (
ExtractorError,
- struct_unpack,
)
@@ -23,17 +25,17 @@ def _extract_tags(file_contents):
file_contents[:1])
# Determine number of bits in framesize rectangle
- framesize_nbits = struct_unpack('!B', content[:1])[0] >> 3
+ framesize_nbits = compat_struct_unpack('!B', content[:1])[0] >> 3
framesize_len = (5 + 4 * framesize_nbits + 7) // 8
pos = framesize_len + 2 + 2
while pos < len(content):
- header16 = struct_unpack('<H', content[pos:pos + 2])[0]
+ header16 = compat_struct_unpack('<H', content[pos:pos + 2])[0]
pos += 2
tag_code = header16 >> 6
tag_len = header16 & 0x3f
if tag_len == 0x3f:
- tag_len = struct_unpack('<I', content[pos:pos + 4])[0]
+ tag_len = compat_struct_unpack('<I', content[pos:pos + 4])[0]
pos += 4
assert pos + tag_len <= len(content), \
('Tag %d ends at %d+%d - that\'s longer than the file (%d)'
@@ -101,7 +103,7 @@ def _read_int(reader):
for _ in range(5):
buf = reader.read(1)
assert len(buf) == 1
- b = struct_unpack('<B', buf)[0]
+ b = compat_struct_unpack('<B', buf)[0]
res = res | ((b & 0x7f) << shift)
if b & 0x80 == 0:
break
@@ -127,7 +129,7 @@ def _s24(reader):
bs = reader.read(3)
assert len(bs) == 3
last_byte = b'\xff' if (ord(bs[2:3]) >= 0x80) else b'\x00'
- return struct_unpack('<i', bs + last_byte)[0]
+ return compat_struct_unpack('<i', bs + last_byte)[0]
def _read_string(reader):
@@ -146,7 +148,7 @@ def _read_bytes(count, reader):
def _read_byte(reader):
resb = _read_bytes(1, reader=reader)
- res = struct_unpack('<B', resb)[0]
+ res = compat_struct_unpack('<B', resb)[0]
return res
diff --git a/youtube_dl/update.py b/youtube_dl/update.py
index 676ebe1..ebce966 100644
--- a/youtube_dl/update.py
+++ b/youtube_dl/update.py
@@ -83,11 +83,8 @@ def update_self(to_screen, verbose, opener):
print_notes(to_screen, versions_info['versions'])
- filename = sys.argv[0]
- # Py2EXE: Filename could be different
- if hasattr(sys, 'frozen') and not os.path.isfile(filename):
- if os.path.isfile(filename + '.exe'):
- filename += '.exe'
+ # sys.executable is set to the full pathname of the exe-file for py2exe
+ filename = sys.executable if hasattr(sys, 'frozen') else sys.argv[0]
if not os.access(filename, os.W_OK):
to_screen('ERROR: no write permissions on %s' % filename)
@@ -95,7 +92,7 @@ def update_self(to_screen, verbose, opener):
# Py2EXE
if hasattr(sys, 'frozen'):
- exe = os.path.abspath(filename)
+ exe = filename
directory = os.path.dirname(exe)
if not os.access(directory, os.W_OK):
to_screen('ERROR: no write permissions on %s' % directory)
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index 6978a10..562031f 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -14,8 +14,8 @@ import email.utils
import errno
import functools
import gzip
-import itertools
import io
+import itertools
import json
import locale
import math
@@ -24,9 +24,8 @@ import os
import pipes
import platform
import re
-import ssl
import socket
-import struct
+import ssl
import subprocess
import sys
import tempfile
@@ -35,29 +34,49 @@ import xml.etree.ElementTree
import zlib
from .compat import (
+ compat_HTMLParser,
compat_basestring,
compat_chr,
compat_etree_fromstring,
compat_html_entities,
+ compat_html_entities_html5,
compat_http_client,
compat_kwargs,
compat_parse_qs,
+ compat_shlex_quote,
compat_socket_create_connection,
compat_str,
+ compat_struct_pack,
compat_urllib_error,
compat_urllib_parse,
+ compat_urllib_parse_urlencode,
compat_urllib_parse_urlparse,
+ compat_urllib_parse_unquote_plus,
compat_urllib_request,
compat_urlparse,
- shlex_quote,
+ compat_xpath,
)
+from .socks import (
+ ProxyType,
+ sockssocket,
+)
+
+
+def register_socks_protocols():
+ # "Register" SOCKS protocols
+ # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
+ # URLs with protocols not in urlparse.uses_netloc are not handled correctly
+ for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
+ if scheme not in compat_urlparse.uses_netloc:
+ compat_urlparse.uses_netloc.append(scheme)
+
# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))
std_headers = {
- 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/44.0 (Chrome)',
+ 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/47.0 (Chrome)',
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Encoding': 'gzip, deflate',
@@ -86,6 +105,11 @@ KNOWN_EXTENSIONS = (
'wav',
'f4f', 'f4m', 'm3u8', 'smil')
+# needed for sanitizing filenames in restricted mode
+ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
+ itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
+ 'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))
+
def preferredencoding():
"""Get preferred encoding.
@@ -160,18 +184,11 @@ if sys.version_info >= (2, 7):
def find_xpath_attr(node, xpath, key, val=None):
""" Find the xpath xpath[@key=val] """
assert re.match(r'^[a-zA-Z_-]+$', key)
- if val:
- assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
return node.find(expr)
else:
def find_xpath_attr(node, xpath, key, val=None):
- # Here comes the crazy part: In 2.6, if the xpath is a unicode,
- # .//node does not match if a node is a direct child of . !
- if isinstance(xpath, compat_str):
- xpath = xpath.encode('ascii')
-
- for f in node.findall(xpath):
+ for f in node.findall(compat_xpath(xpath)):
if key not in f.attrib:
continue
if val is None or f.attrib.get(key) == val:
@@ -196,9 +213,7 @@ def xpath_with_ns(path, ns_map):
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
def _find_xpath(xpath):
- if sys.version_info < (2, 7): # Crazy 2.6
- xpath = xpath.encode('ascii')
- return node.find(xpath)
+ return node.find(compat_xpath(xpath))
if isinstance(xpath, (str, compat_str)):
n = _find_xpath(xpath)
@@ -257,9 +272,9 @@ def get_element_by_attribute(attribute, value, html):
m = re.search(r'''(?xs)
<([a-zA-Z0-9:._-]+)
- (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
+ (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
\s+%s=['"]?%s['"]?
- (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
+ (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
\s*>
(?P<content>.*?)
</\1>
@@ -275,6 +290,38 @@ def get_element_by_attribute(attribute, value, html):
return unescapeHTML(res)
+class HTMLAttributeParser(compat_HTMLParser):
+ """Trivial HTML parser to gather the attributes for a single element"""
+ def __init__(self):
+ self.attrs = {}
+ compat_HTMLParser.__init__(self)
+
+ def handle_starttag(self, tag, attrs):
+ self.attrs = dict(attrs)
+
+
+def extract_attributes(html_element):
+ """Given a string for an HTML element such as
+ <el
+ a="foo" B="bar" c="&98;az" d=boz
+ empty= noval entity="&amp;"
+ sq='"' dq="'"
+ >
+ Decode and return a dictionary of attributes.
+ {
+ 'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
+ 'empty': '', 'noval': None, 'entity': '&',
+ 'sq': '"', 'dq': '\''
+ }.
+ NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
+ but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
+ """
+ parser = HTMLAttributeParser()
+ parser.feed(html_element)
+ parser.close()
+ return parser.attrs
+
+
def clean_html(html):
"""Clean an HTML snippet into a readable string"""
@@ -339,6 +386,8 @@ def sanitize_filename(s, restricted=False, is_id=False):
Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
"""
def replace_insane(char):
+ if restricted and char in ACCENT_CHARS:
+ return ACCENT_CHARS[char]
if char == '?' or ord(char) < 32 or ord(char) == 127:
return ''
elif char == '"':
@@ -391,9 +440,12 @@ def sanitize_path(s):
# Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
# unwanted failures due to missing protocol
+def sanitize_url(url):
+ return 'http:%s' % url if url.startswith('//') else url
+
+
def sanitized_Request(url, *args, **kwargs):
- return compat_urllib_request.Request(
- 'http:%s' % url if url.startswith('//') else url, *args, **kwargs)
+ return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs)
def orderedSet(iterable):
@@ -405,12 +457,19 @@ def orderedSet(iterable):
return res
-def _htmlentity_transform(entity):
+def _htmlentity_transform(entity_with_semicolon):
"""Transforms an HTML entity to a character."""
+ entity = entity_with_semicolon[:-1]
+
# Known non-numeric HTML entity
if entity in compat_html_entities.name2codepoint:
return compat_chr(compat_html_entities.name2codepoint[entity])
+ # TODO: HTML5 allows entities without a semicolon. For example,
+ # '&Eacuteric' should be decoded as 'Éric'.
+ if entity_with_semicolon in compat_html_entities_html5:
+ return compat_html_entities_html5[entity_with_semicolon]
+
mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
if mobj is not None:
numstr = mobj.group(1)
@@ -435,7 +494,7 @@ def unescapeHTML(s):
assert type(s) == compat_str
return re.sub(
- r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
+ r'&([^;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
def get_subprocess_encoding():
@@ -467,6 +526,10 @@ def encodeFilename(s, for_subprocess=False):
if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
return s
+ # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
+ if sys.platform.startswith('java'):
+ return s
+
return s.encode(get_subprocess_encoding(), 'ignore')
@@ -712,8 +775,15 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
self._params = params
def http_open(self, req):
+ conn_class = compat_http_client.HTTPConnection
+
+ socks_proxy = req.headers.get('Ytdl-socks-proxy')
+ if socks_proxy:
+ conn_class = make_socks_conn_class(conn_class, socks_proxy)
+ del req.headers['Ytdl-socks-proxy']
+
return self.do_open(functools.partial(
- _create_http_connection, self, compat_http_client.HTTPConnection, False),
+ _create_http_connection, self, conn_class, False),
req)
@staticmethod
@@ -745,12 +815,7 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
# Substitute URL if any change after escaping
if url != url_escaped:
- req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request
- new_req = req_type(
- url_escaped, data=req.data, headers=req.headers,
- origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
- new_req.timeout = req.timeout
- req = new_req
+ req = update_Request(req, url=url_escaped)
for h, v in std_headers.items():
# Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
@@ -804,9 +869,13 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
# As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
if sys.version_info >= (3, 0):
location = location.encode('iso-8859-1').decode('utf-8')
+ else:
+ location = location.decode('utf-8')
location_escaped = escape_url(location)
if location != location_escaped:
del resp.headers['Location']
+ if sys.version_info < (3, 0):
+ location_escaped = location_escaped.encode('utf-8')
resp.headers['Location'] = location_escaped
return resp
@@ -814,6 +883,49 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
https_response = http_response
+def make_socks_conn_class(base_class, socks_proxy):
+ assert issubclass(base_class, (
+ compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))
+
+ url_components = compat_urlparse.urlparse(socks_proxy)
+ if url_components.scheme.lower() == 'socks5':
+ socks_type = ProxyType.SOCKS5
+ elif url_components.scheme.lower() in ('socks', 'socks4'):
+ socks_type = ProxyType.SOCKS4
+ elif url_components.scheme.lower() == 'socks4a':
+ socks_type = ProxyType.SOCKS4A
+
+ def unquote_if_non_empty(s):
+ if not s:
+ return s
+ return compat_urllib_parse_unquote_plus(s)
+
+ proxy_args = (
+ socks_type,
+ url_components.hostname, url_components.port or 1080,
+ True, # Remote DNS
+ unquote_if_non_empty(url_components.username),
+ unquote_if_non_empty(url_components.password),
+ )
+
+ class SocksConnection(base_class):
+ def connect(self):
+ self.sock = sockssocket()
+ self.sock.setproxy(*proxy_args)
+ if type(self.timeout) in (int, float):
+ self.sock.settimeout(self.timeout)
+ self.sock.connect((self.host, self.port))
+
+ if isinstance(self, compat_http_client.HTTPSConnection):
+ if hasattr(self, '_context'): # Python > 2.6
+ self.sock = self._context.wrap_socket(
+ self.sock, server_hostname=self.host)
+ else:
+ self.sock = ssl.wrap_socket(self.sock)
+
+ return SocksConnection
+
+
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
def __init__(self, params, https_conn_class=None, *args, **kwargs):
compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
@@ -822,12 +934,20 @@ class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
def https_open(self, req):
kwargs = {}
+ conn_class = self._https_conn_class
+
if hasattr(self, '_context'): # python > 2.6
kwargs['context'] = self._context
if hasattr(self, '_check_hostname'): # python 3.x
kwargs['check_hostname'] = self._check_hostname
+
+ socks_proxy = req.headers.get('Ytdl-socks-proxy')
+ if socks_proxy:
+ conn_class = make_socks_conn_class(conn_class, socks_proxy)
+ del req.headers['Ytdl-socks-proxy']
+
return self.do_open(functools.partial(
- _create_http_connection, self, self._https_conn_class, True),
+ _create_http_connection, self, conn_class, True),
req, **kwargs)
@@ -905,9 +1025,9 @@ def unified_strdate(date_str, day_first=True):
'%d %b %Y',
'%B %d %Y',
'%b %d %Y',
- '%b %dst %Y %I:%M%p',
- '%b %dnd %Y %I:%M%p',
- '%b %dth %Y %I:%M%p',
+ '%b %dst %Y %I:%M',
+ '%b %dnd %Y %I:%M',
+ '%b %dth %Y %I:%M',
'%Y %m %d',
'%Y-%m-%d',
'%Y/%m/%d',
@@ -927,6 +1047,7 @@ def unified_strdate(date_str, day_first=True):
format_expressions.extend([
'%d-%m-%Y',
'%d.%m.%Y',
+ '%d.%m.%y',
'%d/%m/%Y',
'%d/%m/%y',
'%d/%m/%Y %H:%M:%S',
@@ -947,7 +1068,10 @@ def unified_strdate(date_str, day_first=True):
if upload_date is None:
timetuple = email.utils.parsedate_tz(date_str)
if timetuple:
- upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
+ try:
+ upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
+ except ValueError:
+ pass
if upload_date is not None:
return compat_str(upload_date)
@@ -1158,7 +1282,7 @@ def bytes_to_intlist(bs):
def intlist_to_bytes(xs):
if not xs:
return b''
- return struct_pack('%dB' % len(xs), *xs)
+ return compat_struct_pack('%dB' % len(xs), *xs)
# Cross-platform file locking
@@ -1217,13 +1341,23 @@ if sys.platform == 'win32':
raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
else:
- import fcntl
+ # Some platforms, such as Jython, is missing fcntl
+ try:
+ import fcntl
- def _lock_file(f, exclusive):
- fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
+ def _lock_file(f, exclusive):
+ fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
- def _unlock_file(f):
- fcntl.flock(f, fcntl.LOCK_UN)
+ def _unlock_file(f):
+ fcntl.flock(f, fcntl.LOCK_UN)
+ except ImportError:
+ UNSUPPORTED_MSG = 'file locking is not supported on this platform'
+
+ def _lock_file(f, exclusive):
+ raise IOError(UNSUPPORTED_MSG)
+
+ def _unlock_file(f):
+ raise IOError(UNSUPPORTED_MSG)
class locked_file(object):
@@ -1276,7 +1410,7 @@ def shell_quote(args):
def smuggle_url(url, data):
""" Pass additional data in a URL for internal use. """
- sdata = compat_urllib_parse.urlencode(
+ sdata = compat_urllib_parse_urlencode(
{'__youtubedl_smuggle': json.dumps(data)})
return url + '#' + sdata
@@ -1304,6 +1438,17 @@ def format_bytes(bytes):
return '%.2f%s' % (converted, suffix)
+def lookup_unit_table(unit_table, s):
+ units_re = '|'.join(re.escape(u) for u in unit_table)
+ m = re.match(
+ r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
+ if not m:
+ return None
+ num_str = m.group('num').replace(',', '.')
+ mult = unit_table[m.group('unit')]
+ return int(float(num_str) * mult)
+
+
def parse_filesize(s):
if s is None:
return None
@@ -1347,15 +1492,28 @@ def parse_filesize(s):
'Yb': 1000 ** 8,
}
- units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
- m = re.match(
- r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
- if not m:
+ return lookup_unit_table(_UNIT_TABLE, s)
+
+
+def parse_count(s):
+ if s is None:
return None
- num_str = m.group('num').replace(',', '.')
- mult = _UNIT_TABLE[m.group('unit')]
- return int(float(num_str) * mult)
+ s = s.strip()
+
+ if re.match(r'^[\d,.]+$', s):
+ return str_to_int(s)
+
+ _UNIT_TABLE = {
+ 'k': 1000,
+ 'K': 1000,
+ 'm': 1000 ** 2,
+ 'M': 1000 ** 2,
+ 'kk': 1000 ** 2,
+ 'KK': 1000 ** 2,
+ }
+
+ return lookup_unit_table(_UNIT_TABLE, s)
def month_by_name(name):
@@ -1387,6 +1545,12 @@ def fix_xml_ampersands(xml_str):
def setproctitle(title):
assert isinstance(title, compat_str)
+
+ # ctypes in Jython is not complete
+ # http://bugs.jython.org/issue2148
+ if sys.platform.startswith('java'):
+ return
+
try:
libc = ctypes.cdll.LoadLibrary('libc.so.6')
except OSError:
@@ -1401,15 +1565,11 @@ def setproctitle(title):
def remove_start(s, start):
- if s.startswith(start):
- return s[len(start):]
- return s
+ return s[len(start):] if s is not None and s.startswith(start) else s
def remove_end(s, end):
- if s.endswith(end):
- return s[:-len(end)]
- return s
+ return s[:-len(end)] if s is not None and s.endswith(end) else s
def remove_quotes(s):
@@ -1472,44 +1632,46 @@ def parse_duration(s):
s = s.strip()
- m = re.match(
- r'''(?ix)(?:P?T)?
- (?:
- (?P<only_mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*|
- (?P<only_hours>[0-9.]+)\s*(?:hours?)|
-
- \s*(?P<hours_reversed>[0-9]+)\s*(?:[:h]|hours?)\s*(?P<mins_reversed>[0-9]+)\s*(?:[:m]|mins?\.?|minutes?)\s*|
- (?:
+ days, hours, mins, secs, ms = [None] * 5
+ m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?$', s)
+ if m:
+ days, hours, mins, secs, ms = m.groups()
+ else:
+ m = re.match(
+ r'''(?ix)(?:P?T)?
(?:
- (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)?
- (?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*
+ (?P<days>[0-9]+)\s*d(?:ays?)?\s*
)?
- (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
- )?
- (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
- )$''', s)
- if not m:
- return None
- res = 0
- if m.group('only_mins'):
- return float_or_none(m.group('only_mins'), invscale=60)
- if m.group('only_hours'):
- return float_or_none(m.group('only_hours'), invscale=60 * 60)
- if m.group('secs'):
- res += int(m.group('secs'))
- if m.group('mins_reversed'):
- res += int(m.group('mins_reversed')) * 60
- if m.group('mins'):
- res += int(m.group('mins')) * 60
- if m.group('hours'):
- res += int(m.group('hours')) * 60 * 60
- if m.group('hours_reversed'):
- res += int(m.group('hours_reversed')) * 60 * 60
- if m.group('days'):
- res += int(m.group('days')) * 24 * 60 * 60
- if m.group('ms'):
- res += float(m.group('ms'))
- return res
+ (?:
+ (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
+ )?
+ (?:
+ (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
+ )?
+ (?:
+ (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
+ )?$''', s)
+ if m:
+ days, hours, mins, secs, ms = m.groups()
+ else:
+ m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)$', s)
+ if m:
+ hours, mins = m.groups()
+ else:
+ return None
+
+ duration = 0
+ if secs:
+ duration += float(secs)
+ if mins:
+ duration += float(mins) * 60
+ if hours:
+ duration += float(hours) * 60 * 60
+ if days:
+ duration += float(days) * 24 * 60 * 60
+ if ms:
+ duration += float(ms)
+ return duration
def prepend_extension(filename, ext, expected_real_ext=None):
@@ -1570,9 +1732,12 @@ class PagedList(object):
class OnDemandPagedList(PagedList):
- def __init__(self, pagefunc, pagesize):
+ def __init__(self, pagefunc, pagesize, use_cache=False):
self._pagefunc = pagefunc
self._pagesize = pagesize
+ self._use_cache = use_cache
+ if use_cache:
+ self._cache = {}
def getslice(self, start=0, end=None):
res = []
@@ -1582,7 +1747,13 @@ class OnDemandPagedList(PagedList):
if start >= nextfirstid:
continue
- page_results = list(self._pagefunc(pagenum))
+ page_results = None
+ if self._use_cache:
+ page_results = self._cache.get(pagenum)
+ if page_results is None:
+ page_results = list(self._pagefunc(pagenum))
+ if self._use_cache:
+ self._cache[pagenum] = page_results
startv = (
start % self._pagesize
@@ -1668,29 +1839,13 @@ def escape_url(url):
"""Escape URL as suggested by RFC 3986"""
url_parsed = compat_urllib_parse_urlparse(url)
return url_parsed._replace(
+ netloc=url_parsed.netloc.encode('idna').decode('ascii'),
path=escape_rfc3986(url_parsed.path),
params=escape_rfc3986(url_parsed.params),
query=escape_rfc3986(url_parsed.query),
fragment=escape_rfc3986(url_parsed.fragment)
).geturl()
-try:
- struct.pack('!I', 0)
-except TypeError:
- # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
- def struct_pack(spec, *args):
- if isinstance(spec, compat_str):
- spec = spec.encode('ascii')
- return struct.pack(spec, *args)
-
- def struct_unpack(spec, *args):
- if isinstance(spec, compat_str):
- spec = spec.encode('ascii')
- return struct.unpack(spec, *args)
-else:
- struct_pack = struct.pack
- struct_unpack = struct.unpack
-
def read_batch_urls(batch_fd):
def fixup(url):
@@ -1709,13 +1864,31 @@ def read_batch_urls(batch_fd):
def urlencode_postdata(*args, **kargs):
- return compat_urllib_parse.urlencode(*args, **kargs).encode('ascii')
+ return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii')
-def encode_dict(d, encoding='utf-8'):
- def encode(v):
- return v.encode(encoding) if isinstance(v, compat_basestring) else v
- return dict((encode(k), encode(v)) for k, v in d.items())
+def update_url_query(url, query):
+ if not query:
+ return url
+ parsed_url = compat_urlparse.urlparse(url)
+ qs = compat_parse_qs(parsed_url.query)
+ qs.update(query)
+ return compat_urlparse.urlunparse(parsed_url._replace(
+ query=compat_urllib_parse_urlencode(qs, True)))
+
+
+def update_Request(req, url=None, data=None, headers={}, query={}):
+ req_headers = req.headers.copy()
+ req_headers.update(headers)
+ req_data = data or req.data
+ req_url = update_url_query(url or req.get_full_url(), query)
+ req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request
+ new_req = req_type(
+ req_url, data=req_data, headers=req_headers,
+ origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
+ if hasattr(req, 'timeout'):
+ new_req.timeout = req.timeout
+ return new_req
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
@@ -1728,6 +1901,16 @@ def dict_get(d, key_or_keys, default=None, skip_false_values=True):
return d.get(key_or_keys, default)
+def try_get(src, getter, expected_type=None):
+ try:
+ v = getter(src)
+ except (AttributeError, KeyError, TypeError, IndexError):
+ pass
+ else:
+ if expected_type is None or isinstance(v, expected_type):
+ return v
+
+
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
@@ -1750,7 +1933,7 @@ def parse_age_limit(s):
def strip_jsonp(code):
return re.sub(
- r'(?s)^[a-zA-Z0-9_.]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
+ r'(?s)^[a-zA-Z0-9_.$]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
def js_to_json(code):
@@ -1758,24 +1941,38 @@ def js_to_json(code):
v = m.group(0)
if v in ('true', 'false', 'null'):
return v
- if v.startswith('"'):
- v = re.sub(r"\\'", "'", v[1:-1])
- elif v.startswith("'"):
- v = v[1:-1]
- v = re.sub(r"\\\\|\\'|\"", lambda m: {
- '\\\\': '\\\\',
- "\\'": "'",
+ elif v.startswith('/*') or v == ',':
+ return ""
+
+ if v[0] in ("'", '"'):
+ v = re.sub(r'(?s)\\.|"', lambda m: {
'"': '\\"',
- }[m.group(0)], v)
+ "\\'": "'",
+ '\\\n': '',
+ '\\x': '\\u00',
+ }.get(m.group(0), m.group(0)), v[1:-1])
+
+ INTEGER_TABLE = (
+ (r'^0[xX][0-9a-fA-F]+', 16),
+ (r'^0+[0-7]+', 8),
+ )
+
+ for regex, base in INTEGER_TABLE:
+ im = re.match(regex, v)
+ if im:
+ i = int(im.group(0), base)
+ return '"%d":' % i if v.endswith(':') else '%d' % i
+
return '"%s"' % v
- res = re.sub(r'''(?x)
- "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"|
- '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
- [a-zA-Z_][.a-zA-Z_0-9]*
+ return re.sub(r'''(?sx)
+ "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
+ '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
+ /\*.*?\*/|,(?=\s*[\]}])|
+ [a-zA-Z_][.a-zA-Z_0-9]*|
+ \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:\s*:)?|
+ [0-9]+(?=\s*:)
''', fix_kv, code)
- res = re.sub(r',(\s*[\]}])', lambda m: m.group(1), res)
- return res
def qualities(quality_ids):
@@ -1823,7 +2020,7 @@ def ytdl_is_updateable():
def args_to_str(args):
# Get a short string representation for a subprocess command
- return ' '.join(shlex_quote(a) for a in args)
+ return ' '.join(compat_shlex_quote(a) for a in args)
def error_to_compat_str(err):
@@ -1836,8 +2033,14 @@ def error_to_compat_str(err):
def mimetype2ext(mt):
+ if mt is None:
+ return None
+
ext = {
'audio/mp4': 'm4a',
+ # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
+ # it's the most popular one
+ 'audio/mpeg': 'mp3',
}.get(mt)
if ext is not None:
return ext
@@ -1858,11 +2061,7 @@ def mimetype2ext(mt):
def urlhandle_detect_ext(url_handle):
- try:
- url_handle.headers
- getheader = lambda h: url_handle.headers[h]
- except AttributeError: # Python < 3
- getheader = url_handle.info().getheader
+ getheader = url_handle.headers.get
cd = getheader('Content-Disposition')
if cd:
@@ -2036,6 +2235,7 @@ def dfxp2srt(dfxp_data):
_x = functools.partial(xpath_with_ns, ns_map={
'ttml': 'http://www.w3.org/ns/ttml',
'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
+ 'ttaf1_0604': 'http://www.w3.org/2006/04/ttaf1',
})
class TTMLPElementParser(object):
@@ -2062,7 +2262,7 @@ def dfxp2srt(dfxp_data):
dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
out = []
- paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p')
+ paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall(_x('.//ttaf1_0604:p')) or dfxp.findall('.//p')
if not paras:
raise ValueError('Invalid dfxp/TTML subtitle')
@@ -2591,6 +2791,10 @@ class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
if proxy == '__noproxy__':
return None # No Proxy
+ if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
+ req.add_header('Ytdl-socks-proxy', proxy)
+ # youtube-dl's http/https handlers do wrapping the socket with socks
+ return None
return compat_urllib_request.ProxyHandler.proxy_open(
self, req, proxy, type)
@@ -2610,3 +2814,50 @@ def ohdave_rsa_encrypt(data, exponent, modulus):
payload = int(binascii.hexlify(data[::-1]), 16)
encrypted = pow(payload, exponent, modulus)
return '%x' % encrypted
+
+
+def encode_base_n(num, n, table=None):
+ FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
+ if not table:
+ table = FULL_TABLE[:n]
+
+ if n > len(table):
+ raise ValueError('base %d exceeds table length %d' % (n, len(table)))
+
+ if num == 0:
+ return table[0]
+
+ ret = ''
+ while num:
+ ret = table[num % n] + ret
+ num = num // n
+ return ret
+
+
+def decode_packed_codes(code):
+ mobj = re.search(
+ r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)",
+ code)
+ obfucasted_code, base, count, symbols = mobj.groups()
+ base = int(base)
+ count = int(count)
+ symbols = symbols.split('|')
+ symbol_table = {}
+
+ while count:
+ count -= 1
+ base_n_count = encode_base_n(count, base)
+ symbol_table[base_n_count] = symbols[count] or base_n_count
+
+ return re.sub(
+ r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
+ obfucasted_code)
+
+
+def parse_m3u8_attributes(attrib):
+ info = {}
+ for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
+ if val.startswith('"'):
+ val = val[1:-1]
+ info[key] = val
+ return info
diff --git a/youtube_dl/version.py b/youtube_dl/version.py
index 7a3df6a..2b7a4c9 100644
--- a/youtube_dl/version.py
+++ b/youtube_dl/version.py
@@ -1,3 +1,3 @@
from __future__ import unicode_literals
-__version__ = '2016.02.22'
+__version__ = '2016.06.25'