From 47e0cef46e9a76e589a2b1cde1b2dfa8bcf01d8e Mon Sep 17 00:00:00 2001 From: Tithen-Firion Date: Sun, 16 Apr 2017 00:34:34 +0200 Subject: [openload] rewrite extractor --- youtube_dl/extractor/openload.py | 118 +++++++++++++++++++++++---------------- 1 file changed, 71 insertions(+), 47 deletions(-) diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index d8036b5..789bf99 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -1,12 +1,16 @@ # coding: utf-8 from __future__ import unicode_literals +import os import re +import subprocess +import tempfile from .common import InfoExtractor -from ..compat import compat_chr from ..utils import ( + check_executable, determine_ext, + encodeArgument, ExtractorError, ) @@ -58,6 +62,39 @@ class OpenloadIE(InfoExtractor): 'only_matching': True, }] + _PHANTOMJS_SCRIPT = r''' + phantom.onError = function(msg, trace) { + var msgStack = ['PHANTOM ERROR: ' + msg]; + if(trace && trace.length) { + msgStack.push('TRACE:'); + trace.forEach(function(t) { + msgStack.push(' -> ' + (t.file || t.sourceURL) + ': ' + t.line + + (t.function ? ' (in function ' + t.function +')' : '')); + }); + } + console.error(msgStack.join('\n')); + phantom.exit(1); + }; + var page = require('webpage').create(); + page.settings.resourceTimeout = 10000; + page.onInitialized = function() { + page.evaluate(function() { + delete window._phantom; + delete window.callPhantom; + }); + }; + page.open('https://openload.co/embed/%s/', function(status) { + var info = page.evaluate(function() { + return { + decoded_id: document.getElementById('streamurl').innerHTML, + title: document.querySelector('meta[name="og:title"],' + + 'meta[name=description]').content + }; + }); + console.log(info.decoded_id + ' ' + info.title); + phantom.exit(); + });''' + @staticmethod def _extract_urls(webpage): return re.findall( @@ -65,61 +102,48 @@ class OpenloadIE(InfoExtractor): webpage) def _real_extract(self, url): + exe = check_executable('phantomjs', ['-v']) + if not exe: + raise ExtractorError('PhantomJS executable not found in PATH, ' + 'download it from http://phantomjs.org', + expected=True) + video_id = self._match_id(url) - webpage = self._download_webpage('https://openload.co/embed/%s/' % video_id, video_id) + url = 'https://openload.co/embed/%s/' % video_id + webpage = self._download_webpage(url, video_id) if 'File not found' in webpage or 'deleted by the owner' in webpage: - raise ExtractorError('File not found', expected=True) - - ol_id = self._search_regex( - ']+id="[^"]+"[^>]*>([0-9A-Za-z]+)', - webpage, 'openload ID') - - decoded = '' - a = ol_id[0:24] - b = [] - for i in range(0, len(a), 8): - b.append(int(a[i:i + 8] or '0', 16)) - ol_id = ol_id[24:] - j = 0 - k = 0 - while j < len(ol_id): - c = 128 - d = 0 - e = 0 - f = 0 - _more = True - while _more: - if j + 1 >= len(ol_id): - c = 143 - f = int(ol_id[j:j + 2] or '0', 16) - j += 2 - d += (f & 127) << e - e += 7 - _more = f >= c - g = d ^ b[k % 3] - for i in range(4): - char_dec = (g >> 8 * i) & (c + 127) - char = compat_chr(char_dec) - if char != '#': - decoded += char - k += 1 - - video_url = 'https://openload.co/stream/%s?mime=true' - video_url = video_url % decoded - - title = self._og_search_title(webpage, default=None) or self._search_regex( - r']+class=["\']title["\'][^>]*>([^<]+)', webpage, - 'title', default=None) or self._html_search_meta( - 'description', webpage, 'title', fatal=True) + raise ExtractorError('File not found', expected=True, video_id=video_id) + + script_file = tempfile.NamedTemporaryFile(mode='w', delete=False) + + # write JS script to file and close it + with script_file: + script_file.write(self._PHANTOMJS_SCRIPT % video_id) + + self.to_screen('%s: Decoding video ID with PhantomJS' % video_id) + + p = subprocess.Popen([exe, '--ssl-protocol=any', script_file.name], + stdout=subprocess.PIPE, stderr=subprocess.PIPE) + output, err = p.communicate() + if p.returncode != 0: + raise ExtractorError('Decoding failed\n:' + + encodeArgument(err)) + else: + decoded_id, title = encodeArgument(output).strip().split(' ', 1) + + os.remove(script_file.name) + + video_url = 'https://openload.co/stream/%s?mime=true' % decoded_id entries = self._parse_html5_media_entries(url, webpage, video_id) - subtitles = entries[0]['subtitles'] if entries else None + entry = entries[0] if entries else {} + subtitles = entry.get('subtitles') info_dict = { 'id': video_id, 'title': title, - 'thumbnail': self._og_search_thumbnail(webpage, default=None), + 'thumbnail': entry.get('thumbnail') or self._og_search_thumbnail(webpage, default=None), 'url': video_url, # Seems all videos have extensions in their titles 'ext': determine_ext(title, 'mp4'), -- cgit v1.1 From da57ebaf84225240b356530cdf02d12596f0dce8 Mon Sep 17 00:00:00 2001 From: Tithen-Firion Date: Tue, 25 Apr 2017 01:06:14 +0200 Subject: [openload] separate PhantomJS code from extractor --- youtube_dl/extractor/openload.py | 78 +++++----------------- youtube_dl/utils.py | 141 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 157 insertions(+), 62 deletions(-) diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index 789bf99..ac5e0bb 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -1,17 +1,14 @@ # coding: utf-8 from __future__ import unicode_literals -import os import re -import subprocess -import tempfile from .common import InfoExtractor from ..utils import ( - check_executable, determine_ext, - encodeArgument, ExtractorError, + get_element_by_id, + PhantomJSwrapper, ) @@ -62,38 +59,7 @@ class OpenloadIE(InfoExtractor): 'only_matching': True, }] - _PHANTOMJS_SCRIPT = r''' - phantom.onError = function(msg, trace) { - var msgStack = ['PHANTOM ERROR: ' + msg]; - if(trace && trace.length) { - msgStack.push('TRACE:'); - trace.forEach(function(t) { - msgStack.push(' -> ' + (t.file || t.sourceURL) + ': ' + t.line - + (t.function ? ' (in function ' + t.function +')' : '')); - }); - } - console.error(msgStack.join('\n')); - phantom.exit(1); - }; - var page = require('webpage').create(); - page.settings.resourceTimeout = 10000; - page.onInitialized = function() { - page.evaluate(function() { - delete window._phantom; - delete window.callPhantom; - }); - }; - page.open('https://openload.co/embed/%s/', function(status) { - var info = page.evaluate(function() { - return { - decoded_id: document.getElementById('streamurl').innerHTML, - title: document.querySelector('meta[name="og:title"],' - + 'meta[name=description]').content - }; - }); - console.log(info.decoded_id + ' ' + info.title); - phantom.exit(); - });''' + _USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36' @staticmethod def _extract_urls(webpage): @@ -102,40 +68,27 @@ class OpenloadIE(InfoExtractor): webpage) def _real_extract(self, url): - exe = check_executable('phantomjs', ['-v']) - if not exe: - raise ExtractorError('PhantomJS executable not found in PATH, ' - 'download it from http://phantomjs.org', - expected=True) - video_id = self._match_id(url) url = 'https://openload.co/embed/%s/' % video_id - webpage = self._download_webpage(url, video_id) + headers = { + 'User-Agent': self._USER_AGENT, + } + + phantom = PhantomJSwrapper(self) + webpage, _ = phantom.get(url, video_id=video_id, headers=headers) if 'File not found' in webpage or 'deleted by the owner' in webpage: raise ExtractorError('File not found', expected=True, video_id=video_id) - script_file = tempfile.NamedTemporaryFile(mode='w', delete=False) - - # write JS script to file and close it - with script_file: - script_file.write(self._PHANTOMJS_SCRIPT % video_id) - - self.to_screen('%s: Decoding video ID with PhantomJS' % video_id) - - p = subprocess.Popen([exe, '--ssl-protocol=any', script_file.name], - stdout=subprocess.PIPE, stderr=subprocess.PIPE) - output, err = p.communicate() - if p.returncode != 0: - raise ExtractorError('Decoding failed\n:' - + encodeArgument(err)) - else: - decoded_id, title = encodeArgument(output).strip().split(' ', 1) - - os.remove(script_file.name) + decoded_id = get_element_by_id('streamurl', webpage) video_url = 'https://openload.co/stream/%s?mime=true' % decoded_id + title = self._og_search_title(webpage, default=None) or self._search_regex( + r']+class=["\']title["\'][^>]*>([^<]+)', webpage, + 'title', default=None) or self._html_search_meta( + 'description', webpage, 'title', fatal=True) + entries = self._parse_html5_media_entries(url, webpage, video_id) entry = entries[0] if entries else {} subtitles = entry.get('subtitles') @@ -148,5 +101,6 @@ class OpenloadIE(InfoExtractor): # Seems all videos have extensions in their titles 'ext': determine_ext(title, 'mp4'), 'subtitles': subtitles, + 'http_headers': headers, } return info_dict diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 2340bc3..94e1b07 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -3652,3 +3652,144 @@ def write_xattr(path, key, value): "Couldn't find a tool to set the xattrs. " "Install either the python 'xattr' module, " "or the 'xattr' binary.") + + +class PhantomJSwrapper(object): + """PhantomJS wrapper class""" + + _TEMPLATE = r''' + phantom.onError = function(msg, trace) {{ + var msgStack = ['PHANTOM ERROR: ' + msg]; + if(trace && trace.length) {{ + msgStack.push('TRACE:'); + trace.forEach(function(t) {{ + msgStack.push(' -> ' + (t.file || t.sourceURL) + ': ' + t.line + + (t.function ? ' (in function ' + t.function +')' : '')); + }}); + }} + console.error(msgStack.join('\n')); + phantom.exit(1); + }}; + var page = require('webpage').create(); + var fs = require('fs'); + var read = {{ mode: 'r', charset: 'utf-8' }}; + var write = {{ mode: 'w', charset: 'utf-8' }}; + page.settings.resourceTimeout = {timeout}; + page.settings.userAgent = "{ua}"; + page.onLoadStarted = function() {{ + page.evaluate(function() {{ + delete window._phantom; + delete window.callPhantom; + }}); + }}; + var saveAndExit = function() {{ + fs.write("{html}", page.content, write); + phantom.exit(); + }}; + page.onLoadFinished = function(status) {{ + if(page.url === "") {{ + page.setContent(fs.read("{html}", read), "{url}"); + }} + else {{ + {jscode} + }} + }}; + page.open(""); + ''' + + _TMP_FILE_NAMES = ['script', 'html'] + + def __init__(self, extractor, timeout=10000): + self.exe = check_executable('phantomjs', ['-v']) + if not self.exe: + raise ExtractorError('PhantomJS executable not found in PATH, ' + 'download it from http://phantomjs.org', + expected=True) + self.extractor = extractor + self.options = { + 'timeout': timeout, + } + self._TMP_FILES = {} + for name in self._TMP_FILE_NAMES: + tmp = tempfile.NamedTemporaryFile(delete=False) + tmp.close() + self._TMP_FILES[name] = tmp + + def __del__(self): + for name in self._TMP_FILE_NAMES: + try: + os.remove(self._TMP_FILES[name].name) + except: + pass + + def get(self, url, html=None, video_id=None, note=None, note2='Executing JS on webpage', headers={}, jscode='saveAndExit();'): + """ + Downloads webpage (if needed) and executes JS + + Params: + url: website url + html: optional, html code of website + video_id: video id + note: optional, displayed when downloading webpage + note2: optional, displayed when executing JS + headers: custom http headers + jscode: code to be executed when page is loaded + + Returns tuple with: + * downloaded website (after JS execution) + * anything you print with `console.log` (but not inside `page.execute`!) + + In most cases you don't need to add any `jscode`. + It is executed in `page.onLoadFinished`. + `saveAndExit();` is mandatory, use it instead of `phantom.exit()` + It is possible to wait for some element on the webpage, for example: + var check = function() { + var elementFound = page.evaluate(function() { + return document.querySelector('#b.done') !== null; + }); + if(elementFound) + saveAndExit(); + else + window.setTimeout(check, 500); + } + + page.evaluate(function(){ + document.querySelector('#a').click(); + }); + check(); + """ + if 'saveAndExit();' not in jscode: + raise ExtractorError('`saveAndExit();` not found in `jscode`') + if not html: + html = self.extractor._download_webpage(url, video_id, note=note, headers=headers) + with open(self._TMP_FILES['html'].name, 'wb') as f: + f.write(html.encode('utf-8')) + + replaces = self.options + replaces['url'] = url + user_agent = headers.get('User-Agent') or std_headers['User-Agent'] + replaces['ua'] = user_agent.replace('"', '\\"') + replaces['jscode'] = jscode + + for x in self._TMP_FILE_NAMES: + replaces[x] = self._TMP_FILES[x].name.replace('\\', '\\\\').replace('"', '\\"') + + with open(self._TMP_FILES['script'].name, 'wb') as f: + f.write(self._TEMPLATE.format(**replaces).encode('utf-8')) + + if video_id is None: + self.extractor.to_screen('%s' % (note2,)) + else: + self.extractor.to_screen('%s: %s' % (video_id, note2)) + + p = subprocess.Popen([self.exe, '--ssl-protocol=any', + self._TMP_FILES['script'].name], stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + out, err = p.communicate() + if p.returncode != 0: + raise ExtractorError('Executing JS failed\n:' + + encodeArgument(err)) + with open(self._TMP_FILES['html'].name, 'rb') as f: + html = f.read().decode('utf-8') + return (html, encodeArgument(out)) + -- cgit v1.1 From 40e41780f1d770a355f01e3c1e6fb09ff392f97e Mon Sep 17 00:00:00 2001 From: Tithen-Firion Date: Tue, 25 Apr 2017 15:12:54 +0200 Subject: [phantomjs] add cookie support --- youtube_dl/extractor/common.py | 8 ++++-- youtube_dl/utils.py | 62 +++++++++++++++++++++++++++++++++++++++++- 2 files changed, 66 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index dcc9d62..e54adc9 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -2343,10 +2343,12 @@ class InfoExtractor(object): self._downloader.report_warning(msg) return res - def _set_cookie(self, domain, name, value, expire_time=None): + def _set_cookie(self, domain, name, value, expire_time=None, port=None, + path='/', secure=False, discard=False, rest={}, **kwargs): cookie = compat_cookiejar.Cookie( - 0, name, value, None, None, domain, None, - None, '/', True, False, expire_time, '', None, None, None) + 0, name, value, port, not port is None, domain, True, + domain.startswith('.'), path, True, secure, expire_time, + discard, None, None, rest) self._downloader.cookiejar.set_cookie(cookie) def _get_cookies(self, url): diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 94e1b07..9c94b7e 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -3654,6 +3654,37 @@ def write_xattr(path, key, value): "or the 'xattr' binary.") +def cookie_to_dict(cookie): + cookie_dict = { + 'name': cookie.name, + 'value': cookie.value, + }; + if cookie.port_specified: + cookie_dict['port'] = cookie.port + if cookie.domain_specified: + cookie_dict['domain'] = cookie.domain + if cookie.path_specified: + cookie_dict['path'] = cookie.path + if not cookie.expires is None: + cookie_dict['expires'] = cookie.expires + if not cookie.secure is None: + cookie_dict['secure'] = cookie.secure + if not cookie.discard is None: + cookie_dict['discard'] = cookie.discard + try: + if (cookie.has_nonstandard_attr('httpOnly') or + cookie.has_nonstandard_attr('httponly') or + cookie.has_nonstandard_attr('HttpOnly')): + cookie_dict['httponly'] = True + except TypeError: + pass + return cookie_dict + + +def cookie_jar_to_list(cookie_jar): + return [cookie_to_dict(cookie) for cookie in cookie_jar] + + class PhantomJSwrapper(object): """PhantomJS wrapper class""" @@ -3674,6 +3705,9 @@ class PhantomJSwrapper(object): var fs = require('fs'); var read = {{ mode: 'r', charset: 'utf-8' }}; var write = {{ mode: 'w', charset: 'utf-8' }}; + JSON.parse(fs.read("{cookies}", read)).forEach(function(x) {{ + phantom.addCookie(x); + }}); page.settings.resourceTimeout = {timeout}; page.settings.userAgent = "{ua}"; page.onLoadStarted = function() {{ @@ -3684,6 +3718,7 @@ class PhantomJSwrapper(object): }}; var saveAndExit = function() {{ fs.write("{html}", page.content, write); + fs.write("{cookies}", JSON.stringify(phantom.cookies), write); phantom.exit(); }}; page.onLoadFinished = function(status) {{ @@ -3697,7 +3732,7 @@ class PhantomJSwrapper(object): page.open(""); ''' - _TMP_FILE_NAMES = ['script', 'html'] + _TMP_FILE_NAMES = ['script', 'html', 'cookies'] def __init__(self, extractor, timeout=10000): self.exe = check_executable('phantomjs', ['-v']) @@ -3722,6 +3757,26 @@ class PhantomJSwrapper(object): except: pass + def _save_cookies(self, url): + cookies = cookie_jar_to_list(self.extractor._downloader.cookiejar) + for cookie in cookies: + if 'path' not in cookie: + cookie['path'] = '/' + if 'domain' not in cookie: + cookie['domain'] = compat_urlparse.urlparse(url).netloc + with open(self._TMP_FILES['cookies'].name, 'wb') as f: + f.write(json.dumps(cookies).encode('utf-8')) + + def _load_cookies(self): + with open(self._TMP_FILES['cookies'].name, 'rb') as f: + cookies = json.loads(f.read().decode('utf-8')) + for cookie in cookies: + if cookie['httponly'] is True: + cookie['rest'] = { 'httpOnly': None } + if 'expiry' in cookie: + cookie['expire_time'] = cookie['expiry'] + self.extractor._set_cookie(**cookie) + def get(self, url, html=None, video_id=None, note=None, note2='Executing JS on webpage', headers={}, jscode='saveAndExit();'): """ Downloads webpage (if needed) and executes JS @@ -3765,6 +3820,8 @@ class PhantomJSwrapper(object): with open(self._TMP_FILES['html'].name, 'wb') as f: f.write(html.encode('utf-8')) + self._save_cookies(url) + replaces = self.options replaces['url'] = url user_agent = headers.get('User-Agent') or std_headers['User-Agent'] @@ -3791,5 +3848,8 @@ class PhantomJSwrapper(object): + encodeArgument(err)) with open(self._TMP_FILES['html'].name, 'rb') as f: html = f.read().decode('utf-8') + + self._load_cookies() + return (html, encodeArgument(out)) -- cgit v1.1 From fcace2d1adac5d1f306b22219fde3a4542bcd719 Mon Sep 17 00:00:00 2001 From: Tithen-Firion Date: Sat, 29 Apr 2017 10:30:45 +0200 Subject: [openload] raise `not found` before executing js --- youtube_dl/extractor/openload.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index ac5e0bb..0adf177 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -74,12 +74,14 @@ class OpenloadIE(InfoExtractor): 'User-Agent': self._USER_AGENT, } - phantom = PhantomJSwrapper(self) - webpage, _ = phantom.get(url, video_id=video_id, headers=headers) + webpage = self._download_webpage(url, video_id, headers=headers) if 'File not found' in webpage or 'deleted by the owner' in webpage: raise ExtractorError('File not found', expected=True, video_id=video_id) + phantom = PhantomJSwrapper(self) + webpage, _ = phantom.get(url, html=webpage, video_id=video_id, headers=headers) + decoded_id = get_element_by_id('streamurl', webpage) video_url = 'https://openload.co/stream/%s?mime=true' % decoded_id -- cgit v1.1 From 98f9d873814da2a8584cc30c0e197c15ed249db3 Mon Sep 17 00:00:00 2001 From: Tithen-Firion Date: Sat, 29 Apr 2017 12:41:42 +0200 Subject: [phantomjs] Add required version checking --- youtube_dl/utils.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 9c94b7e..84aaac6 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -3734,13 +3734,22 @@ class PhantomJSwrapper(object): _TMP_FILE_NAMES = ['script', 'html', 'cookies'] - def __init__(self, extractor, timeout=10000): + def __init__(self, extractor, required_version=None, timeout=10000): self.exe = check_executable('phantomjs', ['-v']) if not self.exe: raise ExtractorError('PhantomJS executable not found in PATH, ' 'download it from http://phantomjs.org', expected=True) + self.extractor = extractor + + if required_version: + version = get_exe_version(self.exe, version_re=r'([0-9.]+)') + if is_outdated_version(version, required_version): + self.extractor._downloader.report_warning( + 'Your copy of PhantomJS is outdated, update it to version ' + '%s or newer if you encounter any errors.' % required_version) + self.options = { 'timeout': timeout, } -- cgit v1.1 From 7552f96352f35cd877e52fd0770b77ba1856fc62 Mon Sep 17 00:00:00 2001 From: Tithen-Firion Date: Sat, 29 Apr 2017 12:41:57 +0200 Subject: [openload] Add required version --- youtube_dl/extractor/openload.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index 0adf177..292476e 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -79,7 +79,7 @@ class OpenloadIE(InfoExtractor): if 'File not found' in webpage or 'deleted by the owner' in webpage: raise ExtractorError('File not found', expected=True, video_id=video_id) - phantom = PhantomJSwrapper(self) + phantom = PhantomJSwrapper(self, required_version='2.0') webpage, _ = phantom.get(url, html=webpage, video_id=video_id, headers=headers) decoded_id = get_element_by_id('streamurl', webpage) -- cgit v1.1 From 5ff1bc0cc10bf3006834c4b49fc36d733c83ce5c Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 28 Apr 2017 22:25:20 +0100 Subject: [YoutubeDL] write raw subtitle files --- youtube_dl/YoutubeDL.py | 43 ++++++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index eb465c4..c7100bb 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1696,29 +1696,30 @@ class YoutubeDL(object): ie = self.get_info_extractor(info_dict['extractor_key']) for sub_lang, sub_info in subtitles.items(): sub_format = sub_info['ext'] - if sub_info.get('data') is not None: - sub_data = sub_info['data'] + sub_filename = subtitles_filename(filename, sub_lang, sub_format) + if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)): + self.to_screen('[info] Video subtitle %s.%s is already present' % (sub_lang, sub_format)) else: - try: - sub_data = ie._download_webpage( - sub_info['url'], info_dict['id'], note=False) - except ExtractorError as err: - self.report_warning('Unable to download subtitle for "%s": %s' % - (sub_lang, error_to_compat_str(err.cause))) - continue - try: - sub_filename = subtitles_filename(filename, sub_lang, sub_format) - if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)): - self.to_screen('[info] Video subtitle %s.%s is already_present' % (sub_lang, sub_format)) + self.to_screen('[info] Writing video subtitles to: ' + sub_filename) + if sub_info.get('data') is not None: + try: + # Use newline='' to prevent conversion of newline characters + # See https://github.com/rg3/youtube-dl/issues/10268 + with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8', newline='') as subfile: + subfile.write(sub_info['data']) + except (OSError, IOError): + self.report_error('Cannot write subtitles file ' + sub_filename) + return else: - self.to_screen('[info] Writing video subtitles to: ' + sub_filename) - # Use newline='' to prevent conversion of newline characters - # See https://github.com/rg3/youtube-dl/issues/10268 - with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8', newline='') as subfile: - subfile.write(sub_data) - except (OSError, IOError): - self.report_error('Cannot write subtitles file ' + sub_filename) - return + try: + sub_data = ie._request_webpage( + sub_info['url'], info_dict['id'], note=False).read() + with io.open(encodeFilename(sub_filename), 'wb') as subfile: + subfile.write(sub_data) + except (ExtractorError, IOError, OSError, ValueError) as err: + self.report_warning('Unable to download subtitle for "%s": %s' % + (sub_lang, error_to_compat_str(err))) + continue if self.params.get('writeinfojson', False): infofn = replace_extension(filename, 'info.json', info_dict.get('ext')) -- cgit v1.1 From feee8d32e45c9521426cf4a089c70f37542f0065 Mon Sep 17 00:00:00 2001 From: Tithen-Firion Date: Thu, 3 Aug 2017 14:17:25 +0200 Subject: [phantomjs] add exe version to debug info --- youtube_dl/YoutubeDL.py | 2 ++ youtube_dl/utils.py | 6 +++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index eb465c4..033b507 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -86,6 +86,7 @@ from .utils import ( write_string, YoutubeDLCookieProcessor, YoutubeDLHandler, + PhantomJSwrapper, ) from .cache import Cache from .extractor import get_info_extractor, gen_extractor_classes, _LAZY_LOADER @@ -2146,6 +2147,7 @@ class YoutubeDL(object): exe_versions = FFmpegPostProcessor.get_versions(self) exe_versions['rtmpdump'] = rtmpdump_version() + exe_versions['phantomjs'] = PhantomJSwrapper._version() exe_str = ', '.join( '%s %s' % (exe, v) for exe, v in sorted(exe_versions.items()) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index c67f95a..4d0685d 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -3895,6 +3895,10 @@ class PhantomJSwrapper(object): _TMP_FILE_NAMES = ['script', 'html', 'cookies'] + @staticmethod + def _version(): + return get_exe_version('phantomjs', version_re=r'([0-9.]+)') + def __init__(self, extractor, required_version=None, timeout=10000): self.exe = check_executable('phantomjs', ['-v']) if not self.exe: @@ -3905,7 +3909,7 @@ class PhantomJSwrapper(object): self.extractor = extractor if required_version: - version = get_exe_version(self.exe, version_re=r'([0-9.]+)') + version = self._version() if is_outdated_version(version, required_version): self.extractor._downloader.report_warning( 'Your copy of PhantomJS is outdated, update it to version ' -- cgit v1.1 From e7c3e33456e155106ad08347d8ab9a2ecd0c8a11 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 14 Sep 2017 23:19:53 +0700 Subject: [downloader/fragment] Restart inconsistent incomplete fragment downloads (#13731) --- youtube_dl/downloader/fragment.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/youtube_dl/downloader/fragment.py b/youtube_dl/downloader/fragment.py index bccc8ec..6f6fb4a 100644 --- a/youtube_dl/downloader/fragment.py +++ b/youtube_dl/downloader/fragment.py @@ -151,10 +151,15 @@ class FragmentFD(FileDownloader): if self.__do_ytdl_file(ctx): if os.path.isfile(encodeFilename(self.ytdl_filename(ctx['filename']))): self._read_ytdl_file(ctx) + if ctx['fragment_index'] > 0 and resume_len == 0: + self.report_error( + 'Inconsistent state of incomplete fragment download. ' + 'Restarting from the beginning...') + ctx['fragment_index'] = resume_len = 0 + self._write_ytdl_file(ctx) else: self._write_ytdl_file(ctx) - if ctx['fragment_index'] > 0: - assert resume_len > 0 + assert ctx['fragment_index'] == 0 dest_stream, tmpfilename = sanitize_open(tmpfilename, open_mode) -- cgit v1.1 From 319fc70676fea19b71437aab4078d4d72cc1ba5f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 14 Sep 2017 23:50:19 +0700 Subject: [tv4] Relax _VALID_URL (closes #14206) --- youtube_dl/extractor/tv4.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tv4.py b/youtube_dl/extractor/tv4.py index 7aeb2c6..6487039 100644 --- a/youtube_dl/extractor/tv4.py +++ b/youtube_dl/extractor/tv4.py @@ -18,7 +18,7 @@ class TV4IE(InfoExtractor): tv4\.se/(?:[^/]+)/klipp/(?:.*)-| tv4play\.se/ (?: - (?:program|barn)/(?:[^\?]+)\?video_id=| + (?:program|barn)/(?:[^/]+/|(?:[^\?]+)\?video_id=)| iframe/video/| film/| sport/| @@ -63,6 +63,10 @@ class TV4IE(InfoExtractor): 'url': 'http://www.tv4play.se/barn/looney-tunes?video_id=3062412', 'only_matching': True, }, + { + 'url': ' http://www.tv4play.se/program/farang/3922081', + 'only_matching': True, + } ] def _real_extract(self, url): -- cgit v1.1 From 0732a90579091ad60b124fd693bdda8ee526e305 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Thu, 14 Sep 2017 20:37:46 +0200 Subject: [orf] Add new extractor for f4m stories --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/orf.py | 114 +++++++++++++++++++++++++++++++++++-- 2 files changed, 111 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index aefadc5..a3a97e9 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -768,6 +768,7 @@ from .ora import OraTVIE from .orf import ( ORFTVthekIE, ORFFM4IE, + ORFFM4StoryIE, ORFOE1IE, ORFIPTVIE, ) diff --git a/youtube_dl/extractor/orf.py b/youtube_dl/extractor/orf.py index cc296ea..74fe801 100644 --- a/youtube_dl/extractor/orf.py +++ b/youtube_dl/extractor/orf.py @@ -6,14 +6,15 @@ import re from .common import InfoExtractor from ..compat import compat_str from ..utils import ( + determine_ext, + float_or_none, HEADRequest, - unified_strdate, - strip_jsonp, int_or_none, - float_or_none, - determine_ext, + orderedSet, remove_end, + strip_jsonp, unescapeHTML, + unified_strdate, ) @@ -307,3 +308,108 @@ class ORFIPTVIE(InfoExtractor): 'upload_date': upload_date, 'formats': formats, } + + +class ORFFM4StoryIE(InfoExtractor): + IE_NAME = 'orf:fm4:story' + IE_DESC = 'fm4.orf.at stories' + _VALID_URL = r'https?://fm4\.orf\.at/stories/(?P\d+)' + + _TEST = { + 'url': 'http://fm4.orf.at/stories/2865738/', + 'playlist': [{ + 'md5': 'e1c2c706c45c7b34cf478bbf409907ca', + 'info_dict': { + 'id': '547792', + 'ext': 'flv', + 'title': 'Manu Delago und Inner Tongue live', + 'description': 'Manu Delago und Inner Tongue haben bei der FM4 Soundpark Session live alles gegeben. Hier gibt es Fotos und die gesamte Session als Video.', + 'duration': 1748.52, + 'thumbnail': r're:^https?://.*\.jpg$', + 'upload_date': '20170913', + }, + }, { + 'md5': 'c6dd2179731f86f4f55a7b49899d515f', + 'info_dict': { + 'id': '547798', + 'ext': 'flv', + 'title': 'Manu Delago und Inner Tongue live (2)', + 'duration': 1504.08, + 'thumbnail': r're:^https?://.*\.jpg$', + 'upload_date': '20170913', + 'description': 'Manu Delago und Inner Tongue haben bei der FM4 Soundpark Session live alles gegeben. Hier gibt es Fotos und die gesamte Session als Video.', + }, + }], + } + + def _real_extract(self, url): + story_id = self._match_id(url) + webpage = self._download_webpage(url, story_id) + + entries = [] + all_ids = orderedSet(re.findall(r'data-video(?:id)?="(\d+)"', webpage)) + for idx, video_id in enumerate(all_ids): + data = self._download_json( + 'http://bits.orf.at/filehandler/static-api/json/current/data.json?file=%s' % video_id, + video_id)[0] + + duration = float_or_none(data['duration'], 1000) + + video = data['sources']['q8c'] + load_balancer_url = video['loadBalancerUrl'] + abr = int_or_none(video.get('audioBitrate')) + vbr = int_or_none(video.get('bitrate')) + fps = int_or_none(video.get('videoFps')) + width = int_or_none(video.get('videoWidth')) + height = int_or_none(video.get('videoHeight')) + thumbnail = video.get('preview') + + rendition = self._download_json( + load_balancer_url, video_id, transform_source=strip_jsonp) + + f = { + 'abr': abr, + 'vbr': vbr, + 'fps': fps, + 'width': width, + 'height': height, + } + + formats = [] + for format_id, format_url in rendition['redirect'].items(): + if format_id == 'rtmp': + ff = f.copy() + ff.update({ + 'url': format_url, + 'format_id': format_id, + }) + formats.append(ff) + elif determine_ext(format_url) == 'f4m': + formats.extend(self._extract_f4m_formats( + format_url, video_id, f4m_id=format_id)) + elif determine_ext(format_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', m3u8_id=format_id)) + else: + continue + self._sort_formats(formats) + + title = remove_end(self._og_search_title(webpage), ' - fm4.ORF.at') + if idx >= 1: + # Titles are duplicates, make them unique + title += ' (' + str(idx + 1) + ')' + description = self._og_search_description(webpage) + upload_date = unified_strdate(self._html_search_meta( + 'dc.date', webpage, 'upload date')) + + entries.append({ + 'id': video_id, + 'title': title, + 'description': description, + 'duration': duration, + 'thumbnail': thumbnail, + 'upload_date': upload_date, + 'formats': formats, + }) + + return self.playlist_result(entries) -- cgit v1.1 From fad9fc537d8ab5141f402f6ccdf60161a8d0302a Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Thu, 14 Sep 2017 20:47:23 +0200 Subject: [tv4] fix a test URL --- youtube_dl/extractor/tv4.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tv4.py b/youtube_dl/extractor/tv4.py index 6487039..cfcce02 100644 --- a/youtube_dl/extractor/tv4.py +++ b/youtube_dl/extractor/tv4.py @@ -64,7 +64,7 @@ class TV4IE(InfoExtractor): 'only_matching': True, }, { - 'url': ' http://www.tv4play.se/program/farang/3922081', + 'url': 'http://www.tv4play.se/program/farang/3922081', 'only_matching': True, } ] -- cgit v1.1 From c46680fb2a0ee61faa25863d3c10b7b098cdbe67 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 15 Sep 2017 01:59:47 +0700 Subject: [condenast] Fix extraction (closes #14196, closes #14207) --- youtube_dl/extractor/condenast.py | 54 +++++++++++++++++++++++---------------- 1 file changed, 32 insertions(+), 22 deletions(-) diff --git a/youtube_dl/extractor/condenast.py b/youtube_dl/extractor/condenast.py index 0c3f0c0..ed278fe 100644 --- a/youtube_dl/extractor/condenast.py +++ b/youtube_dl/extractor/condenast.py @@ -116,16 +116,16 @@ class CondeNastIE(InfoExtractor): entries = [self.url_result(build_url(path), 'CondeNast') for path in paths] return self.playlist_result(entries, playlist_title=title) - def _extract_video_params(self, webpage): - query = {} - params = self._search_regex( - r'(?s)var params = {(.+?)}[;,]', webpage, 'player params', default=None) - if params: - query.update({ - 'videoId': self._search_regex(r'videoId: [\'"](.+?)[\'"]', params, 'video id'), - 'playerId': self._search_regex(r'playerId: [\'"](.+?)[\'"]', params, 'player id'), - 'target': self._search_regex(r'target: [\'"](.+?)[\'"]', params, 'target'), - }) + def _extract_video_params(self, webpage, display_id): + query = self._parse_json( + self._search_regex( + r'(?s)var\s+params\s*=\s*({.+?})[;,]', webpage, 'player params', + default='{}'), + display_id, transform_source=js_to_json, fatal=False) + if query: + query['videoId'] = self._search_regex( + r'(?:data-video-id=|currentVideoId\s*=\s*)["\']([\da-f]+)', + webpage, 'video id', default=None) else: params = extract_attributes(self._search_regex( r'(<[^>]+data-js="video-player"[^>]+>)', @@ -141,17 +141,27 @@ class CondeNastIE(InfoExtractor): video_id = params['videoId'] video_info = None - if params.get('playerId'): - info_page = self._download_json( - 'http://player.cnevids.com/player/video.js', - video_id, 'Downloading video info', fatal=False, query=params) - if info_page: - video_info = info_page.get('video') - if not video_info: - info_page = self._download_webpage( - 'http://player.cnevids.com/player/loader.js', - video_id, 'Downloading loader info', query=params) - else: + + # New API path + query = params.copy() + query['embedType'] = 'inline' + info_page = self._download_json( + 'http://player.cnevids.com/embed-api.json', video_id, + 'Downloading embed info', fatal=False, query=query) + + # Old fallbacks + if not info_page: + if params.get('playerId'): + info_page = self._download_json( + 'http://player.cnevids.com/player/video.js', video_id, + 'Downloading video info', fatal=False, query=params) + if info_page: + video_info = info_page.get('video') + if not video_info: + info_page = self._download_webpage( + 'http://player.cnevids.com/player/loader.js', + video_id, 'Downloading loader info', query=params) + if not video_info: info_page = self._download_webpage( 'https://player.cnevids.com/inline/video/%s.js' % video_id, video_id, 'Downloading inline info', query={ @@ -215,7 +225,7 @@ class CondeNastIE(InfoExtractor): if url_type == 'series': return self._extract_series(url, webpage) else: - params = self._extract_video_params(webpage) + params = self._extract_video_params(webpage, display_id) info = self._search_json_ld( webpage, display_id, fatal=False) info.update(self._extract_video(params)) -- cgit v1.1 From 86e55e317cb70f07792cfe543186ad520cbe3230 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 15 Sep 2017 21:45:18 +0700 Subject: [ChangeLog] Actualize --- ChangeLog | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/ChangeLog b/ChangeLog index c286da6..38d5113 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,18 @@ +version + +Core +* [downloader/fragment] Restart inconsistent incomplete fragment downloads + (#13731) +* [YoutubeDL] Download raw subtitles files (#12909, #14191) + +Extractors +* [condenast] Fix extraction (#14196, #14207) ++ [orf] Add support for f4m stories +* [tv4] Relax URL regular expression (#14206) +* [animeondemand] Bypass geo restriction ++ [animeondemand] Add support for flash videos (#9944) + + version 2017.09.11 Extractors -- cgit v1.1 From 159d304a9fa2b91d91a60fe3bdf2211a59bcf346 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 15 Sep 2017 21:48:06 +0700 Subject: release 2017.09.15 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 1 + youtube_dl/version.py | 2 +- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index f40cb2c..98ab5b6 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.09.11*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.09.11** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.09.15*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.09.15** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2017.09.11 +[debug] youtube-dl version 2017.09.15 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index 38d5113..041dfd7 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2017.09.15 Core * [downloader/fragment] Restart inconsistent incomplete fragment downloads diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 798a81d..6b01dc9 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -593,6 +593,7 @@ - **Openload** - **OraTV** - **orf:fm4**: radio FM4 + - **orf:fm4:story**: fm4.orf.at stories - **orf:iptv**: iptv.ORF.at - **orf:oe1**: Radio Österreich 1 - **orf:tvthek**: ORF TVthek diff --git a/youtube_dl/version.py b/youtube_dl/version.py index cdcb32e..8399c04 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2017.09.11' +__version__ = '2017.09.15' -- cgit v1.1 From cbf85239bbb835162725cd4c8758831ca1003445 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 15 Sep 2017 22:13:30 +0700 Subject: [vgtv] Relax _VALID_URL (closes #14223) --- youtube_dl/extractor/vgtv.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/vgtv.py b/youtube_dl/extractor/vgtv.py index 0f8c156..c21a09c 100644 --- a/youtube_dl/extractor/vgtv.py +++ b/youtube_dl/extractor/vgtv.py @@ -42,7 +42,7 @@ class VGTVIE(XstreamIE): ) /? (?: - \#!/(?:video|live)/| + (?:\#!/)?(?:video|live)/| embed?.*id=| articles/ )| @@ -146,7 +146,11 @@ class VGTVIE(XstreamIE): { 'url': 'abtv:140026', 'only_matching': True, - } + }, + { + 'url': 'http://www.vgtv.no/video/84196/hevnen-er-soet-episode-10-abu', + 'only_matching': True, + }, ] def _real_extract(self, url): -- cgit v1.1 From b763e1d68c6becc414a802a452f5aa819c5de920 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 15 Sep 2017 22:18:38 +0700 Subject: [twitch] Add support for go.twitch.tv URLs (closes #14215) --- youtube_dl/extractor/twitch.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index 2daf9df..c926c99 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -28,7 +28,7 @@ from ..utils import ( class TwitchBaseIE(InfoExtractor): - _VALID_URL_BASE = r'https?://(?:www\.)?twitch\.tv' + _VALID_URL_BASE = r'https?://(?:(?:www|go)\.)?twitch\.tv' _API_BASE = 'https://api.twitch.tv' _USHER_BASE = 'https://usher.ttvnw.net' @@ -217,7 +217,7 @@ class TwitchVodIE(TwitchItemBaseIE): _VALID_URL = r'''(?x) https?:// (?: - (?:www\.)?twitch\.tv/(?:[^/]+/v|videos)/| + (?:(?:www|go)\.)?twitch\.tv/(?:[^/]+/v|videos)/| player\.twitch\.tv/\?.*?\bvideo=v ) (?P\d+) @@ -458,7 +458,7 @@ class TwitchStreamIE(TwitchBaseIE): _VALID_URL = r'''(?x) https?:// (?: - (?:www\.)?twitch\.tv/| + (?:(?:www|go)\.)?twitch\.tv/| player\.twitch\.tv/\?.*?\bchannel= ) (?P[^/#?]+) @@ -489,6 +489,9 @@ class TwitchStreamIE(TwitchBaseIE): }, { 'url': 'https://player.twitch.tv/?channel=lotsofs', 'only_matching': True, + }, { + 'url': 'https://go.twitch.tv/food', + 'only_matching': True, }] @classmethod -- cgit v1.1 From 6be44a50edfe2e75e31553e7a128ce1849301958 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 15 Sep 2017 22:25:38 +0700 Subject: [dailymotion:playlist] Relax _VALID_URL (closes #14219) --- youtube_dl/extractor/dailymotion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 74e9913..e9d0dd19 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -325,7 +325,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor): class DailymotionPlaylistIE(DailymotionBaseInfoExtractor): IE_NAME = 'dailymotion:playlist' - _VALID_URL = r'(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/playlist/(?P.+?)/' + _VALID_URL = r'(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/playlist/(?P[^/?#&]+)' _MORE_PAGES_INDICATOR = r'(?s)
.*? Date: Fri, 15 Sep 2017 23:12:19 +0700 Subject: [noovo] Fix extraction (closes #14214) --- youtube_dl/extractor/noovo.py | 61 +++++++++++++++++++++++-------------------- 1 file changed, 32 insertions(+), 29 deletions(-) diff --git a/youtube_dl/extractor/noovo.py b/youtube_dl/extractor/noovo.py index f7fa098..974de3c 100644 --- a/youtube_dl/extractor/noovo.py +++ b/youtube_dl/extractor/noovo.py @@ -6,6 +6,7 @@ from .common import InfoExtractor from ..compat import compat_str from ..utils import ( int_or_none, + js_to_json, smuggle_url, try_get, ) @@ -24,8 +25,6 @@ class NoovoIE(InfoExtractor): 'timestamp': 1491399228, 'upload_date': '20170405', 'uploader_id': '618566855001', - 'creator': 'vtele', - 'view_count': int, 'series': 'RPM+', }, 'params': { @@ -37,13 +36,11 @@ class NoovoIE(InfoExtractor): 'info_dict': { 'id': '5395865725001', 'title': 'Épisode 13 : Les retrouvailles', - 'description': 'md5:336d5ebc5436534e61d16e63ddfca327', + 'description': 'md5:888c3330f0c1b4476c5bc99a1c040473', 'ext': 'mp4', 'timestamp': 1492019320, 'upload_date': '20170412', 'uploader_id': '618566855001', - 'creator': 'vtele', - 'view_count': int, 'series': "L'amour est dans le pré", 'season_number': 5, 'episode': 'Épisode 13', @@ -58,40 +55,46 @@ class NoovoIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - data = self._download_json( - 'http://api.noovo.ca/api/v1/pages/single-episode/%s' % video_id, - video_id)['data'] + webpage = self._download_webpage(url, video_id) - content = try_get(data, lambda x: x['contents'][0]) + bc_url = BrightcoveNewIE._extract_url(self, webpage) - brightcove_id = data.get('brightcoveId') or content['brightcoveId'] + data = self._parse_json( + self._search_regex( + r'(?s)dataLayer\.push\(\s*({.+?})\s*\);', webpage, 'data', + default='{}'), + video_id, transform_source=js_to_json, fatal=False) + + title = try_get( + data, lambda x: x['video']['nom'], + compat_str) or self._html_search_meta( + 'dcterms.Title', webpage, 'title', fatal=True) + + description = self._html_search_meta( + ('dcterms.Description', 'description'), webpage, 'description') series = try_get( - data, ( - lambda x: x['show']['title'], - lambda x: x['season']['show']['title']), - compat_str) + data, lambda x: x['emission']['nom']) or self._search_regex( + r']+class="banner-card__subtitle h4"[^>]*>([^<]+)', + webpage, 'series', default=None) - episode = None - og = data.get('og') - if isinstance(og, dict) and og.get('type') == 'video.episode': - episode = og.get('title') + season_el = try_get(data, lambda x: x['emission']['saison'], dict) or {} + season = try_get(season_el, lambda x: x['nom'], compat_str) + season_number = int_or_none(try_get(season_el, lambda x: x['numero'])) - video = content or data + episode_el = try_get(season_el, lambda x: x['episode'], dict) or {} + episode = try_get(episode_el, lambda x: x['nom'], compat_str) + episode_number = int_or_none(try_get(episode_el, lambda x: x['numero'])) return { '_type': 'url_transparent', 'ie_key': BrightcoveNewIE.ie_key(), - 'url': smuggle_url( - self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, - {'geo_countries': ['CA']}), - 'id': brightcove_id, - 'title': video.get('title'), - 'creator': video.get('source'), - 'view_count': int_or_none(video.get('viewsCount')), + 'url': smuggle_url(bc_url, {'geo_countries': ['CA']}), + 'title': title, + 'description': description, 'series': series, - 'season_number': int_or_none(try_get( - data, lambda x: x['season']['seasonNumber'])), + 'season': season, + 'season_number': season_number, 'episode': episode, - 'episode_number': int_or_none(data.get('episodeNumber')), + 'episode_number': episode_number, } -- cgit v1.1 From 68d43a61b552007a718894967b869c0f1d8ff00f Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 16 Sep 2017 12:14:48 +0800 Subject: Ignore TTML subtitles --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index a5b585f..fbf7cec 100644 --- a/.gitignore +++ b/.gitignore @@ -22,6 +22,7 @@ cover/ updates_key.pem *.egg-info *.srt +*.ttml *.sbv *.vtt *.flv -- cgit v1.1 From 3869028ffb6be6ab719e5cf1004276dfdfd1216d Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 16 Sep 2017 12:18:38 +0800 Subject: [utils] Use bytes-like objects in dfxp2srt This fixes handling of non-UTF8 TTML subtitles Closes #14191 --- ChangeLog | 6 ++++++ test/test_utils.py | 26 +++++++++++++++++++++++--- youtube_dl/postprocessor/ffmpeg.py | 2 +- youtube_dl/utils.py | 18 +++++++++++------- 4 files changed, 41 insertions(+), 11 deletions(-) diff --git a/ChangeLog b/ChangeLog index 041dfd7..ba9260e 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,9 @@ +version + +Core +* [utils] Fix handling raw TTML subtitles (#14191) + + version 2017.09.15 Core diff --git a/test/test_utils.py b/test/test_utils.py index e50f376..efa73d0 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -1064,7 +1064,7 @@ ffmpeg version 2.4.4 Copyright (c) 2000-2014 the FFmpeg ...'''), '2.4.4')

Ignored, three

- ''' + '''.encode('utf-8') srt_data = '''1 00:00:00,000 --> 00:00:01,000 The following line contains Chinese characters and special symbols @@ -1089,7 +1089,7 @@ Line

The first line

- ''' + '''.encode('utf-8') srt_data = '''1 00:00:00,000 --> 00:00:01,000 The first line @@ -1115,7 +1115,7 @@ The first line

inner
style

-''' +'''.encode('utf-8') srt_data = '''1 00:00:02,080 --> 00:00:05,839 default stylecustom style @@ -1138,6 +1138,26 @@ part 3 ''' self.assertEqual(dfxp2srt(dfxp_data_with_style), srt_data) + dfxp_data_non_utf8 = ''' + + +
+

Line 1

+

第二行

+
+ +
'''.encode('utf-16') + srt_data = '''1 +00:00:00,000 --> 00:00:01,000 +Line 1 + +2 +00:00:01,000 --> 00:00:02,000 +第二行 + +''' + self.assertEqual(dfxp2srt(dfxp_data_non_utf8), srt_data) + def test_cli_option(self): self.assertEqual(cli_option({'proxy': '127.0.0.1:3128'}, '--proxy', 'proxy'), ['--proxy', '127.0.0.1:3128']) self.assertEqual(cli_option({'proxy': None}, '--proxy', 'proxy'), []) diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index 51256a3..f71d413 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -585,7 +585,7 @@ class FFmpegSubtitlesConvertorPP(FFmpegPostProcessor): dfxp_file = old_file srt_file = subtitles_filename(filename, lang, 'srt') - with io.open(dfxp_file, 'rt', encoding='utf-8') as f: + with open(dfxp_file, 'rb') as f: srt_data = dfxp2srt(f.read()) with io.open(srt_file, 'wt', encoding='utf-8') as f: diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 9e4492d..b724e0b 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2572,14 +2572,18 @@ def srt_subtitles_timecode(seconds): def dfxp2srt(dfxp_data): + ''' + @param dfxp_data A bytes-like object containing DFXP data + @returns A unicode object containing converted SRT data + ''' LEGACY_NAMESPACES = ( - ('http://www.w3.org/ns/ttml', [ - 'http://www.w3.org/2004/11/ttaf1', - 'http://www.w3.org/2006/04/ttaf1', - 'http://www.w3.org/2006/10/ttaf1', + (b'http://www.w3.org/ns/ttml', [ + b'http://www.w3.org/2004/11/ttaf1', + b'http://www.w3.org/2006/04/ttaf1', + b'http://www.w3.org/2006/10/ttaf1', ]), - ('http://www.w3.org/ns/ttml#styling', [ - 'http://www.w3.org/ns/ttml#style', + (b'http://www.w3.org/ns/ttml#styling', [ + b'http://www.w3.org/ns/ttml#style', ]), ) @@ -2674,7 +2678,7 @@ def dfxp2srt(dfxp_data): for ns in v: dfxp_data = dfxp_data.replace(ns, k) - dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8')) + dfxp = compat_etree_fromstring(dfxp_data) out = [] paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p') -- cgit v1.1 From 790d379e4df2f85ece7cab02e805643234bb5c16 Mon Sep 17 00:00:00 2001 From: Windom Date: Sat, 16 Sep 2017 18:39:46 +0300 Subject: [morningstar] Relax _VALID_URL --- youtube_dl/extractor/morningstar.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/morningstar.py b/youtube_dl/extractor/morningstar.py index 320d27b..0093bcd 100644 --- a/youtube_dl/extractor/morningstar.py +++ b/youtube_dl/extractor/morningstar.py @@ -8,8 +8,8 @@ from .common import InfoExtractor class MorningstarIE(InfoExtractor): IE_DESC = 'morningstar.com' - _VALID_URL = r'https?://(?:www\.)?morningstar\.com/[cC]over/video[cC]enter\.aspx\?id=(?P[0-9]+)' - _TEST = { + _VALID_URL = r'https?://(?:(?:www|news)\.)morningstar\.com/[cC]over/video[cC]enter\.aspx\?id=(?P[0-9]+)' + _TESTS = [{ 'url': 'http://www.morningstar.com/cover/videocenter.aspx?id=615869', 'md5': '6c0acface7a787aadc8391e4bbf7b0f5', 'info_dict': { @@ -19,7 +19,10 @@ class MorningstarIE(InfoExtractor): 'description': "Vanguard's Joel Dickson on managing higher tax rates for high-income earners and fund capital-gain distributions in 2013.", 'thumbnail': r're:^https?://.*m(?:orning)?star\.com/.+thumb\.jpg$' } - } + }, { + 'url': 'http://news.morningstar.com/cover/videocenter.aspx?id=825556', + 'only_matching': True, + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) -- cgit v1.1 From 8251af63a12cd73cf2578c81dbb869232da2592c Mon Sep 17 00:00:00 2001 From: Vijay Singh Date: Sat, 16 Sep 2017 21:15:23 +0530 Subject: [viki] Update app data (closes #14181) --- youtube_dl/extractor/viki.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py index e9c8bf8..853e5c7 100644 --- a/youtube_dl/extractor/viki.py +++ b/youtube_dl/extractor/viki.py @@ -23,9 +23,9 @@ class VikiBaseIE(InfoExtractor): _API_QUERY_TEMPLATE = '/v4/%sapp=%s&t=%s&site=www.viki.com' _API_URL_TEMPLATE = 'http://api.viki.io%s&sig=%s' - _APP = '65535a' + _APP = '100005a' _APP_VERSION = '2.2.5.1428709186' - _APP_SECRET = '-$iJ}@p7!G@SyU/je1bEyWg}upLu-6V6-Lg9VD(]siH,r.,m-r|ulZ,U4LC/SeR)' + _APP_SECRET = 'MM_d*yP@`&1@]@!AVrXf_o-HVEnoTnm$O-ti4[G~$JDI/Dc-&piU&z&5.;:}95=Iad' _GEO_BYPASS = False _NETRC_MACHINE = 'viki' -- cgit v1.1 From 4ed2d7b7d1f67e499a46e507d957616e364565ca Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sun, 17 Sep 2017 13:53:04 +0800 Subject: Fix flake8 issues after #14225 --- youtube_dl/extractor/common.py | 2 +- youtube_dl/utils.py | 33 +++++++++++++++++---------------- 2 files changed, 18 insertions(+), 17 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 317a9a7..2bbbf8f 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -2452,7 +2452,7 @@ class InfoExtractor(object): def _set_cookie(self, domain, name, value, expire_time=None, port=None, path='/', secure=False, discard=False, rest={}, **kwargs): cookie = compat_cookiejar.Cookie( - 0, name, value, port, not port is None, domain, True, + 0, name, value, port, port is not None, domain, True, domain.startswith('.'), path, True, secure, expire_time, discard, None, None, rest) self._downloader.cookiejar.set_cookie(cookie) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index b724e0b..acc4f98 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -3830,23 +3830,23 @@ def cookie_to_dict(cookie): cookie_dict = { 'name': cookie.name, 'value': cookie.value, - }; + } if cookie.port_specified: cookie_dict['port'] = cookie.port if cookie.domain_specified: cookie_dict['domain'] = cookie.domain if cookie.path_specified: cookie_dict['path'] = cookie.path - if not cookie.expires is None: + if cookie.expires is not None: cookie_dict['expires'] = cookie.expires - if not cookie.secure is None: + if cookie.secure is not None: cookie_dict['secure'] = cookie.secure - if not cookie.discard is None: + if cookie.discard is not None: cookie_dict['discard'] = cookie.discard try: if (cookie.has_nonstandard_attr('httpOnly') or - cookie.has_nonstandard_attr('httponly') or - cookie.has_nonstandard_attr('HttpOnly')): + cookie.has_nonstandard_attr('httponly') or + cookie.has_nonstandard_attr('HttpOnly')): cookie_dict['httponly'] = True except TypeError: pass @@ -3957,7 +3957,7 @@ class PhantomJSwrapper(object): cookies = json.loads(f.read().decode('utf-8')) for cookie in cookies: if cookie['httponly'] is True: - cookie['rest'] = { 'httpOnly': None } + cookie['rest'] = {'httpOnly': None} if 'expiry' in cookie: cookie['expire_time'] = cookie['expiry'] self.extractor._set_cookie(**cookie) @@ -3965,7 +3965,7 @@ class PhantomJSwrapper(object): def get(self, url, html=None, video_id=None, note=None, note2='Executing JS on webpage', headers={}, jscode='saveAndExit();'): """ Downloads webpage (if needed) and executes JS - + Params: url: website url html: optional, html code of website @@ -3974,11 +3974,11 @@ class PhantomJSwrapper(object): note2: optional, displayed when executing JS headers: custom http headers jscode: code to be executed when page is loaded - + Returns tuple with: * downloaded website (after JS execution) * anything you print with `console.log` (but not inside `page.execute`!) - + In most cases you don't need to add any `jscode`. It is executed in `page.onLoadFinished`. `saveAndExit();` is mandatory, use it instead of `phantom.exit()` @@ -3992,7 +3992,7 @@ class PhantomJSwrapper(object): else window.setTimeout(check, 500); } - + page.evaluate(function(){ document.querySelector('#a').click(); }); @@ -4024,13 +4024,14 @@ class PhantomJSwrapper(object): else: self.extractor.to_screen('%s: %s' % (video_id, note2)) - p = subprocess.Popen([self.exe, '--ssl-protocol=any', - self._TMP_FILES['script'].name], stdout=subprocess.PIPE, - stderr=subprocess.PIPE) + p = subprocess.Popen([ + self.exe, '--ssl-protocol=any', + self._TMP_FILES['script'].name + ], stdout=subprocess.PIPE, stderr=subprocess.PIPE) out, err = p.communicate() if p.returncode != 0: - raise ExtractorError('Executing JS failed\n:' - + encodeArgument(err)) + raise ExtractorError( + 'Executing JS failed\n:' + encodeArgument(err)) with open(self._TMP_FILES['html'].name, 'rb') as f: html = f.read().decode('utf-8') -- cgit v1.1 From 9c2a17f2ce7b2b9dc45b603be413a943f6637498 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 17 Sep 2017 22:19:57 +0700 Subject: [popcorntv] Add extractor (closes #5914, closes #14211) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/popcorntv.py | 78 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+) create mode 100644 youtube_dl/extractor/popcorntv.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index a3a97e9..ab95c85 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -808,6 +808,7 @@ from .polskieradio import ( PolskieRadioIE, PolskieRadioCategoryIE, ) +from .popcorntv import PopcornTVIE from .porn91 import Porn91IE from .porncom import PornComIE from .pornflip import PornFlipIE diff --git a/youtube_dl/extractor/popcorntv.py b/youtube_dl/extractor/popcorntv.py new file mode 100644 index 0000000..ac901f4 --- /dev/null +++ b/youtube_dl/extractor/popcorntv.py @@ -0,0 +1,78 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + extract_attributes, + int_or_none, + unified_timestamp, +) + + +class PopcornTVIE(InfoExtractor): + _VALID_URL = r'https?://[^/]+\.popcorntv\.it/guarda/(?P[^/]+)/(?P\d+)' + _TESTS = [{ + 'url': 'https://animemanga.popcorntv.it/guarda/food-wars-battaglie-culinarie-episodio-01/9183', + 'md5': '47d65a48d147caf692ab8562fe630b45', + 'info_dict': { + 'id': '9183', + 'display_id': 'food-wars-battaglie-culinarie-episodio-01', + 'ext': 'mp4', + 'title': 'Food Wars, Battaglie Culinarie | Episodio 01', + 'description': 'md5:b8bea378faae4651d3b34c6e112463d0', + 'thumbnail': r're:^https?://.*\.jpg$', + 'timestamp': 1497610857, + 'upload_date': '20170616', + 'duration': 1440, + 'view_count': int, + }, + }, { + 'url': 'https://cinema.popcorntv.it/guarda/smash-cut/10433', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + display_id, video_id = mobj.group('display_id', 'id') + + webpage = self._download_webpage(url, display_id) + + m3u8_url = extract_attributes( + self._search_regex( + r'(]+itemprop=["\'](?:content|embed)Url[^>]*>)', + webpage, 'content' + ))['href'] + + formats = self._extract_m3u8_formats( + m3u8_url, display_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls') + + title = self._search_regex( + r']+itemprop=["\']name[^>]*>([^<]+)', webpage, + 'title', default=None) or self._og_search_title(webpage) + + description = self._html_search_regex( + r'(?s)]+itemprop=["\']description[^>]*>(.+?)', + webpage, 'description', fatal=False) + thumbnail = self._og_search_thumbnail(webpage) + timestamp = unified_timestamp(self._html_search_meta( + 'uploadDate', webpage, 'timestamp')) + print(self._html_search_meta( + 'duration', webpage)) + duration = int_or_none(self._html_search_meta( + 'duration', webpage), invscale=60) + view_count = int_or_none(self._html_search_meta( + 'interactionCount', webpage, 'view count')) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'duration': duration, + 'view_count': view_count, + 'formats': formats, + } -- cgit v1.1 From 4d8c4b46d5668387cc685123f80fb64bbc7c5aff Mon Sep 17 00:00:00 2001 From: kayb94 <30302445+kayb94@users.noreply.github.com> Date: Sun, 17 Sep 2017 15:46:52 +0000 Subject: [heise] Add support for YouTube embeds --- youtube_dl/extractor/heise.py | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/heise.py b/youtube_dl/extractor/heise.py index 382f327..495ffb7 100644 --- a/youtube_dl/extractor/heise.py +++ b/youtube_dl/extractor/heise.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals from .common import InfoExtractor +from .youtube import YoutubeIE from ..utils import ( determine_ext, int_or_none, @@ -26,6 +27,22 @@ class HeiseIE(InfoExtractor): 'thumbnail': r're:^https?://.*/gallery/$', } }, { + # YouTube embed + 'url': 'http://www.heise.de/newsticker/meldung/Netflix-In-20-Jahren-vom-Videoverleih-zum-TV-Revolutionaer-3814130.html', + 'md5': 'e403d2b43fea8e405e88e3f8623909f1', + 'info_dict': { + 'id': '6kmWbXleKW4', + 'ext': 'mp4', + 'title': 'NEU IM SEPTEMBER | Netflix', + 'description': 'md5:2131f3c7525e540d5fd841de938bd452', + 'upload_date': '20170830', + 'uploader': 'Netflix Deutschland, Österreich und Schweiz', + 'uploader_id': 'netflixdach', + }, + 'params': { + 'skip_download': True, + }, + }, { 'url': 'http://www.heise.de/ct/artikel/c-t-uplink-3-3-Owncloud-Tastaturen-Peilsender-Smartphone-2403911.html', 'only_matching': True, }, { @@ -40,6 +57,16 @@ class HeiseIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) + title = self._html_search_meta('fulltitle', webpage, default=None) + if not title or title == "c't": + title = self._search_regex( + r']+class="videoplayerjw"[^>]+data-title="([^"]+)"', + webpage, 'title') + + yt_urls = YoutubeIE._extract_urls(webpage) + if yt_urls: + return self.playlist_from_matches(yt_urls, video_id, title, ie=YoutubeIE.ie_key()) + container_id = self._search_regex( r'
]+data-container="([0-9]+)"', webpage, 'container ID') @@ -47,12 +74,6 @@ class HeiseIE(InfoExtractor): r'
]+data-sequenz="([0-9]+)"', webpage, 'sequenz ID') - title = self._html_search_meta('fulltitle', webpage, default=None) - if not title or title == "c't": - title = self._search_regex( - r']+class="videoplayerjw"[^>]+data-title="([^"]+)"', - webpage, 'title') - doc = self._download_xml( 'http://www.heise.de/videout/feed', video_id, query={ 'container': container_id, -- cgit v1.1 From 8a1a60d17397721620e75d83f2aad3a353286f15 Mon Sep 17 00:00:00 2001 From: Kareem Moussa Date: Tue, 19 Sep 2017 08:51:20 -0700 Subject: [devscripts/check-porn] Fix gettestcases import --- devscripts/check-porn.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/devscripts/check-porn.py b/devscripts/check-porn.py index 7a219eb..72b2ee4 100644 --- a/devscripts/check-porn.py +++ b/devscripts/check-porn.py @@ -14,7 +14,7 @@ import os import sys sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from test.helper import get_testcases +from test.helper import gettestcases from youtube_dl.utils import compat_urllib_parse_urlparse from youtube_dl.utils import compat_urllib_request @@ -24,7 +24,7 @@ if len(sys.argv) > 1: else: METHOD = 'EURISTIC' -for test in get_testcases(): +for test in gettestcases(): if METHOD == 'EURISTIC': try: webpage = compat_urllib_request.urlopen(test['url'], timeout=10).read() -- cgit v1.1 From dc76eef092dc10f7e3f599fa7d85a04de8d84b8f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 19 Sep 2017 23:59:36 +0700 Subject: [tvplay] Bypass geo restriction --- youtube_dl/extractor/tvplay.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tvplay.py b/youtube_dl/extractor/tvplay.py index 99ff82a..46132ed 100644 --- a/youtube_dl/extractor/tvplay.py +++ b/youtube_dl/extractor/tvplay.py @@ -15,7 +15,9 @@ from ..utils import ( int_or_none, parse_iso8601, qualities, + smuggle_url, try_get, + unsmuggle_url, update_url_query, ) @@ -224,6 +226,9 @@ class TVPlayIE(InfoExtractor): ] def _real_extract(self, url): + url, smuggled_data = unsmuggle_url(url, {}) + self._initialize_geo_bypass(smuggled_data.get('geo_countries')) + video_id = self._match_id(url) geo_country = self._search_regex( r'https?://[^/]+\.([a-z]{2})', url, @@ -426,4 +431,9 @@ class ViafreeIE(InfoExtractor): r'currentVideo["\']\s*:\s*.+?["\']id["\']\s*:\s*["\'](\d{6,})', webpage, 'video id') - return self.url_result('mtg:%s' % video_id, TVPlayIE.ie_key()) + return self.url_result( + smuggle_url( + 'mtg:%s' % video_id, + {'geo_countries': [ + compat_urlparse.urlparse(url).netloc.rsplit('.', 1)[-1]]}), + ie=TVPlayIE.ie_key(), video_id=video_id) -- cgit v1.1 From 3b65a6fbf31256030ff35210a7be2b50369a6c4f Mon Sep 17 00:00:00 2001 From: capital-G Date: Tue, 19 Sep 2017 22:58:06 +0200 Subject: [twitter] Fix duration extraction --- youtube_dl/extractor/twitter.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index 6eaf360..7399cf5 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -229,7 +229,7 @@ class TwitterCardIE(TwitterBaseIE): title = self._search_regex(r'([^<]+)', webpage, 'title') thumbnail = config.get('posterImageUrl') or config.get('image_src') - duration = float_or_none(config.get('duration')) or duration + duration = float_or_none(config.get('duration'), scale=1000) or duration return { 'id': video_id, @@ -255,6 +255,7 @@ class TwitterIE(InfoExtractor): 'description': 'FREE THE NIPPLE on Twitter: "FTN supporters on Hollywood Blvd today! http://t.co/c7jHH749xJ"', 'uploader': 'FREE THE NIPPLE', 'uploader_id': 'freethenipple', + 'duration': 12.922, }, 'params': { 'skip_download': True, # requires ffmpeg @@ -305,11 +306,12 @@ class TwitterIE(InfoExtractor): 'info_dict': { 'id': '700207533655363584', 'ext': 'mp4', - 'title': 'Donte - BEAT PROD: @suhmeduh #Damndaniel', - 'description': 'Donte on Twitter: "BEAT PROD: @suhmeduh https://t.co/HBrQ4AfpvZ #Damndaniel https://t.co/byBooq2ejZ"', + 'title': 'あかさ - BEAT PROD: @suhmeduh #Damndaniel', + 'description': 'あかさ on Twitter: "BEAT PROD: @suhmeduh https://t.co/HBrQ4AfpvZ #Damndaniel https://t.co/byBooq2ejZ"', 'thumbnail': r're:^https?://.*\.jpg', - 'uploader': 'Donte', + 'uploader': 'あかさ', 'uploader_id': 'jaydingeer', + 'duration': 30.0, }, 'params': { 'skip_download': True, # requires ffmpeg @@ -337,6 +339,7 @@ class TwitterIE(InfoExtractor): 'description': 'Captain America on Twitter: "@King0fNerd Are you sure you made the right choice? Find out in theaters. https://t.co/GpgYi9xMJI"', 'uploader_id': 'captainamerica', 'uploader': 'Captain America', + 'duration': 3.17, }, 'params': { 'skip_download': True, # requires ffmpeg @@ -364,6 +367,7 @@ class TwitterIE(InfoExtractor): 'description': 'عالم الأخبار on Twitter: "كلمة تاريخية بجلسة الجناسي التاريخية.. النائب خالد مؤنس العتيبي للمعارضين : اتقوا الله .. الظلم ظلمات يوم القيامة https://t.co/xg6OhpyKfN"', 'uploader': 'عالم الأخبار', 'uploader_id': 'news_al3alm', + 'duration': 277.4, }, 'params': { 'format': 'best[format_id^=http-]', -- cgit v1.1 From 12ea5c79fb0bfa878d62d130cf67057fc230dfa7 Mon Sep 17 00:00:00 2001 From: Parmjit Virk Date: Wed, 20 Sep 2017 14:53:06 -0500 Subject: [nbcsports:vplayer] Correct theplatform URL (closes #13873) --- youtube_dl/extractor/nbc.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index 62db70b..836a41f 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -109,10 +109,10 @@ class NBCSportsVPlayerIE(InfoExtractor): _VALID_URL = r'https?://vplayer\.nbcsports\.com/(?:[^/]+/)+(?P[0-9a-zA-Z_]+)' _TESTS = [{ - 'url': 'https://vplayer.nbcsports.com/p/BxmELC/nbcsports_share/select/9CsDKds0kvHI', + 'url': 'https://vplayer.nbcsports.com/p/BxmELC/nbcsports_embed/select/9CsDKds0kvHI', 'info_dict': { 'id': '9CsDKds0kvHI', - 'ext': 'flv', + 'ext': 'mp4', 'description': 'md5:df390f70a9ba7c95ff1daace988f0d8d', 'title': 'Tyler Kalinoski hits buzzer-beater to lift Davidson', 'timestamp': 1426270238, @@ -120,7 +120,7 @@ class NBCSportsVPlayerIE(InfoExtractor): 'uploader': 'NBCU-SPORTS', } }, { - 'url': 'http://vplayer.nbcsports.com/p/BxmELC/nbc_embedshare/select/_hqLjQ95yx8Z', + 'url': 'https://vplayer.nbcsports.com/p/BxmELC/nbcsports_embed/select/media/_hqLjQ95yx8Z', 'only_matching': True, }] @@ -134,7 +134,8 @@ class NBCSportsVPlayerIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - theplatform_url = self._og_search_video_url(webpage) + theplatform_url = self._og_search_video_url(webpage).replace( + 'vplayer.nbcsports.com', 'player.theplatform.com') return self.url_result(theplatform_url, 'ThePlatform') -- cgit v1.1 From f6ff52b473c9ed969fadb3e3d50852c4a27ba17e Mon Sep 17 00:00:00 2001 From: Giuseppe Fabiano Date: Wed, 20 Sep 2017 23:05:33 +0200 Subject: [beeg] Fix extraction (closes #14275) --- youtube_dl/extractor/beeg.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/beeg.py b/youtube_dl/extractor/beeg.py index d5c5822..bbeae4b 100644 --- a/youtube_dl/extractor/beeg.py +++ b/youtube_dl/extractor/beeg.py @@ -9,6 +9,7 @@ from ..compat import ( from ..utils import ( int_or_none, parse_iso8601, + urljoin, ) @@ -36,9 +37,11 @@ class BeegIE(InfoExtractor): webpage = self._download_webpage(url, video_id) cpl_url = self._search_regex( - r']+src=(["\'])(?P(?:https?:)?//static\.beeg\.com/cpl/\d+\.js.*?)\1', + r']+src=(["\'])(?P(?:/static|(?:https?:)?//static\.beeg\.com)/cpl/\d+\.js.*?)\1', webpage, 'cpl', default=None, group='url') + cpl_url = urljoin(url, cpl_url) + beeg_version, beeg_salt = [None] * 2 if cpl_url: @@ -54,7 +57,7 @@ class BeegIE(InfoExtractor): r'beeg_salt\s*=\s*(["\'])(?P.+?)\1', cpl, 'beeg salt', default=None, group='beeg_salt') - beeg_version = beeg_version or '2000' + beeg_version = beeg_version or '2185' beeg_salt = beeg_salt or 'pmweAkq8lAYKdfWcFCUj0yoVgoPlinamH5UE1CB3H' video = self._download_json( -- cgit v1.1 From 8c6919e4331e1cd44f50e700e8fc4e630d913a2f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 21 Sep 2017 23:00:35 +0700 Subject: [lynda] Add support for educourse.ga (closes #14286) --- youtube_dl/extractor/lynda.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py index d2f7529..1b6f509 100644 --- a/youtube_dl/extractor/lynda.py +++ b/youtube_dl/extractor/lynda.py @@ -94,7 +94,7 @@ class LyndaBaseIE(InfoExtractor): class LyndaIE(LyndaBaseIE): IE_NAME = 'lynda' IE_DESC = 'lynda.com videos' - _VALID_URL = r'https?://(?:www\.)?lynda\.com/(?:[^/]+/[^/]+/(?P\d+)|player/embed)/(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?(?:lynda\.com|educourse\.ga)/(?:[^/]+/[^/]+/(?P\d+)|player/embed)/(?P\d+)' _TIMECODE_REGEX = r'\[(?P\d+:\d+:\d+[\.,]\d+)\]' @@ -110,6 +110,9 @@ class LyndaIE(LyndaBaseIE): }, { 'url': 'https://www.lynda.com/player/embed/133770?tr=foo=1;bar=g;fizz=rt&fs=0', 'only_matching': True, + }, { + 'url': 'https://educourse.ga/Bootstrap-tutorials/Using-exercise-files/110885/114408-4.html', + 'only_matching': True, }] def _raise_unavailable(self, video_id): @@ -253,7 +256,7 @@ class LyndaCourseIE(LyndaBaseIE): # Course link equals to welcome/introduction video link of same course # We will recognize it as course link - _VALID_URL = r'https?://(?:www|m)\.lynda\.com/(?P[^/]+/[^/]+/(?P\d+))-\d\.html' + _VALID_URL = r'https?://(?:www|m)\.(?:lynda\.com|educourse\.ga)/(?P[^/]+/[^/]+/(?P\d+))-\d\.html' def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) -- cgit v1.1 From 8c2895305dc09920055611c8120f5a65fcd2614f Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 23 Sep 2017 02:30:03 +0800 Subject: [options] Accept lrc as a subtitle conversion target format (closes #14292) --- ChangeLog | 1 + youtube_dl/__init__.py | 2 +- youtube_dl/options.py | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/ChangeLog b/ChangeLog index ba9260e..42ba879 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,6 +1,7 @@ version Core ++ [options] Accept lrc as a subtitle conversion target format (#14292) * [utils] Fix handling raw TTML subtitles (#14191) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index c458941..ba684a0 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -206,7 +206,7 @@ def _real_main(argv=None): if opts.recodevideo not in ['mp4', 'flv', 'webm', 'ogg', 'mkv', 'avi']: parser.error('invalid video recode format specified') if opts.convertsubtitles is not None: - if opts.convertsubtitles not in ['srt', 'vtt', 'ass']: + if opts.convertsubtitles not in ['srt', 'vtt', 'ass', 'lrc']: parser.error('invalid subtitle format specified') if opts.date is not None: diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 38439c9..4c04550 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -847,7 +847,7 @@ def parseOpts(overrideArguments=None): postproc.add_option( '--convert-subs', '--convert-subtitles', metavar='FORMAT', dest='convertsubtitles', default=None, - help='Convert the subtitles to other format (currently supported: srt|ass|vtt)') + help='Convert the subtitles to other format (currently supported: srt|ass|vtt|lrc)') parser.add_option_group(general) parser.add_option_group(network) -- cgit v1.1 From 2384f5a64e501d7abb844e8d31fe340b34d8d4e7 Mon Sep 17 00:00:00 2001 From: Tatsuyuki Ishi Date: Wed, 6 Sep 2017 11:24:34 +0900 Subject: [mixcloud] Fix extraction (closes #14088) --- youtube_dl/compat.py | 10 ++- youtube_dl/extractor/mixcloud.py | 167 ++++++++++++++++++++++++--------------- 2 files changed, 114 insertions(+), 63 deletions(-) diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 9e4e13b..2a62248 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -6,6 +6,7 @@ import collections import email import getpass import io +import itertools import optparse import os import re @@ -15,7 +16,6 @@ import socket import struct import subprocess import sys -import itertools import xml.etree.ElementTree @@ -2898,6 +2898,13 @@ else: compat_struct_pack = struct.pack compat_struct_unpack = struct.unpack +try: + from future_builtins import zip as compat_zip +except ImportError: # not 2.6+ or is 3.x + try: + from itertools import izip as compat_zip # < 2.5 or 3.x + except ImportError: + compat_zip = zip __all__ = [ 'compat_HTMLParseError', @@ -2948,5 +2955,6 @@ __all__ = [ 'compat_urlretrieve', 'compat_xml_parse_error', 'compat_xpath', + 'compat_zip', 'workaround_optparse_bug9161', ] diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index f6360cc..4811823 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -9,16 +9,16 @@ from .common import InfoExtractor from ..compat import ( compat_chr, compat_ord, - compat_str, compat_urllib_parse_unquote, compat_urlparse, + compat_zip ) from ..utils import ( clean_html, ExtractorError, OnDemandPagedList, str_to_int, -) + try_get) class MixcloudIE(InfoExtractor): @@ -54,27 +54,19 @@ class MixcloudIE(InfoExtractor): 'only_matching': True, }] - _keys = [ - 'return { requestAnimationFrame: function(callback) { callback(); }, innerHeight: 500 };', - 'pleasedontdownloadourmusictheartistswontgetpaid', - 'window.addEventListener = window.addEventListener || function() {};', - '(function() { return new Date().toLocaleDateString(); })()' - ] - _current_key = None - - # See https://www.mixcloud.com/media/js2/www_js_2.9e23256562c080482435196ca3975ab5.js - def _decrypt_play_info(self, play_info, video_id): - play_info = base64.b64decode(play_info.encode('ascii')) - for num, key in enumerate(self._keys, start=1): - try: - return self._parse_json( - ''.join([ - compat_chr(compat_ord(ch) ^ compat_ord(key[idx % len(key)])) - for idx, ch in enumerate(play_info)]), - video_id) - except ExtractorError: - if num == len(self._keys): - raise + @staticmethod + def _decrypt_xor_cipher(key, ciphertext): + """Encrypt/Decrypt XOR cipher. Both ways are possible because it's XOR.""" + return ''.join([ + compat_chr(compat_ord(ch) ^ compat_ord(k)) + for ch, k in compat_zip(ciphertext, itertools.cycle(key))]) + + @staticmethod + def _decrypt_and_extend(stream_info, url_key, getter, key, formats): + maybe_url = stream_info.get(url_key) + if maybe_url is not None: + decrypted = MixcloudIE._decrypt_xor_cipher(key, base64.b64decode(maybe_url)) + formats.extend(getter(decrypted)) def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -84,54 +76,105 @@ class MixcloudIE(InfoExtractor): webpage = self._download_webpage(url, track_id) - if not self._current_key: - js_url = self._search_regex( - r']+\bsrc=["\"](https://(?:www\.)?mixcloud\.com/media/js2/www_js_4\.[^>]+\.js)', - webpage, 'js url', default=None) - if js_url: - js = self._download_webpage(js_url, track_id, fatal=False) - if js: - KEY_RE_TEMPLATE = r'player\s*:\s*{.*?\b%s\s*:\s*(["\'])(?P(?:(?!\1).)+)\1' - for key_name in ('value', 'key_value', 'key_value.*?', '.*?value.*?'): - key = self._search_regex( - KEY_RE_TEMPLATE % key_name, js, 'key', - default=None, group='key') - if key and isinstance(key, compat_str): - self._keys.insert(0, key) - self._current_key = key + # Legacy path + encrypted_play_info = self._search_regex( + r'm-play-info="([^"]+)"', webpage, 'play info', default=None) + + if encrypted_play_info is not None: + # Decode + encrypted_play_info = base64.b64decode(encrypted_play_info) + else: + # New path + full_info_json = self._parse_json(self._html_search_regex( + r'', webpage, 'play info'), 'play info') + for item in full_info_json: + item_data = try_get(item, lambda x: x['cloudcast']['data']['cloudcastLookup']) + if try_get(item_data, lambda x: x['streamInfo']['url']): + info_json = item_data + break + else: + raise ExtractorError('Failed to extract matching stream info') message = self._html_search_regex( r'(?s)]+class="global-message cloudcast-disabled-notice-light"[^>]*>(.+?)<(?:a|/div)', webpage, 'error message', default=None) - encrypted_play_info = self._search_regex( - r'm-play-info="([^"]+)"', webpage, 'play info') - - play_info = self._decrypt_play_info(encrypted_play_info, track_id) - - if message and 'stream_url' not in play_info: - raise ExtractorError('%s said: %s' % (self.IE_NAME, message), expected=True) - - song_url = play_info['stream_url'] - - title = self._html_search_regex(r'm-title="([^"]+)"', webpage, 'title') - thumbnail = self._proto_relative_url(self._html_search_regex( - r'm-thumbnail-url="([^"]+)"', webpage, 'thumbnail', fatal=False)) - uploader = self._html_search_regex( - r'm-owner-name="([^"]+)"', webpage, 'uploader', fatal=False) - uploader_id = self._search_regex( - r'\s+"profile": "([^"]+)",', webpage, 'uploader id', fatal=False) - description = self._og_search_description(webpage) - view_count = str_to_int(self._search_regex( - [r'([0-9,.]+)', - r'(?:m|data)-tooltip=["\']([\d,.]+) plays'], - webpage, 'play count', default=None)) + js_url = self._search_regex( + r']+\bsrc=["\"](https://(?:www\.)?mixcloud\.com/media/js2/www_js_4\.[^>]+\.js)', + webpage, 'js url', default=None) + if js_url is None: + js_url = self._search_regex( + r']+\bsrc=["\"](https://(?:www\.)?mixcloud\.com/media/js/www\.[^>]+\.js)', + webpage, 'js url') + js = self._download_webpage(js_url, track_id) + # Known plaintext attack + if encrypted_play_info: + kps = ['{"stream_url":'] + kpa_target = encrypted_play_info + else: + kps = ['https://', 'http://'] + kpa_target = base64.b64decode(info_json['streamInfo']['url']) + for kp in kps: + partial_key = self._decrypt_xor_cipher(kpa_target, kp) + for quote in ["'", '"']: + key = self._search_regex(r'{0}({1}[^{0}]*){0}'.format(quote, re.escape(partial_key)), js, + "encryption key", default=None) + if key is not None: + break + else: + continue + break + else: + raise ExtractorError('Failed to extract encryption key') + + if encrypted_play_info is not None: + play_info = self._parse_json(self._decrypt_xor_cipher(key, encrypted_play_info), 'play info') + if message and 'stream_url' not in play_info: + raise ExtractorError('%s said: %s' % (self.IE_NAME, message), expected=True) + song_url = play_info['stream_url'] + formats = [{ + 'format_id': 'normal', + 'url': song_url + }] + + title = self._html_search_regex(r'm-title="([^"]+)"', webpage, 'title') + thumbnail = self._proto_relative_url(self._html_search_regex( + r'm-thumbnail-url="([^"]+)"', webpage, 'thumbnail', fatal=False)) + uploader = self._html_search_regex( + r'm-owner-name="([^"]+)"', webpage, 'uploader', fatal=False) + uploader_id = self._search_regex( + r'\s+"profile": "([^"]+)",', webpage, 'uploader id', fatal=False) + description = self._og_search_description(webpage) + view_count = str_to_int(self._search_regex( + [r'([0-9,.]+)', + r'(?:m|data)-tooltip=["\']([\d,.]+) plays'], + webpage, 'play count', default=None)) + + else: + title = info_json['name'] + thumbnail = try_get(info_json, + lambda x: 'https://thumbnailer.mixcloud.com/unsafe/600x600/' + x['picture']['urlRoot']) + uploader = try_get(info_json, lambda x: x['owner']['displayName']) + uploader_id = try_get(info_json, lambda x: x['owner']['username']) + description = try_get(info_json, lambda x: x['description']) + view_count = try_get(info_json, lambda x: x['plays']) + + stream_info = info_json['streamInfo'] + formats = [] + self._decrypt_and_extend(stream_info, 'url', lambda x: [{ + 'format_id': 'normal', + 'url': x + }], key, formats) + self._decrypt_and_extend(stream_info, 'hlsUrl', lambda x: self._extract_m3u8_formats(x, title), key, + formats) + self._decrypt_and_extend(stream_info, 'dashUrl', lambda x: self._extract_mpd_formats(x, title), key, + formats) return { 'id': track_id, 'title': title, - 'url': song_url, + 'formats': formats, 'description': description, 'thumbnail': thumbnail, 'uploader': uploader, -- cgit v1.1 From 095774e59130c999ed8ce132f80a7164c5ee39a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 23 Sep 2017 05:35:55 +0700 Subject: [mixcloud] Improve and simplify (closes #14132) --- youtube_dl/extractor/mixcloud.py | 71 +++++++++++++++++++++++----------------- 1 file changed, 41 insertions(+), 30 deletions(-) diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index 4811823..f331db8 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -9,6 +9,7 @@ from .common import InfoExtractor from ..compat import ( compat_chr, compat_ord, + compat_str, compat_urllib_parse_unquote, compat_urlparse, compat_zip @@ -16,9 +17,12 @@ from ..compat import ( from ..utils import ( clean_html, ExtractorError, + int_or_none, OnDemandPagedList, str_to_int, - try_get) + try_get, + urljoin, +) class MixcloudIE(InfoExtractor): @@ -61,13 +65,6 @@ class MixcloudIE(InfoExtractor): compat_chr(compat_ord(ch) ^ compat_ord(k)) for ch, k in compat_zip(ciphertext, itertools.cycle(key))]) - @staticmethod - def _decrypt_and_extend(stream_info, url_key, getter, key, formats): - maybe_url = stream_info.get(url_key) - if maybe_url is not None: - decrypted = MixcloudIE._decrypt_xor_cipher(key, base64.b64decode(maybe_url)) - formats.extend(getter(decrypted)) - def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) uploader = mobj.group(1) @@ -86,9 +83,12 @@ class MixcloudIE(InfoExtractor): else: # New path full_info_json = self._parse_json(self._html_search_regex( - r'', webpage, 'play info'), 'play info') + r'', + webpage, 'play info'), 'play info') for item in full_info_json: - item_data = try_get(item, lambda x: x['cloudcast']['data']['cloudcastLookup']) + item_data = try_get( + item, lambda x: x['cloudcast']['data']['cloudcastLookup'], + dict) if try_get(item_data, lambda x: x['streamInfo']['url']): info_json = item_data break @@ -100,13 +100,9 @@ class MixcloudIE(InfoExtractor): webpage, 'error message', default=None) js_url = self._search_regex( - r']+\bsrc=["\"](https://(?:www\.)?mixcloud\.com/media/js2/www_js_4\.[^>]+\.js)', - webpage, 'js url', default=None) - if js_url is None: - js_url = self._search_regex( - r']+\bsrc=["\"](https://(?:www\.)?mixcloud\.com/media/js/www\.[^>]+\.js)', - webpage, 'js url') - js = self._download_webpage(js_url, track_id) + r']+\bsrc=["\"](https://(?:www\.)?mixcloud\.com/media/(?:js2/www_js_4|js/www)\.[^>]+\.js)', + webpage, 'js url') + js = self._download_webpage(js_url, track_id, 'Downloading JS') # Known plaintext attack if encrypted_play_info: kps = ['{"stream_url":'] @@ -117,8 +113,9 @@ class MixcloudIE(InfoExtractor): for kp in kps: partial_key = self._decrypt_xor_cipher(kpa_target, kp) for quote in ["'", '"']: - key = self._search_regex(r'{0}({1}[^{0}]*){0}'.format(quote, re.escape(partial_key)), js, - "encryption key", default=None) + key = self._search_regex( + r'{0}({1}[^{0}]*){0}'.format(quote, re.escape(partial_key)), + js, 'encryption key', default=None) if key is not None: break else: @@ -153,23 +150,37 @@ class MixcloudIE(InfoExtractor): else: title = info_json['name'] - thumbnail = try_get(info_json, - lambda x: 'https://thumbnailer.mixcloud.com/unsafe/600x600/' + x['picture']['urlRoot']) + thumbnail = urljoin( + 'https://thumbnailer.mixcloud.com/unsafe/600x600/', + try_get(info_json, lambda x: x['picture']['urlRoot'], compat_str)) uploader = try_get(info_json, lambda x: x['owner']['displayName']) uploader_id = try_get(info_json, lambda x: x['owner']['username']) description = try_get(info_json, lambda x: x['description']) - view_count = try_get(info_json, lambda x: x['plays']) + view_count = int_or_none(try_get(info_json, lambda x: x['plays'])) stream_info = info_json['streamInfo'] formats = [] - self._decrypt_and_extend(stream_info, 'url', lambda x: [{ - 'format_id': 'normal', - 'url': x - }], key, formats) - self._decrypt_and_extend(stream_info, 'hlsUrl', lambda x: self._extract_m3u8_formats(x, title), key, - formats) - self._decrypt_and_extend(stream_info, 'dashUrl', lambda x: self._extract_mpd_formats(x, title), key, - formats) + + for url_key in ('url', 'hlsUrl', 'dashUrl'): + format_url = stream_info.get(url_key) + if not format_url: + continue + decrypted = self._decrypt_xor_cipher(key, base64.b64decode(format_url)) + if not decrypted: + continue + if url_key == 'hlsUrl': + formats.extend(self._extract_m3u8_formats( + decrypted, track_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + elif url_key == 'dashUrl': + formats.extend(self._extract_mpd_formats( + decrypted, track_id, mpd_id='dash', fatal=False)) + else: + formats.append({ + 'format_id': 'http', + 'url': decrypted, + }) + self._sort_formats(formats) return { 'id': track_id, -- cgit v1.1 From 9ce1ac404648142139f8b231c674d434ad4f9ffe Mon Sep 17 00:00:00 2001 From: kayb94 <30302445+kayb94@users.noreply.github.com> Date: Fri, 22 Sep 2017 22:49:48 +0000 Subject: [generic] Fix support for multiple HTML5 videos on one page (closes #14080) --- youtube_dl/extractor/generic.py | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index b83c183..7d0edf0 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1879,6 +1879,15 @@ class GenericIE(InfoExtractor): 'title': 'Building A Business Online: Principal Chairs Q & A', }, }, + { + # multiple HTML5 videos on one page + 'url': 'https://www.paragon-software.com/home/rk-free/keyscenarios.html', + 'info_dict': { + 'id': 'keyscenarios', + 'title': 'Rescue Kit 14 Free Edition - Getting started', + }, + 'playlist_count': 4, + } # { # # TODO: find another test # # http://schema.org/VideoObject @@ -2849,13 +2858,20 @@ class GenericIE(InfoExtractor): # Look for HTML5 media entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls') if entries: - for entry in entries: - entry.update({ + if len(entries) == 1: + entries[0].update({ 'id': video_id, 'title': video_title, }) + else: + for num, entry in enumerate(entries, start=1): + entry.update({ + 'id': '%s-%s' % (video_id, num), + 'title': '%s (%d)' % (video_title, num), + }) + for entry in entries: self._sort_formats(entry['formats']) - return self.playlist_result(entries) + return self.playlist_result(entries, video_id, video_title) jwplayer_data = self._find_jwplayer_data( webpage, video_id, transform_source=js_to_json) -- cgit v1.1 From 13de91c9e92bd831fee38fddbdabce7f6e82ef91 Mon Sep 17 00:00:00 2001 From: Dan Weber Date: Tue, 12 Sep 2017 22:52:54 -0400 Subject: [americastestkitchen] Add extractor (closes #10764) --- youtube_dl/extractor/americastestkitchen.py | 85 +++++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 1 + 2 files changed, 86 insertions(+) create mode 100755 youtube_dl/extractor/americastestkitchen.py diff --git a/youtube_dl/extractor/americastestkitchen.py b/youtube_dl/extractor/americastestkitchen.py new file mode 100755 index 0000000..f231e7f --- /dev/null +++ b/youtube_dl/extractor/americastestkitchen.py @@ -0,0 +1,85 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class AmericasTestKitchenIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?americastestkitchen\.com/episode/(?P\d+)' + _TESTS = [{ + 'url': + 'https://www.americastestkitchen.com/episode/548-summer-dinner-party', + 'md5': 'b861c3e365ac38ad319cfd509c30577f', + 'info_dict': { + 'id': '1_5g5zua6e', + 'title': 'atk_s17_e24.mp4', + 'ext': 'mp4', + 'description': '

Host Julia Collin Davison goes into the test kitchen with test cook Dan Souza to learn how to make the ultimate Grill-Roasted Beef Tenderloin. Next, equipment expert Adam Ried reviews gas grills in the Equipment Corner. Then, gadget guru Lisa McManus uncovers the best quirky gadgets. Finally, test cook Erin McMurrer shows host Bridget Lancaster how to make an elegant Pear-Walnut Upside-Down Cake.

', + 'timestamp': 1497285541, + 'upload_date': '20170612', + 'uploader_id': 'roger.metcalf@americastestkitchen.com', + 'release_date': '2017-06-17', + 'thumbnail': 'http://d3cizcpymoenau.cloudfront.net/images/35973/e24-tenderloin-16.jpg', + 'episode_number': 24, + 'episode': 'Summer Dinner Party', + 'episode_id': '548-summer-dinner-party', + 'season_number': 17 + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': + 'https://www.americastestkitchen.com/episode/546-a-spanish-affair', + 'only_matching': + True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + partner_id = self._search_regex( + r'partner_id/(?P\d+)', + webpage, + 'partner_id', + group='partner_id') + + video_data = self._parse_json( + self._search_regex( + r'window\.__INITIAL_STATE__\s*=\s*({.+?});\s*', + webpage, 'initial context'), + video_id) + + episode_data = video_data['episodeDetail']['content']['data'] + episode_content_meta = episode_data['full_video'] + external_id = episode_content_meta['external_id'] + + # photo data + photo_data = episode_content_meta.get('photo') + thumbnail = photo_data.get('image_url') if photo_data else None + + # meta + release_date = episode_data.get('aired_at') + description = episode_content_meta.get('description') + episode_number = int(episode_content_meta.get('episode_number')) + episode = episode_content_meta.get('title') + episode_id = episode_content_meta.get('episode_slug') + season_number = int(episode_content_meta.get('season_number')) + + return { + '_type': 'url_transparent', + 'url': 'kaltura:%s:%s' % (partner_id, external_id), + 'ie_key': 'Kaltura', + 'id': video_id, + 'release_date': release_date, + 'thumbnail': thumbnail, + 'description': description, + 'episode_number': episode_number, + 'episode': episode, + 'episode_id': episode_id, + 'season_number': season_number + } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index ab95c85..5853005 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -39,6 +39,7 @@ from .airmozilla import AirMozillaIE from .aljazeera import AlJazeeraIE from .alphaporno import AlphaPornoIE from .amcnetworks import AMCNetworksIE +from .americastestkitchen import AmericasTestKitchenIE from .animeondemand import AnimeOnDemandIE from .anitube import AnitubeIE from .anvato import AnvatoIE -- cgit v1.1 From 4bb58fa118a8c75b2ecf05f7b29a0ae27eef6239 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 23 Sep 2017 06:28:46 +0700 Subject: [americastestkitchen] Improve (closes #13996) --- youtube_dl/extractor/americastestkitchen.py | 82 ++++++++++++++--------------- 1 file changed, 41 insertions(+), 41 deletions(-) diff --git a/youtube_dl/extractor/americastestkitchen.py b/youtube_dl/extractor/americastestkitchen.py index f231e7f..0173687 100755 --- a/youtube_dl/extractor/americastestkitchen.py +++ b/youtube_dl/extractor/americastestkitchen.py @@ -1,85 +1,85 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor +from ..utils import ( + clean_html, + int_or_none, + try_get, + unified_strdate, +) class AmericasTestKitchenIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?americastestkitchen\.com/episode/(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?americastestkitchen\.com/(?:episode|videos)/(?P\d+)' _TESTS = [{ - 'url': - 'https://www.americastestkitchen.com/episode/548-summer-dinner-party', + 'url': 'https://www.americastestkitchen.com/episode/548-summer-dinner-party', 'md5': 'b861c3e365ac38ad319cfd509c30577f', 'info_dict': { 'id': '1_5g5zua6e', - 'title': 'atk_s17_e24.mp4', + 'title': 'Summer Dinner Party', 'ext': 'mp4', - 'description': '

Host Julia Collin Davison goes into the test kitchen with test cook Dan Souza to learn how to make the ultimate Grill-Roasted Beef Tenderloin. Next, equipment expert Adam Ried reviews gas grills in the Equipment Corner. Then, gadget guru Lisa McManus uncovers the best quirky gadgets. Finally, test cook Erin McMurrer shows host Bridget Lancaster how to make an elegant Pear-Walnut Upside-Down Cake.

', + 'description': 'md5:858d986e73a4826979b6a5d9f8f6a1ec', + 'thumbnail': r're:^https?://.*\.jpg', 'timestamp': 1497285541, 'upload_date': '20170612', 'uploader_id': 'roger.metcalf@americastestkitchen.com', - 'release_date': '2017-06-17', - 'thumbnail': 'http://d3cizcpymoenau.cloudfront.net/images/35973/e24-tenderloin-16.jpg', - 'episode_number': 24, + 'release_date': '20170617', + 'series': "America's Test Kitchen", + 'season_number': 17, 'episode': 'Summer Dinner Party', - 'episode_id': '548-summer-dinner-party', - 'season_number': 17 + 'episode_number': 24, }, 'params': { - # m3u8 download 'skip_download': True, }, }, { - 'url': - 'https://www.americastestkitchen.com/episode/546-a-spanish-affair', - 'only_matching': - True, + 'url': 'https://www.americastestkitchen.com/videos/3420-pan-seared-salmon', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) partner_id = self._search_regex( - r'partner_id/(?P\d+)', - webpage, - 'partner_id', - group='partner_id') + r'src=["\'](?:https?:)?//(?:[^/]+\.)kaltura\.com/(?:[^/]+/)*(?:p|partner_id)/(\d+)', + webpage, 'kaltura partner id') video_data = self._parse_json( self._search_regex( - r'window\.__INITIAL_STATE__\s*=\s*({.+?});\s*', + r'window\.__INITIAL_STATE__\s*=\s*({.+?})\s*;\s*', webpage, 'initial context'), video_id) - episode_data = video_data['episodeDetail']['content']['data'] - episode_content_meta = episode_data['full_video'] - external_id = episode_content_meta['external_id'] + ep_data = try_get( + video_data, + (lambda x: x['episodeDetail']['content']['data'], + lambda x: x['videoDetail']['content']['data']), dict) + ep_meta = ep_data.get('full_video', {}) + external_id = ep_data.get('external_id') or ep_meta['external_id'] - # photo data - photo_data = episode_content_meta.get('photo') - thumbnail = photo_data.get('image_url') if photo_data else None + title = ep_data.get('title') or ep_meta.get('title') + description = clean_html(ep_meta.get('episode_description') or ep_data.get( + 'description') or ep_meta.get('description')) + thumbnail = try_get(ep_meta, lambda x: x['photo']['image_url']) + release_date = unified_strdate(ep_data.get('aired_at')) - # meta - release_date = episode_data.get('aired_at') - description = episode_content_meta.get('description') - episode_number = int(episode_content_meta.get('episode_number')) - episode = episode_content_meta.get('title') - episode_id = episode_content_meta.get('episode_slug') - season_number = int(episode_content_meta.get('season_number')) + season_number = int_or_none(ep_meta.get('season_number')) + episode = ep_meta.get('title') + episode_number = int_or_none(ep_meta.get('episode_number')) return { '_type': 'url_transparent', 'url': 'kaltura:%s:%s' % (partner_id, external_id), 'ie_key': 'Kaltura', - 'id': video_id, - 'release_date': release_date, - 'thumbnail': thumbnail, + 'title': title, 'description': description, - 'episode_number': episode_number, + 'thumbnail': thumbnail, + 'release_date': release_date, + 'series': "America's Test Kitchen", + 'season_number': season_number, 'episode': episode, - 'episode_id': episode_id, - 'season_number': season_number + 'episode_number': episode_number, } -- cgit v1.1 From 5c1452e8f1e744db14be1baef840e9f531e8f144 Mon Sep 17 00:00:00 2001 From: Giuseppe Fabiano Date: Sat, 23 Sep 2017 01:38:09 +0200 Subject: [twitter] Add support for user_id-less URLs (closes #14270) --- youtube_dl/extractor/twitter.py | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index 7399cf5..0df3ad7 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -242,8 +242,9 @@ class TwitterCardIE(TwitterBaseIE): class TwitterIE(InfoExtractor): IE_NAME = 'twitter' - _VALID_URL = r'https?://(?:www\.|m\.|mobile\.)?twitter\.com/(?P[^/]+)/status/(?P\d+)' + _VALID_URL = r'https?://(?:www\.|m\.|mobile\.)?twitter\.com/(?:i/web|(?P[^/]+))/status/(?P\d+)' _TEMPLATE_URL = 'https://twitter.com/%s/status/%s' + _TEMPLATE_STATUSES_URL = 'https://twitter.com/statuses/%s' _TESTS = [{ 'url': 'https://twitter.com/freethenipple/status/643211948184596480', @@ -322,9 +323,9 @@ class TwitterIE(InfoExtractor): 'info_dict': { 'id': 'MIOxnrUteUd', 'ext': 'mp4', - 'title': 'FilmDrunk - Vine of the day', - 'description': 'FilmDrunk on Twitter: "Vine of the day https://t.co/xmTvRdqxWf"', - 'uploader': 'FilmDrunk', + 'title': 'Vince Mancini - Vine of the day', + 'description': 'Vince Mancini on Twitter: "Vine of the day https://t.co/xmTvRdqxWf"', + 'uploader': 'Vince Mancini', 'uploader_id': 'Filmdrunk', 'timestamp': 1402826626, 'upload_date': '20140615', @@ -372,6 +373,21 @@ class TwitterIE(InfoExtractor): 'params': { 'format': 'best[format_id^=http-]', }, + }, { + 'url': 'https://twitter.com/i/web/status/910031516746514432', + 'info_dict': { + 'id': '910031516746514432', + 'ext': 'mp4', + 'title': 'Préfet de Guadeloupe - [Direct] #Maria Le centre se trouve actuellement au sud de Basse-Terre. Restez confinés. Réfugiez-vous dans la pièce la + sûre.', + 'thumbnail': r're:^https?://.*\.jpg', + 'description': 'Préfet de Guadeloupe on Twitter: "[Direct] #Maria Le centre se trouve actuellement au sud de Basse-Terre. Restez confinés. Réfugiez-vous dans la pièce la + sûre. https://t.co/mwx01Rs4lo"', + 'uploader': 'Préfet de Guadeloupe', + 'uploader_id': 'Prefet971', + 'duration': 47.48, + }, + 'params': { + 'skip_download': True, # requires ffmpeg + }, }] def _real_extract(self, url): @@ -380,11 +396,15 @@ class TwitterIE(InfoExtractor): twid = mobj.group('id') webpage, urlh = self._download_webpage_handle( - self._TEMPLATE_URL % (user_id, twid), twid) + self._TEMPLATE_STATUSES_URL % twid, twid) if 'twitter.com/account/suspended' in urlh.geturl(): raise ExtractorError('Account suspended by Twitter.', expected=True) + if user_id is None: + mobj = re.match(self._VALID_URL, urlh.geturl()) + user_id = mobj.group('user_id') + username = remove_end(self._og_search_title(webpage), ' on Twitter') title = description = self._og_search_description(webpage).strip('').replace('\n', ' ').strip('“”') -- cgit v1.1 From 1c22d7a7f30917abfd2b7495f7bd02d51cb8528a Mon Sep 17 00:00:00 2001 From: Namnamseo <0201ssw+github@gmail.com> Date: Thu, 24 Aug 2017 11:32:24 +0900 Subject: [kakao] Add extractor (closes #12298) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/kakao.py | 140 +++++++++++++++++++++++++++++++++++++ 2 files changed, 141 insertions(+) create mode 100644 youtube_dl/extractor/kakao.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 5853005..4232a4f 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -483,6 +483,7 @@ from .jove import JoveIE from .joj import JojIE from .jwplatform import JWPlatformIE from .jpopsukitv import JpopsukiIE +from .kakao import KakaoIE from .kaltura import KalturaIE from .kamcord import KamcordIE from .kanalplay import KanalPlayIE diff --git a/youtube_dl/extractor/kakao.py b/youtube_dl/extractor/kakao.py new file mode 100644 index 0000000..0caa41e --- /dev/null +++ b/youtube_dl/extractor/kakao.py @@ -0,0 +1,140 @@ +# coding: utf-8 + +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + compat_str, + unified_timestamp, +) + + +class KakaoIE(InfoExtractor): + _VALID_URL = r'https?://tv.kakao.com/channel/(?P\d+)/cliplink/(?P\d+)' + IE_NAME = 'kakao.com' + + _TESTS = [{ + 'url': 'http://tv.kakao.com/channel/2671005/cliplink/301965083', + 'md5': '702b2fbdeb51ad82f5c904e8c0766340', + 'info_dict': { + 'id': '301965083', + 'ext': 'mp4', + 'title': '乃木坂46 バナナマン 「3期生紹介コーナーが始動!顔高低差GPも!」 『乃木坂工事中』', + 'uploader_id': 2671005, + 'uploader': '그랑그랑이', + 'timestamp': 1488160199, + 'upload_date': '20170227', + } + }, { + 'url': 'http://tv.kakao.com/channel/2653210/cliplink/300103180', + 'md5': 'a8917742069a4dd442516b86e7d66529', + 'info_dict': { + 'id': '300103180', + 'ext': 'mp4', + 'description': '러블리즈 - Destiny (나의 지구) (Lovelyz - Destiny)\r\n\r\n[쇼! 음악중심] 20160611, 507회', + 'title': '러블리즈 - Destiny (나의 지구) (Lovelyz - Destiny)', + 'uploader_id': 2653210, + 'uploader': '쇼 음악중심', + 'timestamp': 1485684628, + 'upload_date': '20170129', + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + player_url = 'http://tv.kakao.com/embed/player/cliplink/' + video_id + \ + '?service=kakao_tv&autoplay=1&profile=HIGH&wmode=transparent' + player_header = {'Referer': player_url} + + impress = self._download_json( + 'http://tv.kakao.com/api/v1/ft/cliplinks/%s/impress' % video_id, + video_id, 'Downloading video info', + query={ + 'player': 'monet_html5', + 'referer': url, + 'uuid': '', + 'service': 'kakao_tv', + 'section': '', + 'dteType': 'PC', + 'fields': 'clipLink,clip,channel,hasPlusFriend,-service,-tagList' + }, headers=player_header) + + clipLink = impress['clipLink'] + clip = clipLink['clip'] + + video_info = { + 'id': video_id, + 'title': clip['title'], + 'description': clip.get('description'), + 'uploader': clipLink.get('channel', {}).get('name'), + 'uploader_id': clipLink.get('channelId'), + 'duration': int_or_none(clip.get('duration')), + 'view_count': int_or_none(clip.get('playCount')), + 'like_count': int_or_none(clip.get('likeCount')), + 'comment_count': int_or_none(clip.get('commentCount')), + } + + tid = impress.get('tid', '') + raw = self._download_json( + 'http://tv.kakao.com/api/v1/ft/cliplinks/%s/raw' % video_id, + video_id, 'Downloading video formats info', + query={ + 'player': 'monet_html5', + 'referer': url, + 'uuid': '', + 'service': 'kakao_tv', + 'section': '', + 'tid': tid, + 'profile': 'HIGH', + 'dteType': 'PC', + }, headers=player_header, fatal=False) + + formats = [] + for fmt in raw.get('outputList', []): + try: + profile_name = fmt['profile'] + fmt_url_json = self._download_json( + 'http://tv.kakao.com/api/v1/ft/cliplinks/%s/raw/videolocation' % video_id, + video_id, 'Downloading video URL for profile %s' % profile_name, + query={ + 'service': 'kakao_tv', + 'section': '', + 'tid': tid, + 'profile': profile_name + }, headers=player_header, fatal=False) + + if fmt_url_json is None: + continue + + fmt_url = fmt_url_json['url'] + formats.append({ + 'url': fmt_url, + 'format_id': profile_name, + 'width': int_or_none(fmt.get('width')), + 'height': int_or_none(fmt.get('height')), + 'format_note': fmt.get('label'), + 'filesize': int_or_none(fmt.get('filesize')) + }) + except KeyError: + pass + + self._sort_formats(formats) + video_info['formats'] = formats + + top_thumbnail = clip.get('thumbnailUrl') + thumbs = [] + for thumb in clip.get('clipChapterThumbnailList', []): + thumbs.append({ + 'url': thumb.get('thumbnailUrl'), + 'id': compat_str(thumb.get('timeInSec')), + 'preference': -1 if thumb.get('isDefault') else 0 + }) + video_info['thumbnail'] = top_thumbnail + video_info['thumbnails'] = thumbs + + upload_date = unified_timestamp(clipLink.get('createTime')) + video_info['timestamp'] = upload_date + + return video_info -- cgit v1.1 From f70ddd4aebbfb0bdf2f63c1eba5b5614d2cfb70f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 23 Sep 2017 07:25:15 +0700 Subject: [kakao] Improve (closes #14007) --- youtube_dl/extractor/kakao.py | 113 +++++++++++++++++++++++------------------- 1 file changed, 61 insertions(+), 52 deletions(-) diff --git a/youtube_dl/extractor/kakao.py b/youtube_dl/extractor/kakao.py index 0caa41e..c9b438e 100644 --- a/youtube_dl/extractor/kakao.py +++ b/youtube_dl/extractor/kakao.py @@ -3,16 +3,17 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( int_or_none, - compat_str, unified_timestamp, + update_url_query, ) class KakaoIE(InfoExtractor): _VALID_URL = r'https?://tv.kakao.com/channel/(?P\d+)/cliplink/(?P\d+)' - IE_NAME = 'kakao.com' + _API_BASE = 'http://tv.kakao.com/api/v1/ft/cliplinks' _TESTS = [{ 'url': 'http://tv.kakao.com/channel/2671005/cliplink/301965083', @@ -44,60 +45,57 @@ class KakaoIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - player_url = 'http://tv.kakao.com/embed/player/cliplink/' + video_id + \ - '?service=kakao_tv&autoplay=1&profile=HIGH&wmode=transparent' - player_header = {'Referer': player_url} + player_header = { + 'Referer': update_url_query( + 'http://tv.kakao.com/embed/player/cliplink/%s' % video_id, { + 'service': 'kakao_tv', + 'autoplay': '1', + 'profile': 'HIGH', + 'wmode': 'transparent', + }) + } + + QUERY_COMMON = { + 'player': 'monet_html5', + 'referer': url, + 'uuid': '', + 'service': 'kakao_tv', + 'section': '', + 'dteType': 'PC', + } + query = QUERY_COMMON.copy() + query['fields'] = 'clipLink,clip,channel,hasPlusFriend,-service,-tagList' impress = self._download_json( - 'http://tv.kakao.com/api/v1/ft/cliplinks/%s/impress' % video_id, + '%s/%s/impress' % (self._API_BASE, video_id), video_id, 'Downloading video info', - query={ - 'player': 'monet_html5', - 'referer': url, - 'uuid': '', - 'service': 'kakao_tv', - 'section': '', - 'dteType': 'PC', - 'fields': 'clipLink,clip,channel,hasPlusFriend,-service,-tagList' - }, headers=player_header) - - clipLink = impress['clipLink'] - clip = clipLink['clip'] - - video_info = { - 'id': video_id, - 'title': clip['title'], - 'description': clip.get('description'), - 'uploader': clipLink.get('channel', {}).get('name'), - 'uploader_id': clipLink.get('channelId'), - 'duration': int_or_none(clip.get('duration')), - 'view_count': int_or_none(clip.get('playCount')), - 'like_count': int_or_none(clip.get('likeCount')), - 'comment_count': int_or_none(clip.get('commentCount')), - } + query=query, headers=player_header) + + clip_link = impress['clipLink'] + clip = clip_link['clip'] + + title = clip.get('title') or clip_link.get('displayTitle') tid = impress.get('tid', '') + + query = QUERY_COMMON.copy() + query.update({ + 'tid': tid, + 'profile': 'HIGH', + }) raw = self._download_json( - 'http://tv.kakao.com/api/v1/ft/cliplinks/%s/raw' % video_id, + '%s/%s/raw' % (self._API_BASE, video_id), video_id, 'Downloading video formats info', - query={ - 'player': 'monet_html5', - 'referer': url, - 'uuid': '', - 'service': 'kakao_tv', - 'section': '', - 'tid': tid, - 'profile': 'HIGH', - 'dteType': 'PC', - }, headers=player_header, fatal=False) + query=query, headers=player_header) formats = [] for fmt in raw.get('outputList', []): try: profile_name = fmt['profile'] fmt_url_json = self._download_json( - 'http://tv.kakao.com/api/v1/ft/cliplinks/%s/raw/videolocation' % video_id, - video_id, 'Downloading video URL for profile %s' % profile_name, + '%s/%s/raw/videolocation' % (self._API_BASE, video_id), + video_id, + 'Downloading video URL for profile %s' % profile_name, query={ 'service': 'kakao_tv', 'section': '', @@ -119,11 +117,8 @@ class KakaoIE(InfoExtractor): }) except KeyError: pass - self._sort_formats(formats) - video_info['formats'] = formats - top_thumbnail = clip.get('thumbnailUrl') thumbs = [] for thumb in clip.get('clipChapterThumbnailList', []): thumbs.append({ @@ -131,10 +126,24 @@ class KakaoIE(InfoExtractor): 'id': compat_str(thumb.get('timeInSec')), 'preference': -1 if thumb.get('isDefault') else 0 }) - video_info['thumbnail'] = top_thumbnail - video_info['thumbnails'] = thumbs - - upload_date = unified_timestamp(clipLink.get('createTime')) - video_info['timestamp'] = upload_date + top_thumbnail = clip.get('thumbnailUrl') + if top_thumbnail: + thumbs.append({ + 'url': top_thumbnail, + 'preference': 10, + }) - return video_info + return { + 'id': video_id, + 'title': title, + 'description': clip.get('description'), + 'uploader': clip_link.get('channel', {}).get('name'), + 'uploader_id': clip_link.get('channelId'), + 'thumbnails': thumbs, + 'timestamp': unified_timestamp(clip_link.get('createTime')), + 'duration': int_or_none(clip.get('duration')), + 'view_count': int_or_none(clip.get('playCount')), + 'like_count': int_or_none(clip.get('likeCount')), + 'comment_count': int_or_none(clip.get('commentCount')), + 'formats': formats, + } -- cgit v1.1 From 7f4921b38d10c17fe354bab20b741b362c5ae0aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 23 Sep 2017 07:26:40 +0700 Subject: [heise] PEP 8 --- youtube_dl/extractor/heise.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/heise.py b/youtube_dl/extractor/heise.py index 495ffb7..82e11a7 100644 --- a/youtube_dl/extractor/heise.py +++ b/youtube_dl/extractor/heise.py @@ -60,8 +60,8 @@ class HeiseIE(InfoExtractor): title = self._html_search_meta('fulltitle', webpage, default=None) if not title or title == "c't": title = self._search_regex( - r']+class="videoplayerjw"[^>]+data-title="([^"]+)"', - webpage, 'title') + r']+class="videoplayerjw"[^>]+data-title="([^"]+)"', + webpage, 'title') yt_urls = YoutubeIE._extract_urls(webpage) if yt_urls: -- cgit v1.1 From 136507b39a2b48cb775249e9724eeeedb56baed2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 23 Sep 2017 07:41:22 +0700 Subject: [24video] Add support for 24video.adult (closes #14295) --- youtube_dl/extractor/twentyfourvideo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/twentyfourvideo.py b/youtube_dl/extractor/twentyfourvideo.py index 7af1165..cc51ca0 100644 --- a/youtube_dl/extractor/twentyfourvideo.py +++ b/youtube_dl/extractor/twentyfourvideo.py @@ -14,7 +14,7 @@ from ..utils import ( class TwentyFourVideoIE(InfoExtractor): IE_NAME = '24video' - _VALID_URL = r'https?://(?P(?:www\.)?24video\.(?:net|me|xxx|sex|tube))/(?:video/(?:view|xml)/|player/new24_play\.swf\?id=)(?P\d+)' + _VALID_URL = r'https?://(?P(?:www\.)?24video\.(?:net|me|xxx|sex|tube|adult))/(?:video/(?:view|xml)/|player/new24_play\.swf\?id=)(?P\d+)' _TESTS = [{ 'url': 'http://www.24video.net/video/view/1044982', -- cgit v1.1 From e3440d824a7326d0ba609d8f0896203208ecc558 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 23 Sep 2017 07:42:17 +0700 Subject: [24video] Fix timestamp extraction and make non fatal (#14295) --- youtube_dl/extractor/twentyfourvideo.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/twentyfourvideo.py b/youtube_dl/extractor/twentyfourvideo.py index cc51ca0..96e0b96 100644 --- a/youtube_dl/extractor/twentyfourvideo.py +++ b/youtube_dl/extractor/twentyfourvideo.py @@ -60,8 +60,8 @@ class TwentyFourVideoIE(InfoExtractor): duration = int_or_none(self._og_search_property( 'duration', webpage, 'duration', fatal=False)) timestamp = parse_iso8601(self._search_regex( - r'