diff options
author | Sergey M․ <dstftw@gmail.com> | 2018-01-07 00:31:53 +0700 |
---|---|---|
committer | Sergey M․ <dstftw@gmail.com> | 2018-01-07 00:31:53 +0700 |
commit | 0a5b1295b7c1aa6395b65ee137087c540b37b32b (patch) | |
tree | 296c9dd21a73afc359c5c6d6ceed9b43673d6b3e | |
parent | a133eb7764594b830cb975e3925972214e932704 (diff) | |
download | youtube-dl-0a5b1295b7c1aa6395b65ee137087c540b37b32b.zip youtube-dl-0a5b1295b7c1aa6395b65ee137087c540b37b32b.tar.gz youtube-dl-0a5b1295b7c1aa6395b65ee137087c540b37b32b.tar.bz2 |
[motherless:group] Relax entry extraction and add a fallback scenario
-rw-r--r-- | youtube_dl/extractor/motherless.py | 29 |
1 file changed, 21 insertions, 8 deletions
```diff
diff --git a/youtube_dl/extractor/motherless.py b/youtube_dl/extractor/motherless.py
index 4adac69..e24396e 100644
--- a/youtube_dl/extractor/motherless.py
+++ b/youtube_dl/extractor/motherless.py
@@ -148,14 +148,27 @@ class MotherlessGroupIE(InfoExtractor):
                 else super(MotherlessGroupIE, cls).suitable(url))
 
     def _extract_entries(self, webpage, base):
-        return [
-            self.url_result(
-                compat_urlparse.urljoin(base, video_path),
-                MotherlessIE.ie_key(), video_title=title)
-            for video_path, title in orderedSet(re.findall(
-                r'href="(/[^"]+)"[^>]+>\s+<img[^>]+alt="[^-]+-\s([^"]+)"',
-                webpage))
-        ]
+        entries = []
+        for mobj in re.finditer(
+                r'href="(?P<href>/[^"]+)"[^>]*>(?:\s*<img[^>]+alt="[^-]+-\s(?P<title>[^"]+)")?',
+                webpage):
+            video_url = compat_urlparse.urljoin(base, mobj.group('href'))
+            if not MotherlessIE.suitable(video_url):
+                continue
+            video_id = MotherlessIE._match_id(video_url)
+            title = mobj.group('title')
+            entries.append(self.url_result(
+                video_url, ie=MotherlessIE.ie_key(), video_id=video_id,
+                video_title=title))
+        # Alternative fallback
+        if not entries:
+            entries = [
+                self.url_result(
+                    compat_urlparse.urljoin(base, '/' + video_id),
+                    ie=MotherlessIE.ie_key(), video_id=video_id)
+                for video_id in orderedSet(re.findall(
+                    r'data-codename=["\']([A-Z0-9]+)', webpage))]
+        return entries
 
     def _real_extract(self, url):
         group_id = self._match_id(url)
```