[motherless:group] Relax entry extraction and add a fallback scenario

author: Sergey M․ <dstftw@gmail.com> 2018-01-07 00:31:53 +0700
committer: Sergey M․ <dstftw@gmail.com> 2018-01-07 00:31:53 +0700
commit: 0a5b1295b7c1aa6395b65ee137087c540b37b32b (patch)
tree: 296c9dd21a73afc359c5c6d6ceed9b43673d6b3e /youtube_dl/extractor/motherless.py
parent: a133eb7764594b830cb975e3925972214e932704 (diff)
1 file changed, 21 insertions, 8 deletions
diff --git a/youtube_dl/extractor/motherless.py b/youtube_dl/extractor/motherless.py
index 4adac691c..e24396e79 100644
--- a/youtube_dl/extractor/motherless.py
+++ b/youtube_dl/extractor/motherless.py
@@ -148,14 +148,27 @@ class MotherlessGroupIE(InfoExtractor):
                 else super(MotherlessGroupIE, cls).suitable(url))
 
     def _extract_entries(self, webpage, base):
-        return [
-            self.url_result(
-                compat_urlparse.urljoin(base, video_path),
-                MotherlessIE.ie_key(), video_title=title)
-            for video_path, title in orderedSet(re.findall(
-                r'href="(/[^"]+)"[^>]+>\s+<img[^>]+alt="[^-]+-\s([^"]+)"',
-                webpage))
-        ]
+        entries = []
+        for mobj in re.finditer(
+                r'href="(?P<href>/[^"]+)"[^>]*>(?:\s*<img[^>]+alt="[^-]+-\s(?P<title>[^"]+)")?',
+                webpage):
+            video_url = compat_urlparse.urljoin(base, mobj.group('href'))
+            if not MotherlessIE.suitable(video_url):
+                continue
+            video_id = MotherlessIE._match_id(video_url)
+            title = mobj.group('title')
+            entries.append(self.url_result(
+                video_url, ie=MotherlessIE.ie_key(), video_id=video_id,
+                video_title=title))
+        # Alternative fallback
+        if not entries:
+            entries = [
+                self.url_result(
+                    compat_urlparse.urljoin(base, '/' + video_id),
+                    ie=MotherlessIE.ie_key(), video_id=video_id)
+                for video_id in orderedSet(re.findall(
+                    r'data-codename=["\']([A-Z0-9]+)', webpage))]
+        return entries
 
     def _real_extract(self, url):
         group_id = self._match_id(url)
author	Sergey M․ <dstftw@gmail.com>	2018-01-07 00:31:53 +0700
committer	Sergey M․ <dstftw@gmail.com>	2018-01-07 00:31:53 +0700
commit	0a5b1295b7c1aa6395b65ee137087c540b37b32b (patch)
tree	296c9dd21a73afc359c5c6d6ceed9b43673d6b3e /youtube_dl/extractor/motherless.py
parent	a133eb7764594b830cb975e3925972214e932704 (diff)