diff options
author | Sergey M․ <dstftw@gmail.com> | 2021-01-05 07:40:06 +0700 |
---|---|---|
committer | Sergey M․ <dstftw@gmail.com> | 2021-01-05 07:40:30 +0700 |
commit | ecae54a98d2a8d9300142bf3d586f31e8144ccd6 (patch) | |
tree | 55b35e12aeacfbf9a26d08cf8cb72decf2fa6261 | |
parent | f318882955b90bead8206ee411641e65037b1011 (diff) |
[motherless] Fix review issues and improve extraction (closes #26495, closes #27450)
-rw-r--r-- | youtube_dl/extractor/motherless.py | 52 |
1 files changed, 34 insertions, 18 deletions
```diff
diff --git a/youtube_dl/extractor/motherless.py b/youtube_dl/extractor/motherless.py
index 6cc36b308..ef1e081f2 100644
--- a/youtube_dl/extractor/motherless.py
+++ b/youtube_dl/extractor/motherless.py
@@ -61,6 +61,23 @@ class MotherlessIE(InfoExtractor):
         # no keywords
         'url': 'http://motherless.com/8B4BBC1',
         'only_matching': True,
+    }, {
+        # see https://motherless.com/videos/recent for recent videos with
+        # uploaded date in "ago" format
+        'url': 'https://motherless.com/3C3E2CF',
+        'info_dict': {
+            'id': '3C3E2CF',
+            'ext': 'mp4',
+            'title': 'a/ Hot Teens',
+            'categories': list,
+            'upload_date': '20210104',
+            'uploader_id': 'yonbiw',
+            'thumbnail': r're:https?://.*\.jpg',
+            'age_limit': 18,
+        },
+        'params': {
+            'skip_download': True,
+        },
     }]
 
     def _real_extract(self, url):
@@ -85,29 +102,28 @@ class MotherlessIE(InfoExtractor):
             or 'http://cdn4.videos.motherlessmedia.com/videos/%s.mp4?fs=opencloud' % video_id)
         age_limit = self._rta_search(webpage)
         view_count = str_to_int(self._html_search_regex(
-            (r'>([\d,.]+)\s+Views<',  # 1,234,567 Views
-             r'<strong>Views</strong>\s+([^<]+)<'),
+            (r'>([\d,.]+)\s+Views<', r'<strong>Views</strong>\s+([^<]+)<'),
             webpage, 'view count', fatal=False))
         like_count = str_to_int(self._html_search_regex(
-            (r'>([\d,.]+)\s+Favorites<',  # 1,234 Favorites
+            (r'>([\d,.]+)\s+Favorites<',
              r'<strong>Favorited</strong>\s+([^<]+)<'),
             webpage, 'like count', fatal=False))
 
-        upload_date = self._html_search_regex(
-            (r'class=["\']count[^>]+>(\d+\s+[a-zA-Z]{3}\s+\d{4})<',
-             r'class=["\']count[^>]+>(\d+[hd])\s+[aA]go<',  # 20h/1d ago
-             r'<strong>Uploaded</strong>\s+([^<]+)<'), webpage, 'upload date')
-        relative = re.match(r'(\d+)([hd])$', upload_date)
-        if relative:
-            delta = int(relative.group(1))
-            unit = relative.group(2)
-            if unit == 'h':
-                delta_t = datetime.timedelta(hours=delta)
-            else:  # unit == 'd'
-                delta_t = datetime.timedelta(days=delta)
-            upload_date = (datetime.datetime.now() - delta_t).strftime('%Y%m%d')
-        else:
-            upload_date = unified_strdate(upload_date)
+        upload_date = unified_strdate(self._search_regex(
+            r'class=["\']count[^>]+>(\d+\s+[a-zA-Z]{3}\s+\d{4})<', webpage,
+            'upload date', default=None))
+        if not upload_date:
+            uploaded_ago = self._search_regex(
+                r'>\s*(\d+[hd])\s+[aA]go\b', webpage, 'uploaded ago',
+                default=None)
+            if uploaded_ago:
+                delta = int(uploaded_ago[:-1])
+                _AGO_UNITS = {
+                    'h': 'hours',
+                    'd': 'days',
+                }
+                kwargs = {_AGO_UNITS.get(uploaded_ago[-1]): delta}
+                upload_date = (datetime.datetime.utcnow() - datetime.timedelta(**kwargs)).strftime('%Y%m%d')
 
         comment_count = webpage.count('class="media-comment-contents"')
         uploader_id = self._html_search_regex(
```