diff options
| -rw-r--r-- | youtube_dl/extractor/extractors.py | 10 | ||||
| -rw-r--r-- | youtube_dl/extractor/youporn.py | 438 | 
2 files changed, 447 insertions, 1 deletions
| diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index a56a7c52f..03d035a27 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1653,7 +1653,15 @@ from .younow import (      YouNowChannelIE,      YouNowMomentIE,  ) -from .youporn import YouPornIE +from .youporn import ( +    YouPornIE, +    YouPornCategoryIE, +    YouPornChannelIE, +    YouPornCollectionIE, +    YouPornStarIE, +    YouPornTagIE, +    YouPornVideosIE, +)  from .yourporn import YourPornIE  from .yourupload import YourUploadIE  from .youtube import ( diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py index 8f2175dde..ec6125a79 100644 --- a/youtube_dl/extractor/youporn.py +++ b/youtube_dl/extractor/youporn.py @@ -1,21 +1,26 @@  # coding: utf-8  from __future__ import unicode_literals +import itertools  import re +from time import sleep  from .common import InfoExtractor  from ..utils import (      clean_html,      extract_attributes,      ExtractorError, +    get_element_by_class,      get_element_by_id,      int_or_none,      merge_dicts,      parse_count, +    parse_qs,      T,      traverse_obj,      unified_strdate,      url_or_none, +    urljoin,  ) @@ -254,3 +259,436 @@ class YouPornIE(InfoExtractor):                  'Watch %s online' % (result['title'],)):              del result['description']          return result + + +class YouPornListBase(InfoExtractor): +    # pattern in '.title-text' element of page section containing videos +    _PLAYLIST_TITLEBAR_RE = r'\s+[Vv]ideos\s*$' +    _PAGE_RETRY_COUNT = 0  # ie, no retry +    _PAGE_RETRY_DELAY = 2  # seconds + +    def _get_next_url(self, url, pl_id, html): +        return urljoin(url, self._search_regex( +            r'''<a\s[^>]*?\bhref\s*=\s*("|')(?P<url>(?:(?!\1)[^>])+)\1''', +            get_element_by_id('next', html) or '', 'next page', +            group='url', default=None)) + +    @classmethod +    def _get_title_from_slug(cls, title_slug): +        return re.sub(r'[_-]', ' ', title_slug) + +    def _entries(self, url, pl_id, html=None, page_num=None): + +        # separates page sections +        PLAYLIST_SECTION_RE = ( +            r'''<div\s[^>]*\bclass\s*=\s*('|")(?:[\w$-]+\s+|\s)*?title-bar(?:\s+[\w$-]+|\s)*\1[^>]*>''' +        ) +        # contains video link +        VIDEO_URL_RE = r'''(?x) +            <div\s[^>]*\bdata-video-id\s*=\s*('|")\d+\1[^>]*>\s* +            (?:<div\b[\s\S]+?</div>\s*)* +            <a\s[^>]*\bhref\s*=\s*('|")(?P<url>(?:(?!\2)[^>])+)\2 +        ''' + +        def yield_pages(url, html=html, page_num=page_num): +            fatal = not html +            for pnum in itertools.count(start=page_num or 1): +                if not html: +                    html = self._download_webpage( +                        url, pl_id, note='Downloading page %d' % pnum, +                        fatal=fatal) +                if not html: +                    break +                fatal = False +                yield (url, html, pnum) +                # explicit page: extract just that page +                if page_num is not None: +                    break +                next_url = self._get_next_url(url, pl_id, html) +                if not next_url or next_url == url: +                    break +                url, html = next_url, None + +        def retry_page(msg, tries_left, page_data): +            if tries_left <= 0: +                return +            self.report_warning(msg, pl_id) +            sleep(self._PAGE_RETRY_DELAY) +            return next( +                yield_pages(page_data[0], page_num=page_data[2]), None) + +        def yield_entries(html): +            for frag in re.split(PLAYLIST_SECTION_RE, html): +                if not frag: +                    continue +                t_text = get_element_by_class('title-text', frag or '') +                if not (t_text and re.search(self._PLAYLIST_TITLEBAR_RE, t_text)): +                    continue +                for m in re.finditer(VIDEO_URL_RE, frag): +                    video_url = urljoin(url, m.group('url')) +                    if video_url: +                        yield self.url_result(video_url) + +        last_first_url = None +        for page_data in yield_pages(url, html=html, page_num=page_num): +            # page_data: url, html, page_num +            first_url = None +            tries_left = self._PAGE_RETRY_COUNT + 1 +            while tries_left > 0: +                tries_left -= 1 +                for from_ in yield_entries(page_data[1]): +                    # may get the same page twice instead of empty page +                    # or (site bug) intead of actual next page +                    if not first_url: +                        first_url = from_['url'] +                        if first_url == last_first_url: +                            # sometimes (/porntags/) the site serves the previous page +                            # instead but may provide the correct page after a delay +                            page_data = retry_page( +                                'Retrying duplicate page...', tries_left, page_data) +                            if page_data: +                                first_url = None +                                break +                            continue +                    yield from_ +                else: +                    if not first_url and 'no-result-paragarph1' in page_data[1]: +                        page_data = retry_page( +                            'Retrying empty page...', tries_left, page_data) +                        if page_data: +                            continue +                    else: +                        # success/failure +                        break +            # may get an infinite (?) sequence of empty pages +            if not first_url: +                break +            last_first_url = first_url + +    def _real_extract(self, url, html=None): +        # exceptionally, id may be None +        m_dict = self._match_valid_url(url).groupdict() +        pl_id, page_type, sort = (m_dict.get(k) for k in ('id', 'type', 'sort')) + +        qs = parse_qs(url) +        for q, v in qs.items(): +            if v: +                qs[q] = v[-1] +            else: +                del qs[q] + +        base_id = pl_id or 'YouPorn' +        title = self._get_title_from_slug(base_id) +        if page_type: +            title = '%s %s' % (page_type.capitalize(), title) +        base_id = [base_id.lower()] +        if sort is None: +            title += ' videos' +        else: +            title = '%s videos by %s' % (title, re.sub(r'[_-]', ' ', sort)) +            base_id.append(sort) +        if qs: +            ps = ['%s=%s' % item for item in sorted(qs.items())] +            title += ' (%s)' % ','.join(ps) +            base_id.extend(ps) +        pl_id = '/'.join(base_id) + +        return self.playlist_result( +            self._entries(url, pl_id, html=html, +                          page_num=int_or_none(qs.get('page'))), +            playlist_id=pl_id, playlist_title=title) + + +class YouPornCategoryIE(YouPornListBase): +    IE_DESC = 'YouPorn category, with sorting, filtering and pagination' +    _VALID_URL = r'''(?x) +        https?://(?:www\.)?youporn\.com/ +        (?P<type>category)/(?P<id>[^/?#&]+) +        (?:/(?P<sort>popular|views|rating|time|duration))?/?(?:[#?]|$) +    ''' +    _TESTS = [{ +        'note': 'Full list with pagination', +        'url': 'https://www.youporn.com/category/lingerie/popular/', +        'info_dict': { +            'id': 'lingerie/popular', +            'title': 'Category lingerie videos by popular', +        }, +        'playlist_mincount': 39, +    }, { +        'note': 'Filtered paginated list with single page result', +        'url': 'https://www.youporn.com/category/lingerie/duration/?min_minutes=10', +        'info_dict': { +            'id': 'lingerie/duration/min_minutes=10', +            'title': 'Category lingerie videos by duration (min_minutes=10)', +        }, +        'playlist_maxcount': 30, +    }, { +        'note': 'Single page of full list', +        'url': 'https://www.youporn.com/category/lingerie/popular?page=1', +        'info_dict': { +            'id': 'lingerie/popular/page=1', +            'title': 'Category lingerie videos by popular (page=1)', +        }, +        'playlist_count': 30, +    }] + + +class YouPornChannelIE(YouPornListBase): +    IE_DESC = 'YouPorn channel, with sorting and pagination' +    _VALID_URL = r'''(?x) +        https?://(?:www\.)?youporn\.com/ +        (?P<type>channel)/(?P<id>[^/?#&]+) +        (?:/(?P<sort>rating|views|duration))?/?(?:[#?]|$) +    ''' +    _TESTS = [{ +        'note': 'Full list with pagination', +        'url': 'https://www.youporn.com/channel/x-feeds/', +        'info_dict': { +            'id': 'x-feeds', +            'title': 'Channel X-Feeds videos', +        }, +        'playlist_mincount': 37, +    }, { +        'note': 'Single page of full list (no filters here)', +        'url': 'https://www.youporn.com/channel/x-feeds/duration?page=1', +        'info_dict': { +            'id': 'x-feeds/duration/page=1', +            'title': 'Channel X-Feeds videos by duration (page=1)', +        }, +        'playlist_count': 24, +    }] + +    @staticmethod +    def _get_title_from_slug(title_slug): +        return re.sub(r'_', ' ', title_slug).title() + + +class YouPornCollectionIE(YouPornListBase): +    IE_DESC = 'YouPorn collection (user playlist), with sorting and pagination' +    _VALID_URL = r'''(?x) +        https?://(?:www\.)?youporn\.com/ +        (?P<type>collection)s/videos/(?P<id>\d+) +        (?:/(?P<sort>rating|views|time|duration))?/?(?:[#?]|$) +    ''' +    _PLAYLIST_TITLEBAR_RE = r'^\s*Videos\s+in\s' +    _TESTS = [{ +        'note': 'Full list with pagination', +        'url': 'https://www.youporn.com/collections/videos/33044251/', +        'info_dict': { +            'id': '33044251', +            'title': 'Collection Sexy Lips videos', +            'uploader': 'ph-littlewillyb', +        }, +        'playlist_mincount': 50, +    }, { +        'note': 'Single page of full list (no filters here)', +        'url': 'https://www.youporn.com/collections/videos/33044251/time?page=1', +        'info_dict': { +            'id': '33044251/time/page=1', +            'title': 'Collection Sexy Lips videos by time (page=1)', +            'uploader': 'ph-littlewillyb', +        }, +        'playlist_count': 20, +    }] + +    def _real_extract(self, url): +        pl_id = self._match_id(url) +        html = self._download_webpage(url, pl_id) +        playlist = super(YouPornCollectionIE, self)._real_extract(url, html=html) +        infos = re.sub(r'\s+', ' ', clean_html(get_element_by_class( +            'collection-infos', html)) or '') +        title, uploader = self._search_regex( +            r'^\s*Collection: (?P<title>.+?) \d+ VIDEOS \d+ VIEWS \d+ days LAST UPDATED From: (?P<uploader>[\w_-]+)', +            infos, 'title/uploader', group=('title', 'uploader'), default=(None, None)) + +        return merge_dicts({ +            'title': playlist['title'].replace(playlist['id'].split('/')[0], title), +            'uploader': uploader, +        }, playlist) if title else playlist + + +class YouPornTagIE(YouPornListBase): +    IE_DESC = 'YouPorn tag (porntags), with sorting, filtering and pagination' +    _VALID_URL = r'''(?x) +        https?://(?:www\.)?youporn\.com/ +        porn(?P<type>tag)s/(?P<id>[^/?#&]+) +        (?:/(?P<sort>views|rating|time|duration))?/?(?:[#?]|$) +    ''' +    _PLAYLIST_TITLEBAR_RE = r'^\s*Videos\s+tagged\s' +    _PAGE_RETRY_COUNT = 1 +    _TESTS = [{ +        'note': 'Full list with pagination', +        'url': 'https://www.youporn.com/porntags/austrian', +        'info_dict': { +            'id': 'austrian', +            'title': 'Tag austrian videos', +        }, +        'playlist_mincount': 35, +        'expected_warnings': ['Retrying duplicate page'], +    }, { +        'note': 'Filtered paginated list with single page result', +        'url': 'https://www.youporn.com/porntags/austrian/duration/?min_minutes=10', +        'info_dict': { +            'id': 'austrian/duration/min_minutes=10', +            'title': 'Tag austrian videos by duration (min_minutes=10)', +        }, +        # number of videos per page is (row x col) 2x3 + 6x4 + 2, or + 3, +        # or more, varying with number of ads; let's set max as 9x4 +        # NB col 1 may not be shown in non-JS page with site CSS and zoom 100% +        'playlist_maxcount': 32, +        'expected_warnings': ['Retrying duplicate page', 'Retrying empty page'], +    }, { +        'note': 'Single page of full list', +        'url': 'https://www.youporn.com/porntags/austrian/?page=1', +        'info_dict': { +            'id': 'austrian/page=1', +            'title': 'Tag austrian videos (page=1)', +        }, +        'playlist_mincount': 32, +        'playlist_maxcount': 34, +        'expected_warnings': ['Retrying duplicate page', 'Retrying empty page'], +    }] + +    # YP tag navigation is broken, loses sort +    def _get_next_url(self, url, pl_id, html): +        next_url = super(YouPornTagIE, self)._get_next_url(url, pl_id, html) +        if next_url: +            n = self._match_valid_url(next_url) +            if n: +                s = n.groupdict().get('sort') +            if s: +                u = self._match_valid_url(url) +                if u: +                    u = u.groupdict().get('sort') +                    if s and not u: +                        n = n.end('sort') +                        next_url = next_url[:n] + '/' + u + next_url[n:] +        return next_url + + +class YouPornStarIE(YouPornListBase): +    IE_DESC = 'YouPorn Pornstar, with description, sorting and pagination' +    _VALID_URL = r'''(?x) +        https?://(?:www\.)?youporn\.com/ +        (?P<type>pornstar)/(?P<id>[^/?#&]+) +        (?:/(?P<sort>rating|views|duration))?/?(?:[#?]|$) +    ''' +    _PLAYLIST_TITLEBAR_RE = r'^\s*Videos\s+[fF]eaturing\s' +    _TESTS = [{ +        'note': 'Full list with pagination', +        'url': 'https://www.youporn.com/pornstar/daynia/', +        'info_dict': { +            'id': 'daynia', +            'title': 'Pornstar Daynia videos', +            'description': r're:Daynia Rank \d+ Videos \d+ Views [\d,.]+ .+ Subscribers \d+', +        }, +        'playlist_mincount': 45, +    }, { +        'note': 'Single page of full list (no filters here)', +        'url': 'https://www.youporn.com/pornstar/daynia/?page=1', +        'info_dict': { +            'id': 'daynia/page=1', +            'title': 'Pornstar Daynia videos (page=1)', +            'description': 're:.{180,}', +        }, +        'playlist_count': 26, +    }] + +    @staticmethod +    def _get_title_from_slug(title_slug): +        return re.sub(r'_', ' ', title_slug).title() + +    def _real_extract(self, url): +        pl_id = self._match_id(url) +        html = self._download_webpage(url, pl_id) +        playlist = super(YouPornStarIE, self)._real_extract(url, html=html) +        INFO_ELEMENT_RE = r'''(?x) +            <div\s[^>]*\bclass\s*=\s*('|")(?:[\w$-]+\s+|\s)*?pornstar-info-wrapper(?:\s+[\w$-]+|\s)*\1[^>]*> +            (?P<info>[\s\S]+?)(?:</div>\s*){6,} +        ''' + +        infos = self._search_regex(INFO_ELEMENT_RE, html, 'infos', group='info', default='') +        if infos: +            infos = re.sub( +                r'(?:\s*nl=nl)+\s*', ' ', +                re.sub(r'(?u)\s+', ' ', clean_html( +                    re.sub('\n', 'nl=nl', infos)))).replace('ribe Subsc', '') + +        return merge_dicts({ +            'description': infos.strip() or None, +        }, playlist) + + +class YouPornVideosIE(YouPornListBase): +    IE_DESC = 'YouPorn video (browse) playlists, with sorting, filtering and pagination' +    _VALID_URL = r'''(?x) +        https?://(?:www\.)?youporn\.com/ +            (?:(?P<id>browse)/)? +            (?P<sort>(?(id) +                (?:duration|rating|time|views)| +                (?:most_(?:favou?rit|view)ed|recommended|top_rated)?)) +            (?:[/#?]|$) +    ''' +    _PLAYLIST_TITLEBAR_RE = r'\s+(?:[Vv]ideos|VIDEOS)\s*$' +    _TESTS = [{ +        'note': 'Full list with pagination (too long for test)', +        'url': 'https://www.youporn.com/', +        'info_dict': { +            'id': 'youporn', +            'title': 'YouPorn videos', +        }, +        'only_matching': True, +    }, { +        'note': 'Full list with pagination (too long for test)', +        'url': 'https://www.youporn.com/recommended', +        'info_dict': { +            'id': 'youporn/recommended', +            'title': 'YouPorn videos by recommended', +        }, +        'only_matching': True, +    }, { +        'note': 'Full list with pagination (too long for test)', +        'url': 'https://www.youporn.com/top_rated', +        'info_dict': { +            'id': 'youporn/top_rated', +            'title': 'YouPorn videos by top rated', +        }, +        'only_matching': True, +    }, { +        'note': 'Full list with pagination (too long for test)', +        'url': 'https://www.youporn.com/browse/time', +        'info_dict': { +            'id': 'browse/time', +            'title': 'YouPorn videos by time', +        }, +        'only_matching': True, +    }, { +        'note': 'Filtered paginated list with single page result', +        'url': 'https://www.youporn.com/most_favorited/?res=VR&max_minutes=2', +        'info_dict': { +            'id': 'youporn/most_favorited/max_minutes=2/res=VR', +            'title': 'YouPorn videos by most favorited (max_minutes=2,res=VR)', +        }, +        'playlist_mincount': 10, +        'playlist_maxcount': 28, +    }, { +        'note': 'Filtered paginated list with several pages', +        'url': 'https://www.youporn.com/most_favorited/?res=VR&max_minutes=5', +        'info_dict': { +            'id': 'youporn/most_favorited/max_minutes=5/res=VR', +            'title': 'YouPorn videos by most favorited (max_minutes=5,res=VR)', +        }, +        'playlist_mincount': 45, +    }, { +        'note': 'Single page of full list', +        'url': 'https://www.youporn.com/browse/time?page=1', +        'info_dict': { +            'id': 'browse/time/page=1', +            'title': 'YouPorn videos by time (page=1)', +        }, +        'playlist_count': 36, +    }] + +    @staticmethod +    def _get_title_from_slug(title_slug): +        return 'YouPorn' if title_slug == 'browse' else title_slug | 
