Merge branch 'UP/youtube-dl' into dl/YoutubeSearchURLIE

author: pukkandan <pukkandan.ytdlp@gmail.com> 2022-01-30 01:07:28 +0530
committer: pukkandan <pukkandan.ytdlp@gmail.com> 2022-01-30 01:07:28 +0530
commit: a3373da70c97d356bd4927eff403abd261dd8f9f (patch)
tree: fb77c8f297d7ca1bd6a6f3e37a4c0a4b12399db5 /youtube_dl/extractor/pornhub.py
parent: 2c4cb134a90b49a4d44965b57ff43cfd45ec2d69 (diff)
parent: 5014bd67c22b421207b2650d4dc874b95b36dda1 (diff)
download: youtube-dl-a3373da70c97d356bd4927eff403abd261dd8f9f.tar.xz
1 files changed, 197 insertions, 70 deletions
diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py
index 2fcbd186f..e2e1500ff 100644
--- a/youtube_dl/extractor/pornhub.py
+++ b/youtube_dl/extractor/pornhub.py
@@ -22,11 +22,16 @@ from ..utils import (
     orderedSet,
     remove_quotes,
     str_to_int,
+    update_url_query,
+    urlencode_postdata,
     url_or_none,
 )
 
 
 class PornHubBaseIE(InfoExtractor):
+    _NETRC_MACHINE = 'pornhub'
+    _PORNHUB_HOST_RE = r'(?:(?P<host>pornhub(?:premium)?\.(?:com|net|org))|pornhubthbh7ap3u\.onion)'
+
     def _download_webpage_handle(self, *args, **kwargs):
         def dl(*args, **kwargs):
             return super(PornHubBaseIE, self)._download_webpage_handle(*args, **kwargs)
@@ -52,17 +57,79 @@ class PornHubBaseIE(InfoExtractor):
 
         return webpage, urlh
 
+    def _real_initialize(self):
+        self._logged_in = False
+
+    def _login(self, host):
+        if self._logged_in:
+            return
+
+        site = host.split('.')[0]
+
+        # Both sites pornhub and pornhubpremium have separate accounts
+        # so there should be an option to provide credentials for both.
+        # At the same time some videos are available under the same video id
+        # on both sites so that we have to identify them as the same video.
+        # For that purpose we have to keep both in the same extractor
+        # but under different netrc machines.
+        username, password = self._get_login_info(netrc_machine=site)
+        if username is None:
+            return
+
+        login_url = 'https://www.%s/%slogin' % (host, 'premium/' if 'premium' in host else '')
+        login_page = self._download_webpage(
+            login_url, None, 'Downloading %s login page' % site)
+
+        def is_logged(webpage):
+            return any(re.search(p, webpage) for p in (
+                r'class=["\']signOut',
+                r'>Sign\s+[Oo]ut\s*<'))
+
+        if is_logged(login_page):
+            self._logged_in = True
+            return
+
+        login_form = self._hidden_inputs(login_page)
+
+        login_form.update({
+            'username': username,
+            'password': password,
+        })
+
+        response = self._download_json(
+            'https://www.%s/front/authenticate' % host, None,
+            'Logging in to %s' % site,
+            data=urlencode_postdata(login_form),
+            headers={
+                'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
+                'Referer': login_url,
+                'X-Requested-With': 'XMLHttpRequest',
+            })
+
+        if response.get('success') == '1':
+            self._logged_in = True
+            return
+
+        message = response.get('message')
+        if message is not None:
+            raise ExtractorError(
+                'Unable to login: %s' % message, expected=True)
+
+        raise ExtractorError('Unable to log in')
+
 
 class PornHubIE(PornHubBaseIE):
     IE_DESC = 'PornHub and Thumbzilla'
     _VALID_URL = r'''(?x)
                     https?://
                         (?:
-                            (?:[^/]+\.)?(?P<host>pornhub(?:premium)?\.(?:com|net|org))/(?:(?:view_video\.php|video/show)\?viewkey=|embed/)|
+                            (?:[^/]+\.)?
+                            %s
+                            /(?:(?:view_video\.php|video/show)\?viewkey=|embed/)|
                             (?:www\.)?thumbzilla\.com/video/
                         )
                         (?P<id>[\da-z]+)
-                    '''
+                    ''' % PornHubBaseIE._PORNHUB_HOST_RE
     _TESTS = [{
         'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015',
         'md5': 'a6391306d050e4547f62b3f485dd9ba9',
@@ -103,6 +170,7 @@ class PornHubIE(PornHubBaseIE):
         'params': {
             'skip_download': True,
         },
+        'skip': 'Video has been flagged for verification in accordance with our trust and safety policy',
     }, {
         # subtitles
         'url': 'https://www.pornhub.com/view_video.php?viewkey=ph5af5fef7c2aa7',
@@ -163,12 +231,27 @@ class PornHubIE(PornHubBaseIE):
     }, {
         'url': 'https://www.pornhubpremium.com/view_video.php?viewkey=ph5e4acdae54a82',
         'only_matching': True,
+    }, {
+        # Some videos are available with the same id on both premium
+        # and non-premium sites (e.g. this and the following test)
+        'url': 'https://www.pornhub.com/view_video.php?viewkey=ph5f75b0f4b18e3',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.pornhubpremium.com/view_video.php?viewkey=ph5f75b0f4b18e3',
+        'only_matching': True,
+    }, {
+        # geo restricted
+        'url': 'https://www.pornhub.com/view_video.php?viewkey=ph5a9813bfa7156',
+        'only_matching': True,
+    }, {
+        'url': 'http://pornhubthbh7ap3u.onion/view_video.php?viewkey=ph5a9813bfa7156',
+        'only_matching': True,
     }]
 
     @staticmethod
     def _extract_urls(webpage):
         return re.findall(
-            r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?pornhub\.(?:com|net|org)/embed/[\da-z]+)',
+            r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?pornhub(?:premium)?\.(?:com|net|org)/embed/[\da-z]+)',
             webpage)
 
     def _extract_count(self, pattern, webpage, name):
@@ -180,12 +263,7 @@ class PornHubIE(PornHubBaseIE):
         host = mobj.group('host') or 'pornhub.com'
         video_id = mobj.group('id')
 
-        if 'premium' in host:
-            if not self._downloader.params.get('cookiefile'):
-                raise ExtractorError(
-                    'PornHub Premium requires authentication.'
-                    ' You may want to use --cookies.',
-                    expected=True)
+        self._login(host)
 
         self._set_cookie(host, 'age_verified', '1')
 
@@ -198,7 +276,8 @@ class PornHubIE(PornHubBaseIE):
         webpage = dl_webpage('pc')
 
         error_msg = self._html_search_regex(
-            r'(?s)<div[^>]+class=(["\'])(?:(?!\1).)*\b(?:removed|userMessageSection)\b(?:(?!\1).)*\1[^>]*>(?P<error>.+?)</div>',
+            (r'(?s)<div[^>]+class=(["\'])(?:(?!\1).)*\b(?:removed|userMessageSection)\b(?:(?!\1).)*\1[^>]*>(?P<error>.+?)</div>',
+             r'(?s)<section[^>]+class=["\']noVideo["\'][^>]*>(?P<error>.+?)</section>'),
             webpage, 'error message', default=None, group='error')
         if error_msg:
             error_msg = re.sub(r'\s+', ' ', error_msg)
@@ -206,6 +285,11 @@ class PornHubIE(PornHubBaseIE):
                 'PornHub said: %s' % error_msg,
                 expected=True, video_id=video_id)
 
+        if any(re.search(p, webpage) for p in (
+                r'class=["\']geoBlocked["\']',
+                r'>\s*This content is unavailable in your country')):
+            self.raise_geo_restricted()
+
         # video_title from flashvars contains whitespace instead of non-ASCII (see
         # http://www.pornhub.com/view_video.php?viewkey=1331683002), not relying
         # on that anymore.
@@ -327,35 +411,49 @@ class PornHubIE(PornHubBaseIE):
 
         upload_date = None
         formats = []
-        for video_url, height in video_urls:
-            if not upload_date:
-                upload_date = self._search_regex(
-                    r'/(\d{6}/\d{2})/', video_url, 'upload data', default=None)
-                if upload_date:
-                    upload_date = upload_date.replace('/', '')
-            ext = determine_ext(video_url)
+
+        def add_format(format_url, height=None):
+            ext = determine_ext(format_url)
             if ext == 'mpd':
                 formats.extend(self._extract_mpd_formats(
-                    video_url, video_id, mpd_id='dash', fatal=False))
-                continue
-            elif ext == 'm3u8':
+                    format_url, video_id, mpd_id='dash', fatal=False))
+                return
+            if ext == 'm3u8':
                 formats.extend(self._extract_m3u8_formats(
-                    video_url, video_id, 'mp4', entry_protocol='m3u8_native',
+                    format_url, video_id, 'mp4', entry_protocol='m3u8_native',
                     m3u8_id='hls', fatal=False))
-                continue
-            tbr = None
-            mobj = re.search(r'(?P<height>\d+)[pP]?_(?P<tbr>\d+)[kK]', video_url)
-            if mobj:
-                if not height:
-                    height = int(mobj.group('height'))
-                tbr = int(mobj.group('tbr'))
+                return
+            if not height:
+                height = int_or_none(self._search_regex(
+                    r'(?P<height>\d+)[pP]?_\d+[kK]', format_url, 'height',
+                    default=None))
             formats.append({
-                'url': video_url,
+                'url': format_url,
                 'format_id': '%dp' % height if height else None,
                 'height': height,
-                'tbr': tbr,
             })
-        self._sort_formats(formats)
+
+        for video_url, height in video_urls:
+            if not upload_date:
+                upload_date = self._search_regex(
+                    r'/(\d{6}/\d{2})/', video_url, 'upload data', default=None)
+                if upload_date:
+                    upload_date = upload_date.replace('/', '')
+            if '/video/get_media' in video_url:
+                medias = self._download_json(video_url, video_id, fatal=False)
+                if isinstance(medias, list):
+                    for media in medias:
+                        if not isinstance(media, dict):
+                            continue
+                        video_url = url_or_none(media.get('videoUrl'))
+                        if not video_url:
+                            continue
+                        height = int_or_none(media.get('quality'))
+                        add_format(video_url, height)
+                continue
+            add_format(video_url)
+        self._sort_formats(
+            formats, field_preference=('height', 'width', 'fps', 'format_id'))
 
         video_uploader = self._html_search_regex(
             r'(?s)From:&nbsp;.+?<(?:a\b[^>]+\bhref=["\']/(?:(?:user|channel)s|model|pornstar)/|span\b[^>]+\bclass=["\']username)[^>]+>(.+?)<',
@@ -405,6 +503,10 @@ class PornHubIE(PornHubBaseIE):
 
 
 class PornHubPlaylistBaseIE(PornHubBaseIE):
+    def _extract_page(self, url):
+        return int_or_none(self._search_regex(
+            r'\bpage=(\d+)', url, 'page', default=None))
+
     def _extract_entries(self, webpage, host):
         # Only process container div with main playlist content skipping
         # drop-down menu that uses similar pattern for videos (see
@@ -422,29 +524,9 @@ class PornHubPlaylistBaseIE(PornHubBaseIE):
                 container))
         ]
 
-    def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        host = mobj.group('host')
-        playlist_id = mobj.group('id')
-
-        webpage = self._download_webpage(url, playlist_id)
-
-        entries = self._extract_entries(webpage, host)
-
-        playlist = self._parse_json(
-            self._search_regex(
-                r'(?:playlistObject|PLAYLIST_VIEW)\s*=\s*({.+?});', webpage,
-                'playlist', default='{}'),
-            playlist_id, fatal=False)
-        title = playlist.get('title') or self._search_regex(
-            r'>Videos\s+in\s+(.+?)\s+[Pp]laylist<', webpage, 'title', fatal=False)
-
-        return self.playlist_result(
-            entries, playlist_id, title, playlist.get('description'))
-
 
 class PornHubUserIE(PornHubPlaylistBaseIE):
-    _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?(?P<host>pornhub(?:premium)?\.(?:com|net|org))/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/?#&]+))(?:[?#&]|/(?!videos)|$)'
+    _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?%s/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/?#&]+))(?:[?#&]|/(?!videos)|$)' % PornHubBaseIE._PORNHUB_HOST_RE
     _TESTS = [{
         'url': 'https://www.pornhub.com/model/zoe_ph',
         'playlist_mincount': 118,
@@ -463,14 +545,30 @@ class PornHubUserIE(PornHubPlaylistBaseIE):
     }, {
         'url': 'https://www.pornhub.com/model/zoe_ph?abc=1',
         'only_matching': True,
+    }, {
+        # Unavailable via /videos page, but available with direct pagination
+        # on pornstar page (see [1]), requires premium
+        # 1. https://github.com/ytdl-org/youtube-dl/issues/27853
+        'url': 'https://www.pornhubpremium.com/pornstar/sienna-west',
+        'only_matching': True,
+    }, {
+        # Same as before, multi page
+        'url': 'https://www.pornhubpremium.com/pornstar/lily-labeau',
+        'only_matching': True,
+    }, {
+        'url': 'https://pornhubthbh7ap3u.onion/model/zoe_ph',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         user_id = mobj.group('id')
+        videos_url = '%s/videos' % mobj.group('url')
+        page = self._extract_page(url)
+        if page:
+            videos_url = update_url_query(videos_url, {'page': page})
         return self.url_result(
-            '%s/videos' % mobj.group('url'), ie=PornHubPagedVideoListIE.ie_key(),
-            video_id=user_id)
+            videos_url, ie=PornHubPagedVideoListIE.ie_key(), video_id=user_id)
 
 
 class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE):
@@ -483,36 +581,59 @@ class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE):
                 <button[^>]+\bid=["\']moreDataBtn
             ''', webpage) is not None
 
-    def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        host = mobj.group('host')
-        item_id = mobj.group('id')
+    def _entries(self, url, host, item_id):
+        page = self._extract_page(url)
 
-        page = int_or_none(self._search_regex(
-            r'\bpage=(\d+)', url, 'page', default=None))
+        VIDEOS = '/videos'
 
-        entries = []
-        for page_num in (page, ) if page is not None else itertools.count(1):
+        def download_page(base_url, num, fallback=False):
+            note = 'Downloading page %d%s' % (num, ' (switch to fallback)' if fallback else '')
+            return self._download_webpage(
+                base_url, item_id, note, query={'page': num})
+
+        def is_404(e):
+            return isinstance(e.cause, compat_HTTPError) and e.cause.code == 404
+
+        base_url = url
+        has_page = page is not None
+        first_page = page if has_page else 1
+        for page_num in (first_page, ) if has_page else itertools.count(first_page):
             try:
-                webpage = self._download_webpage(
-                    url, item_id, 'Downloading page %d' % page_num,
-                    query={'page': page_num})
+                try:
+                    webpage = download_page(base_url, page_num)
+                except ExtractorError as e:
+                    # Some sources may not be available via /videos page,
+                    # trying to fallback to main page pagination (see [1])
+                    # 1. https://github.com/ytdl-org/youtube-dl/issues/27853
+                    if is_404(e) and page_num == first_page and VIDEOS in base_url:
+                        base_url = base_url.replace(VIDEOS, '')
+                        webpage = download_page(base_url, page_num, fallback=True)
+                    else:
+                        raise
             except ExtractorError as e:
-                if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404:
+                if is_404(e) and page_num != first_page:
                     break
                 raise
             page_entries = self._extract_entries(webpage, host)
             if not page_entries:
                 break
-            entries.extend(page_entries)
+            for e in page_entries:
+                yield e
             if not self._has_more(webpage):
                 break
 
-        return self.playlist_result(orderedSet(entries), item_id)
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        host = mobj.group('host')
+        item_id = mobj.group('id')
+
+        self._login(host)
+
+        return self.playlist_result(self._entries(url, host, item_id), item_id)
 
 
 class PornHubPagedVideoListIE(PornHubPagedPlaylistBaseIE):
-    _VALID_URL = r'https?://(?:[^/]+\.)?(?P<host>pornhub(?:premium)?\.(?:com|net|org))/(?P<id>(?:[^/]+/)*[^/?#&]+)'
+    _VALID_URL = r'https?://(?:[^/]+\.)?%s/(?P<id>(?:[^/]+/)*[^/?#&]+)' % PornHubBaseIE._PORNHUB_HOST_RE
     _TESTS = [{
         'url': 'https://www.pornhub.com/model/zoe_ph/videos',
         'only_matching': True,
@@ -617,6 +738,9 @@ class PornHubPagedVideoListIE(PornHubPagedPlaylistBaseIE):
     }, {
         'url': 'https://de.pornhub.com/playlist/4667351',
         'only_matching': True,
+    }, {
+        'url': 'https://pornhubthbh7ap3u.onion/model/zoe_ph/videos',
+        'only_matching': True,
     }]
 
     @classmethod
@@ -627,7 +751,7 @@ class PornHubPagedVideoListIE(PornHubPagedPlaylistBaseIE):
 
 
 class PornHubUserVideosUploadIE(PornHubPagedPlaylistBaseIE):
-    _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?(?P<host>pornhub(?:premium)?\.(?:com|net|org))/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/]+)/videos/upload)'
+    _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?%s/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/]+)/videos/upload)' % PornHubBaseIE._PORNHUB_HOST_RE
     _TESTS = [{
         'url': 'https://www.pornhub.com/pornstar/jenny-blighe/videos/upload',
         'info_dict': {
@@ -637,4 +761,7 @@ class PornHubUserVideosUploadIE(PornHubPagedPlaylistBaseIE):
     }, {
         'url': 'https://www.pornhub.com/model/zoe_ph/videos/upload',
         'only_matching': True,
+    }, {
+        'url': 'http://pornhubthbh7ap3u.onion/pornstar/jenny-blighe/videos/upload',
+        'only_matching': True,
     }]
author	pukkandan <pukkandan.ytdlp@gmail.com>	2022-01-30 01:07:28 +0530
committer	pukkandan <pukkandan.ytdlp@gmail.com>	2022-01-30 01:07:28 +0530
commit	a3373da70c97d356bd4927eff403abd261dd8f9f (patch)
tree	fb77c8f297d7ca1bd6a6f3e37a4c0a4b12399db5 /youtube_dl/extractor/pornhub.py
parent	2c4cb134a90b49a4d44965b57ff43cfd45ec2d69 (diff)
parent	5014bd67c22b421207b2650d4dc874b95b36dda1 (diff)
download	youtube-dl-a3373da70c97d356bd4927eff403abd261dd8f9f.tar.xz