diff options
| author | Sergey M․ <dstftw@gmail.com> | 2017-02-14 23:52:41 +0700 | 
|---|---|---|
| committer | Sergey M․ <dstftw@gmail.com> | 2017-02-14 23:52:41 +0700 | 
| commit | 9a372f14b422de15acf91e25a90375688b2ba3fa (patch) | |
| tree | 838169cd7309ba196a2c17c43dcc9b9bc255ee5e | |
| parent | 5cb2d36c82abf3b753910afe3013b274e31a247a (diff) | |
[pornhub] Extract video URL from tv platform site (#12007, #12129)
| -rw-r--r-- | youtube_dl/extractor/pornhub.py | 44 | 
1 files changed, 30 insertions, 14 deletions
```diff
diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py
index 818d99c1f..7a2737032 100644
--- a/youtube_dl/extractor/pornhub.py
+++ b/youtube_dl/extractor/pornhub.py
@@ -2,27 +2,27 @@
 from __future__ import unicode_literals
 
 import itertools
-import os
+# import os
 import re
 
 from .common import InfoExtractor
 from ..compat import (
     compat_HTTPError,
-    compat_urllib_parse_unquote,
-    compat_urllib_parse_unquote_plus,
-    compat_urllib_parse_urlparse,
+    # compat_urllib_parse_unquote,
+    # compat_urllib_parse_unquote_plus,
+    # compat_urllib_parse_urlparse,
 )
 from ..utils import (
     ExtractorError,
     int_or_none,
     js_to_json,
     orderedSet,
-    sanitized_Request,
+    # sanitized_Request,
     str_to_int,
 )
-from ..aes import (
-    aes_decrypt_text
-)
+# from ..aes import (
+#     aes_decrypt_text
+# )
 
 
 class PornHubIE(InfoExtractor):
@@ -109,10 +109,14 @@ class PornHubIE(InfoExtractor):
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
-        req = sanitized_Request(
-            'http://www.pornhub.com/view_video.php?viewkey=%s' % video_id)
-        req.add_header('Cookie', 'age_verified=1')
-        webpage = self._download_webpage(req, video_id)
+        def dl_webpage(platform):
+            return self._download_webpage(
+                'http://www.pornhub.com/view_video.php?viewkey=%s' % video_id,
+                video_id, headers={
+                    'Cookie': 'age_verified=1; platform=%s' % platform,
+                })
+
+        webpage = dl_webpage('pc')
 
         error_msg = self._html_search_regex(
             r'(?s)<div[^>]+class=(["\'])(?:(?!\1).)*\b(?:removed|userMessageSection)\b(?:(?!\1).)*\1[^>]*>(?P<error>.+?)</div>',
@@ -123,10 +127,19 @@ class PornHubIE(InfoExtractor):
                 'PornHub said: %s' % error_msg,
                 expected=True, video_id=video_id)
 
+        tv_webpage = dl_webpage('tv')
+
+        video_url = self._search_regex(
+            r'<video[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//.+?)\1', tv_webpage,
+            'video url', group='url')
+
+        title = self._search_regex(
+            r'<h1>([^>]+)</h1>', tv_webpage, 'title', default=None)
+
         # video_title from flashvars contains whitespace instead of non-ASCII (see
         # http://www.pornhub.com/view_video.php?viewkey=1331683002), not relying
         # on that anymore.
-        title = self._html_search_meta(
+        title = title or self._html_search_meta(
             'twitter:title', webpage, default=None) or self._search_regex(
             (r'<h1[^>]+class=["\']title["\'][^>]*>(?P<title>[^<]+)',
              r'<div[^>]+data-video-title=(["\'])(?P<title>.+?)\1',
@@ -156,6 +169,7 @@ class PornHubIE(InfoExtractor):
         comment_count = self._extract_count(
             r'All Comments\s*<span>\(([\d,.]+)\)', webpage, 'comment')
 
+        """
         video_variables = {}
         for video_variablename, quote, video_variable in re.findall(
                 r'(player_quality_[0-9]{3,4}p\w+)\s*=\s*(["\'])(.+?)\2;', webpage):
@@ -197,6 +211,7 @@ class PornHubIE(InfoExtractor):
                 'height': height,
             })
         self._sort_formats(formats)
+        """
 
         page_params = self._parse_json(self._search_regex(
             r'page_params\.zoneDetails\[([\'"])[^\'"]+\1\]\s*=\s*(?P<data>{[^}]+})',
@@ -209,6 +224,7 @@ class PornHubIE(InfoExtractor):
 
         return {
             'id': video_id,
+            'url': video_url,
             'uploader': video_uploader,
             'title': title,
             'thumbnail': thumbnail,
@@ -217,7 +233,7 @@ class PornHubIE(InfoExtractor):
             'like_count': like_count,
             'dislike_count': dislike_count,
             'comment_count': comment_count,
-            'formats': formats,
+            # 'formats': formats,
             'age_limit': 18,
             'tags': tags,
             'categories': categories,
```
