diff options
| author | Sergey M․ <dstftw@gmail.com> | 2019-01-23 04:12:06 +0700 | 
|---|---|---|
| committer | Sergey M․ <dstftw@gmail.com> | 2019-01-23 04:12:06 +0700 | 
| commit | 71a1f61700789fb0d61fc6ad9681b6f0899d2f51 (patch) | |
| tree | 0b63ed4378dfeff18effd359a13a509ad49f8a84 | |
| parent | 6510a3aa971c00525969040ad654249c0c73f125 (diff) | |
[pornhub] Apply scrape detection bypass for all extractors
| -rw-r--r-- | youtube_dl/extractor/pornhub.py | 46 | 
1 file changed, 24 insertions, 22 deletions
```diff
diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py
index f5f3e6593..be93d5d48 100644
--- a/youtube_dl/extractor/pornhub.py
+++ b/youtube_dl/extractor/pornhub.py
@@ -24,7 +24,29 @@ from ..utils import (
 )
 
 
-class PornHubIE(InfoExtractor):
+class PornHubBaseIE(InfoExtractor):
+    def _download_webpage_handle(self, *args, **kwargs):
+        def dl(*args, **kwargs):
+            return super(PornHubBaseIE, self)._download_webpage_handle(*args, **kwargs)
+
+        webpage, urlh = dl(*args, **kwargs)
+
+        if any(re.search(p, webpage) for p in (
+                r'<body\b[^>]+\bonload=["\']go\(\)',
+                r'document\.cookie\s*=\s*["\']RNKEY=',
+                r'document\.location\.reload\(true\)')):
+            url_or_request = args[0]
+            url = (url_or_request.get_full_url()
+                   if isinstance(url_or_request, compat_urllib_request.Request)
+                   else url_or_request)
+            phantom = PhantomJSwrapper(self, required_version='2.0')
+            phantom.get(url, html=webpage)
+            webpage, urlh = dl(*args, **kwargs)
+
+        return webpage, urlh
+
+
+class PornHubIE(PornHubBaseIE):
     IE_DESC = 'PornHub and Thumbzilla'
     _VALID_URL = r'''(?x)
                     https?://
@@ -128,26 +150,6 @@ class PornHubIE(InfoExtractor):
         'only_matching': True,
     }]
 
-    def _download_webpage_handle(self, *args, **kwargs):
-        def dl(*args, **kwargs):
-            return super(PornHubIE, self)._download_webpage_handle(*args, **kwargs)
-
-        webpage, urlh = dl(*args, **kwargs)
-
-        if any(re.search(p, webpage) for p in (
-                r'<body\b[^>]+\bonload=["\']go\(\)',
-                r'document\.cookie\s*=\s*["\']RNKEY=',
-                r'document\.location\.reload\(true\)')):
-            url_or_request = args[0]
-            url = (url_or_request.get_full_url()
-                   if isinstance(url_or_request, compat_urllib_request.Request)
-                   else url_or_request)
-            phantom = PhantomJSwrapper(self, required_version='2.0')
-            phantom.get(url, html=webpage)
-            webpage, urlh = dl(*args, **kwargs)
-
-        return webpage, urlh
-
     @staticmethod
     def _extract_urls(webpage):
         return re.findall(
@@ -329,7 +331,7 @@ class PornHubIE(InfoExtractor):
         }
 
 
-class PornHubPlaylistBaseIE(InfoExtractor):
+class PornHubPlaylistBaseIE(PornHubBaseIE):
     def _extract_entries(self, webpage, host):
         # Only process container div with main playlist content skipping
         # drop-down menu that uses similar pattern for videos (see
```
