diff options
author | Sergey M․ <dstftw@gmail.com> | 2019-01-23 03:51:29 +0700 |
---|---|---|
committer | Sergey M․ <dstftw@gmail.com> | 2019-01-23 03:51:29 +0700 |
commit | 278d061a0c5eae20963c0a6df4b9b13fd1537186 (patch) | |
tree | c85679efac405ec39ea5944993fce07c67ad4c6d | |
parent | 503b604a316837b9dd6ef32045e4e9bbfb6a1363 (diff) |
[pornhub] Bypass scrape detection (closes #5930)
-rw-r--r-- | youtube_dl/extractor/pornhub.py | 22 |
1 files changed, 22 insertions, 0 deletions
```diff
diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py
index e377de196..f5f3e6593 100644
--- a/youtube_dl/extractor/pornhub.py
+++ b/youtube_dl/extractor/pornhub.py
@@ -10,7 +10,9 @@ from .common import InfoExtractor
 from ..compat import (
     compat_HTTPError,
     compat_str,
+    compat_urllib_request,
 )
+from .openload import PhantomJSwrapper
 from ..utils import (
     ExtractorError,
     int_or_none,
@@ -126,6 +128,26 @@ class PornHubIE(InfoExtractor):
         'only_matching': True,
     }]
 
+    def _download_webpage_handle(self, *args, **kwargs):
+        def dl(*args, **kwargs):
+            return super(PornHubIE, self)._download_webpage_handle(*args, **kwargs)
+
+        webpage, urlh = dl(*args, **kwargs)
+
+        if any(re.search(p, webpage) for p in (
+                r'<body\b[^>]+\bonload=["\']go\(\)',
+                r'document\.cookie\s*=\s*["\']RNKEY=',
+                r'document\.location\.reload\(true\)')):
+            url_or_request = args[0]
+            url = (url_or_request.get_full_url()
+                   if isinstance(url_or_request, compat_urllib_request.Request)
+                   else url_or_request)
+            phantom = PhantomJSwrapper(self, required_version='2.0')
+            phantom.get(url, html=webpage)
+            webpage, urlh = dl(*args, **kwargs)
+
+        return webpage, urlh
+
     @staticmethod
     def _extract_urls(webpage):
         return re.findall(
```