diff options
| author | Throaway <Throaway@null.com> | 2017-03-20 16:29:39 -0700 | 
|---|---|---|
| committer | Sergey M․ <dstftw@gmail.com> | 2017-03-22 01:51:45 +0700 | 
| commit | 21fbf0f955f584ad2d02608850a69a2fd74b65a6 (patch) | |
| tree | 71e5b685374332428863568aff8d6e4e29031cc6 | |
| parent | 97952bdb78854bf09c688eb535dc7b67265934c1 (diff) | |
[pornhub] Decode obfuscated video URL (closes #12470)
| -rw-r--r-- | youtube_dl/extractor/pornhub.py | 37 | 
1 file changed, 34 insertions, 3 deletions
```diff
diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py
index 9b413590a..eb316ad14 100644
--- a/youtube_dl/extractor/pornhub.py
+++ b/youtube_dl/extractor/pornhub.py
@@ -1,7 +1,9 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
+import functools
 import itertools
+import operator
 # import os
 import re
 
@@ -129,9 +131,38 @@ class PornHubIE(InfoExtractor):
 
         tv_webpage = dl_webpage('tv')
 
-        video_url = self._search_regex(
-            r'<video[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//.+?)\1', tv_webpage,
-            'video url', group='url')
+        encoded_url = self._search_regex(r'(var.*mediastring.*)</script>',
+            tv_webpage, 'encoded url')
+        assignments = encoded_url.split(";")
+        js_vars = {}
+
+        def parse_js_value(inp):
+            inp = re.sub(r'/\*[^*]*\*/', "", inp)
+
+            if "+" in inp:
+                inps = inp.split("+")
+                return functools.reduce(operator.concat, map(parse_js_value, inps))
+
+            inp = inp.strip()
+            if inp in js_vars:
+                return js_vars[inp]
+
+            # Hope it's a string!
+            assert inp.startswith('"') and inp.endswith('"')
+            return inp[1:-1]
+
+        for assn in assignments:
+            assn = assn.strip()
+            if len(assn) == 0:
+                continue
+
+            assert assn.startswith("var ")
+            assn = assn[4:]
+            vname, value = assn.split("=", 1)
+
+            js_vars[vname] = parse_js_value(value)
+
+        video_url = js_vars["mediastring"]
 
         title = self._search_regex(
             r'<h1>([^>]+)</h1>', tv_webpage, 'title', default=None)
```
