diff options
author | Throaway <Throaway@null.com> | 2017-03-20 16:29:39 -0700 |
---|---|---|
committer | Sergey M․ <dstftw@gmail.com> | 2017-03-22 01:51:45 +0700 |
commit | 21fbf0f955f584ad2d02608850a69a2fd74b65a6 (patch) | |
tree | 71e5b685374332428863568aff8d6e4e29031cc6 | |
parent | 97952bdb78854bf09c688eb535dc7b67265934c1 (diff) |
[pornhub] Decode obfuscated video URL (closes #12470)
-rw-r--r-- | youtube_dl/extractor/pornhub.py | 37 |
1 files changed, 34 insertions, 3 deletions
diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 9b413590a..eb316ad14 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -1,7 +1,9 @@ # coding: utf-8 from __future__ import unicode_literals +import functools import itertools +import operator # import os import re @@ -129,9 +131,38 @@ class PornHubIE(InfoExtractor): tv_webpage = dl_webpage('tv') - video_url = self._search_regex( - r'<video[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//.+?)\1', tv_webpage, - 'video url', group='url') + encoded_url = self._search_regex(r'(var.*mediastring.*)</script>', + tv_webpage, 'encoded url') + assignments = encoded_url.split(";") + js_vars = {} + + def parse_js_value(inp): + inp = re.sub(r'/\*[^*]*\*/', "", inp) + + if "+" in inp: + inps = inp.split("+") + return functools.reduce(operator.concat, map(parse_js_value, inps)) + + inp = inp.strip() + if inp in js_vars: + return js_vars[inp] + + # Hope it's a string! + assert inp.startswith('"') and inp.endswith('"') + return inp[1:-1] + + for assn in assignments: + assn = assn.strip() + if len(assn) == 0: + continue + + assert assn.startswith("var ") + assn = assn[4:] + vname, value = assn.split("=", 1) + + js_vars[vname] = parse_js_value(value) + + video_url = js_vars["mediastring"] title = self._search_regex( r'<h1>([^>]+)</h1>', tv_webpage, 'title', default=None) |