[pornhub] Fix extraction and add support for m3u8 formats (closes #22749, closes #23082)

author: Sergey M․ <dstftw@gmail.com> 2019-12-31 23:29:06 +0700
committer: Sergey M․ <dstftw@gmail.com> 2019-12-31 23:29:06 +0700
commit: f41347260c2c2cf723bc2bb8a5c11f67a22175d5 (patch)
tree: 03ab47e00b46b1cdef117870fb1e693efa353349
parent: 060680874654e77cfd03d150a834b58213379c8c (diff)
1 files changed, 37 insertions, 10 deletions
diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py
index ba0ad7da2..75ed69cde 100644
--- a/youtube_dl/extractor/pornhub.py
+++ b/youtube_dl/extractor/pornhub.py
@@ -227,12 +227,13 @@ class PornHubIE(PornHubBaseIE):
         else:
             thumbnail, duration = [None] * 2
 
-        if not video_urls:
-            tv_webpage = dl_webpage('tv')
-
+        def extract_js_vars(webpage, pattern, fatal=True):
             assignments = self._search_regex(
-                r'(var.+?mediastring.+?)</script>', tv_webpage,
-                'encoded url').split(';')
+                pattern, webpage, 'encoded url', fatal=fatal)
+            if not assignments:
+                return {}
+
+            assignments = assignments.split(';')
 
             js_vars = {}
 
@@ -254,11 +255,31 @@ class PornHubIE(PornHubBaseIE):
                 assn = re.sub(r'var\s+', '', assn)
                 vname, value = assn.split('=', 1)
                 js_vars[vname] = parse_js_value(value)
+            return js_vars
 
-            video_url = js_vars['mediastring']
-            if video_url not in video_urls_set:
-                video_urls.append((video_url, None))
-                video_urls_set.add(video_url)
+        def add_video_url(video_url):
+            v_url = url_or_none(video_url)
+            if not v_url:
+                return
+            if v_url in video_urls_set:
+                return
+            video_urls.append((v_url, None))
+            video_urls_set.add(v_url)
+
+        if not video_urls:
+            FORMAT_PREFIXES = ('media', 'quality')
+            js_vars = extract_js_vars(
+                webpage, r'(var\s+(?:%s)_.+)' % '|'.join(FORMAT_PREFIXES),
+                fatal=False)
+            if js_vars:
+                for key, format_url in js_vars.items():
+                    if any(key.startswith(p) for p in FORMAT_PREFIXES):
+                        add_video_url(format_url)
+
+        if not video_urls:
+            js_vars = extract_js_vars(
+                dl_webpage('tv'), r'(var.+?mediastring.+?)</script>')
+            add_video_url(js_vars['mediastring'])
 
         for mobj in re.finditer(
                 r'<a[^>]+\bclass=["\']downloadBtn\b[^>]+\bhref=(["\'])(?P<url>(?:(?!\1).)+)\1',
@@ -276,10 +297,16 @@ class PornHubIE(PornHubBaseIE):
                     r'/(\d{6}/\d{2})/', video_url, 'upload data', default=None)
                 if upload_date:
                     upload_date = upload_date.replace('/', '')
-            if determine_ext(video_url) == 'mpd':
+            ext = determine_ext(video_url)
+            if ext == 'mpd':
                 formats.extend(self._extract_mpd_formats(
                     video_url, video_id, mpd_id='dash', fatal=False))
                 continue
+            elif ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    video_url, video_id, 'mp4', entry_protocol='m3u8_native',
+                    m3u8_id='hls', fatal=False))
+                continue
             tbr = None
             mobj = re.search(r'(?P<height>\d+)[pP]?_(?P<tbr>\d+)[kK]', video_url)
             if mobj:
author	Sergey M․ <dstftw@gmail.com>	2019-12-31 23:29:06 +0700
committer	Sergey M․ <dstftw@gmail.com>	2019-12-31 23:29:06 +0700
commit	f41347260c2c2cf723bc2bb8a5c11f67a22175d5 (patch)
tree	03ab47e00b46b1cdef117870fb1e693efa353349
parent	060680874654e77cfd03d150a834b58213379c8c (diff)