diff options
author | Sergey M․ <dstftw@gmail.com> | 2014-03-22 21:29:01 +0700 |
---|---|---|
committer | Sergey M․ <dstftw@gmail.com> | 2014-03-22 21:30:22 +0700 |
commit | 0320ddc19251eb70e5351edf4788d2979872161c (patch) | |
tree | 4ee85c39337d17d99ad01c19408b5570dc92dc6d /youtube_dl | |
parent | 56dd55721c9fd818f1cfece305eda7910de39430 (diff) |
[pornhub] Fix uploader extraction and extract counts
Diffstat (limited to 'youtube_dl')
-rw-r--r-- | youtube_dl/extractor/pornhub.py | 21 |
1 files changed, 20 insertions, 1 deletions
```diff
diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py
index 834fe7266..7dd3dca0d 100644
--- a/youtube_dl/extractor/pornhub.py
+++ b/youtube_dl/extractor/pornhub.py
@@ -8,6 +8,7 @@ from ..utils import (
     compat_urllib_parse_urlparse,
     compat_urllib_request,
     compat_urllib_parse,
+    str_to_int,
 )
 from ..aes import (
     aes_decrypt_text
@@ -27,6 +28,12 @@ class PornHubIE(InfoExtractor):
         }
     }
 
+    def _extract_count(self, pattern, webpage, name):
+        count = self._html_search_regex(pattern, webpage, '%s count' % name, fatal=False)
+        if count:
+            count = str_to_int(count)
+        return count
+
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         video_id = mobj.group('videoid')
@@ -37,11 +44,19 @@ class PornHubIE(InfoExtractor):
         webpage = self._download_webpage(req, video_id)
 
         video_title = self._html_search_regex(r'<h1 [^>]+>([^<]+)', webpage, 'title')
-        video_uploader = self._html_search_regex(r'<b>From: </b>(?:\s|<[^>]*>)*(.+?)<', webpage, 'uploader', fatal=False)
+        video_uploader = self._html_search_regex(
+            r'(?s)<div class="video-info-row">\s*From: .+?<(?:a href="/users/|<span class="username)[^>]+>(.+?)<',
+            webpage, 'uploader', fatal=False)
         thumbnail = self._html_search_regex(r'"image_url":"([^"]+)', webpage, 'thumbnail', fatal=False)
         if thumbnail:
             thumbnail = compat_urllib_parse.unquote(thumbnail)
 
+        view_count = self._extract_count(r'<span class="count">([\d,\.]+)</span> views', webpage, 'view')
+        like_count = self._extract_count(r'<span class="votesUp">([\d,\.]+)</span>', webpage, 'like')
+        dislike_count = self._extract_count(r'<span class="votesDown">([\d,\.]+)</span>', webpage, 'dislike')
+        comment_count = self._extract_count(
+            r'All comments \(<var class="videoCommentCount">([\d,\.]+)</var>', webpage, 'comment')
+
         video_urls = list(map(compat_urllib_parse.unquote , re.findall(r'"quality_[0-9]{3}p":"([^"]+)', webpage)))
         if webpage.find('"encrypted":true') != -1:
             password = compat_urllib_parse.unquote_plus(self._html_search_regex(r'"video_title":"([^"]+)', webpage, 'password'))
@@ -77,6 +92,10 @@ class PornHubIE(InfoExtractor):
             'uploader': video_uploader,
             'title': video_title,
             'thumbnail': thumbnail,
+            'view_count': view_count,
+            'like_count': like_count,
+            'dislike_count': dislike_count,
+            'comment_count': comment_count,
             'formats': formats,
             'age_limit': 18,
         }
```