diff options
author | Sergey M․ <dstftw@gmail.com> | 2014-08-31 06:43:36 +0700 |
---|---|---|
committer | Sergey M․ <dstftw@gmail.com> | 2014-08-31 06:43:36 +0700 |
commit | 7b53af7f70da81eae41da645cc5af2c777c5c8e5 (patch) | |
tree | ce3b993bb50b34b0211975bc9812bcf08b4b0dce /youtube_dl/extractor | |
parent | ca7b3246b69215c890193acaf4eab746bc19504e (diff) |
[vporn] Fix issues, extract all formats and metadata
Diffstat (limited to 'youtube_dl/extractor')
-rw-r--r-- | youtube_dl/extractor/vporn.py | 91 |
1 files changed, 69 insertions, 22 deletions
```diff
diff --git a/youtube_dl/extractor/vporn.py b/youtube_dl/extractor/vporn.py
index 645e935ec..426369c51 100644
--- a/youtube_dl/extractor/vporn.py
+++ b/youtube_dl/extractor/vporn.py
@@ -1,52 +1,99 @@
-# coding: utf-8
 from __future__ import unicode_literals
 
 import re
 
 from .common import InfoExtractor
-from ..utils import int_or_none
+from ..utils import (
+    parse_duration,
+    str_to_int,
+)
+
 
 class VpornIE(InfoExtractor):
-    _VALID_URL = r'http?://(?:www\.)?vporn\.com/[a-z]+/(?P<title_dash>[a-z-]+)/(?P<id>\d+)/?'
+    _VALID_URL = r'https?://(?:www\.)?vporn\.com/[^/]+/(?P<display_id>[^/]+)/(?P<id>\d+)'
     _TEST = {
         'url': 'http://www.vporn.com/masturbation/violet-on-her-th-birthday/497944/',
         'md5': 'facf37c1b86546fa0208058546842c55',
         'info_dict': {
             'id': '497944',
+            'display_id': 'violet-on-her-th-birthday',
             'ext': 'mp4',
-            'title': 'Violet On Her 19th Birthday',
+            'title': 'Violet on her 19th birthday',
             'description': 'Violet dances in front of the camera which is sure to get you horny.',
-            'duration': 393,
             'thumbnail': 're:^https?://.*\.jpg$',
+            'uploader': 'kileyGrope',
+            'categories': ['Masturbation', 'Teen'],
+            'duration': 393,
+            'age_limit': 18,
         }
     }
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         video_id = mobj.group('id')
+        display_id = mobj.group('display_id')
+
+        webpage = self._download_webpage(url, display_id)
+
+        title = self._html_search_regex(
+            r'videoname\s*=\s*\'([^\']+)\'', webpage, 'title').strip()
+        description = self._html_search_regex(
+            r'<div class="description_txt">(.*?)</div>', webpage, 'description', fatal=False)
+        thumbnail = self._html_search_regex(
+            r'flashvars\.imageUrl\s*=\s*"([^"]+)"', webpage, 'description', fatal=False, default=None)
+        if thumbnail:
+            thumbnail = 'http://www.vporn.com' + thumbnail
+
+        uploader = self._html_search_regex(
+            r'(?s)UPLOADED BY.*?<a href="/user/[^"]+">([^<]+)</a>',
+            webpage, 'uploader', fatal=False)
 
-        webpage = self._download_webpage(url, video_id)
-        title = self._html_search_regex(r'<title>(.*?) - Vporn Video</title>', webpage, 'title')
-        video_url = self._html_search_regex(r'flashvars.videoUrlMedium = "(.*?)"', webpage, 'video_url')
-        description = self._html_search_regex(r'<div class="description_txt">(.*?)</div>', webpage, 'description')
-        thumbnail = 'http://www.vporn.com' + self._html_search_regex(r'flashvars.imageUrl = "(.*?)"', webpage, 'description')
+        categories = re.findall(r'<a href="/cat/[^"]+">([^<]+)</a>', webpage)
 
-        mobj = re.search(r'<span class="f_right">duration (?P<minutes>\d+) min (?P<seconds>\d+) sec </span>', webpage)
-        duration = int(mobj.group('minutes')) * 60 + int(mobj.group('seconds')) if mobj else None
+        duration = parse_duration(self._search_regex(
+            r'duration (\d+ min \d+ sec)', webpage, 'duration', fatal=False))
 
-        mobj = re.search(r'<span>((?P<thousands>\d+),)?(?P<units>\d+) VIEWS</span>', webpage)
-        try:
-            view_count = int(mobj.group('units'))
-            view_count += int(mobj.group('thousands')) * 1000
-        except:
-            pass
+        view_count = str_to_int(self._html_search_regex(
+            r'<span>([\d,\.]+) VIEWS</span>', webpage, 'view count', fatal=False))
+        like_count = str_to_int(self._html_search_regex(
+            r'<span id="like" class="n">([\d,\.]+)</span>', webpage, 'like count', fatal=False))
+        dislike_count = str_to_int(self._html_search_regex(
+            r'<span id="dislike" class="n">([\d,\.]+)</span>', webpage, 'dislike count', fatal=False))
+        comment_count = str_to_int(self._html_search_regex(
+            r'<h4>Comments \(<b>([\d,\.]+)</b>\)</h4>', webpage, 'comment count', fatal=False))
+
+        formats = []
+
+        for video in re.findall(r'flashvars\.videoUrl([^=]+?)\s*=\s*"([^"]+)"', webpage):
+            video_url = video[1]
+            fmt = {
+                'url': video_url,
+                'format_id': video[0],
+            }
+            m = re.search(r'_(?P<width>\d+)x(?P<height>\d+)_(?P<vbr>\d+)k\.mp4$', video_url)
+            if m:
+                fmt.update({
+                    'width': int(m.group('width')),
+                    'height': int(m.group('height')),
+                    'vbr': int(m.group('vbr')),
+                })
+            formats.append(fmt)
+
+        self._sort_formats(formats)
 
         return {
             'id': video_id,
-            'url': video_url,
-            'thumbnail': thumbnail,
+            'display_id': display_id,
             'title': title,
             'description': description,
-            'duration': int_or_none(duration),
-            'view_count': int_or_none(view_count),
+            'thumbnail': thumbnail,
+            'uploader': uploader,
+            'categories': categories,
+            'duration': duration,
+            'view_count': view_count,
+            'like_count': like_count,
+            'dislike_count': dislike_count,
+            'comment_count': comment_count,
+            'age_limit': 18,
+            'formats': formats,
         }
```