diff options
author | Sergey M․ <dstftw@gmail.com> | 2017-07-16 03:06:04 +0700 |
---|---|---|
committer | Sergey M․ <dstftw@gmail.com> | 2017-07-16 03:06:04 +0700 |
commit | decf86044d17a8ec04e43a4805a0092622d976ae (patch) | |
tree | 1c39fc2aa7f59168b3113d96e43a811bb9d8c21c /youtube_dl/extractor | |
parent | 94b817edebb63c3d8485e1ae27cc394dd9e21f9d (diff) |
[pearvideo] Improve (closes #13031)
Diffstat (limited to 'youtube_dl/extractor')
-rw-r--r-- | youtube_dl/extractor/extractors.py | 2 | ||||
-rw-r--r-- | youtube_dl/extractor/pear.py | 34 | ||||
-rw-r--r-- | youtube_dl/extractor/pearvideo.py | 63 |
3 files changed, 64 insertions, 35 deletions
diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index 75c1a3d0e..28f0d3f0d 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -762,7 +762,7 @@ from .pandoratv import PandoraTVIE
 from .parliamentliveuk import ParliamentLiveUKIE
 from .patreon import PatreonIE
 from .pbs import PBSIE
-from .pear import PearIE
+from .pearvideo import PearVideoIE
 from .people import PeopleIE
 from .periscope import (
     PeriscopeIE,
diff --git a/youtube_dl/extractor/pear.py b/youtube_dl/extractor/pear.py
deleted file mode 100644
index 77fd46852..000000000
--- a/youtube_dl/extractor/pear.py
+++ /dev/null
@@ -1,34 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-from .common import InfoExtractor
-
-
-class PearIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?pearvideo\.com/video_(?P<id>[0-9]+)'
-    _TEST = {
-        'url': 'http://www.pearvideo.com/video_1076290',
-        'info_dict': {
-            'id': '1076290',
-            'ext': 'mp4',
-            'title': '小浣熊在主人家玻璃上滚石头:没砸',
-            'description': '小浣熊找到一个小石头,仿佛发现了一个宝贝。它不停地用石头按在玻璃上,滚来滚去,吸引主人注意。',
-            'url': 'http://video.pearvideo.com/mp4/short/20170508/cont-1076290-10438018-hd.mp4'
-        }
-    }
-
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-        webpage = self._download_webpage(url, video_id)
-
-        title = self._html_search_regex(r'<h1[^>]+class="video-tt">(.+)</h1>', webpage, 'title', fatal=False)
-        description = self._html_search_regex(r'<div[^>]+class="summary"[^>]*>([^<]+)<', webpage, 'description', fatal=False)
-        url = self._html_search_regex(r'hdUrl="(.*?)"', webpage, 'url', fatal=False)
-
-        return {
-            'id': video_id,
-            'ext': 'mp4',
-            'title': title,
-            'description': description,
-            'url': url
-        }
diff --git a/youtube_dl/extractor/pearvideo.py b/youtube_dl/extractor/pearvideo.py
new file mode 100644
index 000000000..1d777221c
--- /dev/null
+++ b/youtube_dl/extractor/pearvideo.py
@@ -0,0 +1,63 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    qualities,
+    unified_timestamp,
+)
+
+
+class PearVideoIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?pearvideo\.com/video_(?P<id>\d+)'
+    _TEST = {
+        'url': 'http://www.pearvideo.com/video_1076290',
+        'info_dict': {
+            'id': '1076290',
+            'ext': 'mp4',
+            'title': '小浣熊在主人家玻璃上滚石头:没砸',
+            'description': 'md5:01d576b747de71be0ee85eb7cac25f9d',
+            'timestamp': 1494275280,
+            'upload_date': '20170508',
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        quality = qualities(
+            ('ldflv', 'ld', 'sdflv', 'sd', 'hdflv', 'hd', 'src'))
+
+        formats = [{
+            'url': mobj.group('url'),
+            'format_id': mobj.group('id'),
+            'quality': quality(mobj.group('id')),
+        } for mobj in re.finditer(
+            r'(?P<id>[a-zA-Z]+)Url\s*=\s*(["\'])(?P<url>(?:https?:)?//.+?)\2',
+            webpage)]
+        self._sort_formats(formats)
+
+        title = self._search_regex(
+            (r'<h1[^>]+\bclass=(["\'])video-tt\1[^>]*>(?P<value>[^<]+)',
+             r'<[^>]+\bdata-title=(["\'])(?P<value>(?:(?!\1).)+)\1'),
+            webpage, 'title', group='value')
+        description = self._search_regex(
+            (r'<div[^>]+\bclass=(["\'])summary\1[^>]*>(?P<value>[^<]+)',
+             r'<[^>]+\bdata-summary=(["\'])(?P<value>(?:(?!\1).)+)\1'),
+            webpage, 'description', default=None,
+            group='value') or self._html_search_meta('Description', webpage)
+        timestamp = unified_timestamp(self._search_regex(
+            r'<div[^>]+\bclass=["\']date["\'][^>]*>([^<]+)',
+            webpage, 'timestamp', fatal=False))
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'timestamp': timestamp,
+            'formats': formats,
+        }