diff options
| -rw-r--r-- | youtube_dl/extractor/extractors.py | 2 | ||||
| -rw-r--r-- | youtube_dl/extractor/pear.py | 34 | ||||
| -rw-r--r-- | youtube_dl/extractor/pearvideo.py | 63 | 
3 files changed, 64 insertions, 35 deletions
diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 75c1a3d0e..28f0d3f0d 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -762,7 +762,7 @@ from .pandoratv import PandoraTVIE  from .parliamentliveuk import ParliamentLiveUKIE  from .patreon import PatreonIE  from .pbs import PBSIE -from .pear import PearIE +from .pearvideo import PearVideoIE  from .people import PeopleIE  from .periscope import (      PeriscopeIE, diff --git a/youtube_dl/extractor/pear.py b/youtube_dl/extractor/pear.py deleted file mode 100644 index 77fd46852..000000000 --- a/youtube_dl/extractor/pear.py +++ /dev/null @@ -1,34 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor - - -class PearIE(InfoExtractor): -    _VALID_URL = r'https?://(?:www\.)?pearvideo\.com/video_(?P<id>[0-9]+)' -    _TEST = { -        'url': 'http://www.pearvideo.com/video_1076290', -        'info_dict': { -            'id': '1076290', -            'ext': 'mp4', -            'title': '小浣熊在主人家玻璃上滚石头:没砸', -            'description': '小浣熊找到一个小石头,仿佛发现了一个宝贝。它不停地用石头按在玻璃上,滚来滚去,吸引主人注意。', -            'url': 'http://video.pearvideo.com/mp4/short/20170508/cont-1076290-10438018-hd.mp4' -        } -    } - -    def _real_extract(self, url): -        video_id = self._match_id(url) -        webpage = self._download_webpage(url, video_id) - -        title = self._html_search_regex(r'<h1[^>]+class="video-tt">(.+)</h1>', webpage, 'title', fatal=False) -        description = self._html_search_regex(r'<div[^>]+class="summary"[^>]*>([^<]+)<', webpage, 'description', fatal=False) -        url = self._html_search_regex(r'hdUrl="(.*?)"', webpage, 'url', fatal=False) - -        return { -            'id': video_id, -            'ext': 'mp4', -            'title': title, -            'description': description, -            'url': url -        } diff --git a/youtube_dl/extractor/pearvideo.py 
# coding: utf-8
# File: youtube_dl/extractor/pearvideo.py (new file introduced by this change).
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import (
    qualities,
    unified_timestamp,
)


class PearVideoIE(InfoExtractor):
    """Information extractor for pearvideo.com ``video_<digits>`` pages."""

    # The numeric part after "video_" is captured as the video id.
    _VALID_URL = r'https?://(?:www\.)?pearvideo\.com/video_(?P<id>\d+)'
    _TEST = {
        'url': 'http://www.pearvideo.com/video_1076290',
        'info_dict': {
            'id': '1076290',
            'ext': 'mp4',
            'title': '小浣熊在主人家玻璃上滚石头:没砸',
            'description': 'md5:01d576b747de71be0ee85eb7cac25f9d',
            'timestamp': 1494275280,
            'upload_date': '20170508',
        }
    }

    def _real_extract(self, url):
        """Build the info dict for a single pearvideo.com video page.

        The page is downloaded once; formats, title, description and
        timestamp are all scraped from that same HTML.

        Returns a dict with the keys ``id``, ``title``, ``description``,
        ``timestamp`` and ``formats``.
        """
        video_id = self._match_id(url)
        page = self._download_webpage(url, video_id)

        # Format ids are ranked by their position in this tuple
        # (presumably worst to best -- confirm against utils.qualities).
        rank = qualities(
            ('ldflv', 'ld', 'sdflv', 'sd', 'hdflv', 'hd', 'src'))

        # Every `<name>Url = "<url>"` assignment found in the page yields one
        # format; <name> doubles as the format id.  The URL may be absolute
        # (http/https) or protocol-relative (starting with "//").
        formats = []
        for match in re.finditer(
                r'(?P<id>[a-zA-Z]+)Url\s*=\s*(["\'])(?P<url>(?:https?:)?//.+?)\2',
                page):
            format_id = match.group('id')
            formats.append({
                'url': match.group('url'),
                'format_id': format_id,
                'quality': rank(format_id),
            })
        self._sort_formats(formats)

        # Title: text of the "video-tt" <h1>, else any data-title attribute.
        # No default/fatal override is passed here, unlike the lookups below.
        title_patterns = (
            r'<h1[^>]+\bclass=(["\'])video-tt\1[^>]*>(?P<value>[^<]+)',
            r'<[^>]+\bdata-title=(["\'])(?P<value>(?:(?!\1).)+)\1',
        )
        title = self._search_regex(
            title_patterns, page, 'title', group='value')

        # Description: "summary" <div> text, else a data-summary attribute;
        # when neither gives a non-empty value, fall back to the
        # "Description" meta tag.
        summary_patterns = (
            r'<div[^>]+\bclass=(["\'])summary\1[^>]*>(?P<value>[^<]+)',
            r'<[^>]+\bdata-summary=(["\'])(?P<value>(?:(?!\1).)+)\1',
        )
        description = self._search_regex(
            summary_patterns, page, 'description', default=None,
            group='value')
        if not description:
            description = self._html_search_meta('Description', page)

        # The upload date is read from the "date" <div>; the lookup is
        # non-fatal, so its result is handed to unified_timestamp as-is.
        date_text = self._search_regex(
            r'<div[^>]+\bclass=["\']date["\'][^>]*>([^<]+)',
            page, 'timestamp', fatal=False)
        timestamp = unified_timestamp(date_text)

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'timestamp': timestamp,
            'formats': formats,
        }
