diff options
| author | Yen Chi Hsuan <yan12125@gmail.com> | 2016-04-16 17:13:22 +0800 | 
|---|---|---|
| committer | Yen Chi Hsuan <yan12125@gmail.com> | 2016-04-16 17:13:22 +0800 | 
| commit | ee94e7e66d8c715f0df29c22642e51cb56c612b9 (patch) | |
| tree | 5f64e203d2c47655cc483f528abaac2ef766cd9c | |
| parent | 759e37c9e661d962c2318d173f9c99264c1e7e2b (diff) | |
[varzesh3] Fix metadata extraction (closes #9197)
| -rw-r--r-- | youtube_dl/extractor/varzesh3.py | 47 | 
1 file changed, 40 insertions, 7 deletions
diff --git a/youtube_dl/extractor/varzesh3.py b/youtube_dl/extractor/varzesh3.py index 9369abaf8..438bb580c 100644 --- a/youtube_dl/extractor/varzesh3.py +++ b/youtube_dl/extractor/varzesh3.py @@ -2,11 +2,19 @@  from __future__ import unicode_literals  from .common import InfoExtractor +from ..compat import ( +    compat_urllib_parse_urlparse, +    compat_parse_qs, +) +from ..utils import ( +    clean_html, +    remove_start, +)  class Varzesh3IE(InfoExtractor):      _VALID_URL = r'https?://(?:www\.)?video\.varzesh3\.com/(?:[^/]+/)+(?P<id>[^/]+)/?' -    _TEST = { +    _TESTS = [{          'url': 'http://video.varzesh3.com/germany/bundesliga/5-%D9%88%D8%A7%DA%A9%D9%86%D8%B4-%D8%A8%D8%B1%D8%AA%D8%B1-%D8%AF%D8%B1%D9%88%D8%A7%D8%B2%D9%87%E2%80%8C%D8%A8%D8%A7%D9%86%D8%A7%D9%86%D8%9B%D9%87%D9%81%D8%AA%D9%87-26-%D8%A8%D9%88%D9%86%D8%AF%D8%B3/',          'md5': '2a933874cb7dce4366075281eb49e855',          'info_dict': { @@ -15,8 +23,18 @@ class Varzesh3IE(InfoExtractor):              'title': '۵ واکنش برتر دروازهبانان؛هفته ۲۶ بوندسلیگا',              'description': 'فصل ۲۰۱۵-۲۰۱۴',              'thumbnail': 're:^https?://.*\.jpg$', -        } -    } +        }, +        'skip': 'HTTP 404 Error', +    }, { +        'url': 'http://video.varzesh3.com/video/112785/%D8%AF%D9%84%D9%87-%D8%B9%D9%84%DB%8C%D8%9B-%D8%B3%D8%AA%D8%A7%D8%B1%D9%87-%D9%86%D9%88%D8%B8%D9%87%D9%88%D8%B1-%D9%84%DB%8C%DA%AF-%D8%A8%D8%B1%D8%AA%D8%B1-%D8%AC%D8%B2%DB%8C%D8%B1%D9%87', +        'info_dict': { +            'id': '112785', +            'ext': 'mp4', +            'title': 'دله علی؛ ستاره نوظهور لیگ برتر جزیره', +            'description': 'فوتبال 120', +        }, +        'expected_warnings': ['description'], +    }]      def _real_extract(self, url):          display_id = self._match_id(url) @@ -26,15 +44,30 @@ class Varzesh3IE(InfoExtractor):          video_url = self._search_regex(              r'<source[^>]+src="([^"]+)"', webpage, 'video url') -        title = self._og_search_title(webpage) +  
      title = remove_start(self._html_search_regex( +            r'<title>([^<]+)</title>', webpage, 'title'), 'ویدیو ورزش 3 | ') +          description = self._html_search_regex(              r'(?s)<div class="matn">(.+?)</div>', -            webpage, 'description', fatal=False) -        thumbnail = self._og_search_thumbnail(webpage) +            webpage, 'description', default=None) +        if description is None: +            description = clean_html(self._html_search_meta('description', webpage)) + +        thumbnail = self._og_search_thumbnail(webpage, default=None) +        if thumbnail is None: +            fb_sharer_url = self._search_regex( +                r'<a[^>]+href="(https?://www\.facebook\.com/sharer/sharer\.php?[^"]+)"', +                webpage, 'facebook sharer URL', fatal=False) +            sharer_params = compat_parse_qs(compat_urllib_parse_urlparse(fb_sharer_url).query) +            thumbnail = sharer_params.get('p[images][0]', [None])[0]          video_id = self._search_regex(              r"<link[^>]+rel='(?:canonical|shortlink)'[^>]+href='/\?p=([^']+)'", -            webpage, display_id, default=display_id) +            webpage, display_id, default=None) +        if video_id is None: +            video_id = self._search_regex( +                'var\s+VideoId\s*=\s*(\d+);', webpage, 'video id', +                default=display_id)          return {              'url': video_url,  | 
