diff options
| author | Sergey M․ <dstftw@gmail.com> | 2017-07-30 15:48:22 +0700 | 
|---|---|---|
| committer | Sergey M․ <dstftw@gmail.com> | 2017-07-30 15:48:22 +0700 | 
| commit | a0a477b885dc1dd688058924357c4935f3c935cf (patch) | |
| tree | d2d5896c06bbdbf37ca12d814ee5b77bf74b8e58 | |
| parent | 198d4cb40ce9d819e8e4079058642ee96dae213b (diff) | |
[youjizz] Fix extraction (closes #13744)
| -rw-r--r-- | youtube_dl/extractor/youjizz.py | 78 | 
1 files changed, 67 insertions, 11 deletions
diff --git a/youtube_dl/extractor/youjizz.py b/youtube_dl/extractor/youjizz.py
index b50f34e9b..f33fabe19 100644
--- a/youtube_dl/extractor/youjizz.py
+++ b/youtube_dl/extractor/youjizz.py
@@ -1,39 +1,95 @@
 from __future__ import unicode_literals
 
+import re
+
 from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    determine_ext,
+    int_or_none,
+    parse_duration,
+)
 
 
 class YouJizzIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:\w+\.)?youjizz\.com/videos/(?:[^/#?]+)?-(?P<id>[0-9]+)\.html(?:$|[?#])'
+    _VALID_URL = r'https?://(?:\w+\.)?youjizz\.com/videos/(?:[^/#?]*-(?P<id>\d+)\.html|embed/(?P<embed_id>\d+))'
     _TESTS = [{
         'url': 'http://www.youjizz.com/videos/zeichentrick-1-2189178.html',
-        'md5': '78fc1901148284c69af12640e01c6310',
+        'md5': 'b1e1dfaa8bb9537d8b84eeda9cf4acf4',
         'info_dict': {
             'id': '2189178',
             'ext': 'mp4',
             'title': 'Zeichentrick 1',
             'age_limit': 18,
+            'duration': 2874,
         }
     }, {
         'url': 'http://www.youjizz.com/videos/-2189178.html',
         'only_matching': True,
+    }, {
+        'url': 'https://www.youjizz.com/videos/embed/31991001',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
-        video_id = self._match_id(url)
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id') or mobj.group('embed_id')
+
         webpage = self._download_webpage(url, video_id)
-        # YouJizz's HTML5 player has invalid HTML
-        webpage = webpage.replace('"controls', '" controls')
-        age_limit = self._rta_search(webpage)
-        video_title = self._html_search_regex(
-            r'<title>\s*(.*)\s*</title>', webpage, 'title')
 
-        info_dict = self._parse_html5_media_entries(url, webpage, video_id)[0]
+        title = self._html_search_regex(
+            r'<title>(.+?)</title>', webpage, 'title')
+
+        formats = []
+
+        encodings = self._parse_json(
+            self._search_regex(
+                r'encodings\s*=\s*(\[.+?\]);\n', webpage, 'encodings',
+                default='[]'),
+            video_id, fatal=False)
+        for encoding in encodings:
+            if not isinstance(encoding, dict):
+                continue
+            format_url = encoding.get('filename')
+            if not isinstance(format_url, compat_str):
+                continue
+            if determine_ext(format_url) == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    format_url, video_id, 'mp4', entry_protocol='m3u8_native',
+                    m3u8_id='hls', fatal=False))
+            else:
+                format_id = encoding.get('name') or encoding.get('quality')
+                height = int_or_none(self._search_regex(
+                    r'^(\d+)[pP]', format_id, 'height', default=None))
+                formats.append({
+                    'url': format_url,
+                    'format_id': format_id,
+                    'height': height,
+                })
+
+        if formats:
+            info_dict = {
+                'formats': formats,
+            }
+        else:
+            # YouJizz's HTML5 player has invalid HTML
+            webpage = webpage.replace('"controls', '" controls')
+            info_dict = self._parse_html5_media_entries(
+                url, webpage, video_id)[0]
+
+        duration = parse_duration(self._search_regex(
+            r'<strong>Runtime:</strong>([^<]+)', webpage, 'duration',
+            default=None))
+        uploader = self._search_regex(
+            r'<strong>Uploaded By:.*?<a[^>]*>([^<]+)', webpage, 'uploader',
+            default=None)
 
         info_dict.update({
             'id': video_id,
-            'title': video_title,
-            'age_limit': age_limit,
+            'title': title,
+            'age_limit': self._rta_search(webpage),
+            'duration': duration,
+            'uploader': uploader,
         })
 
         return info_dict
