diff options
Diffstat (limited to 'youtube_dl/extractor/xtube.py')
| -rw-r--r-- | youtube_dl/extractor/xtube.py | 113 | 
1 files changed, 94 insertions, 19 deletions
diff --git a/youtube_dl/extractor/xtube.py b/youtube_dl/extractor/xtube.py index 5584674a0..7246409e3 100644 --- a/youtube_dl/extractor/xtube.py +++ b/youtube_dl/extractor/xtube.py @@ -6,10 +6,12 @@ import re  from .common import InfoExtractor  from ..utils import (      int_or_none, +    js_to_json,      orderedSet,      parse_duration,      sanitized_Request,      str_to_int, +    url_or_none,  ) @@ -17,7 +19,7 @@ class XTubeIE(InfoExtractor):      _VALID_URL = r'''(?x)                          (?:                              xtube:| -                            https?://(?:www\.)?xtube\.com/(?:watch\.php\?.*\bv=|video-watch/(?P<display_id>[^/]+)-) +                            https?://(?:www\.)?xtube\.com/(?:watch\.php\?.*\bv=|video-watch/(?:embedded/)?(?P<display_id>[^/]+)-)                          )                          (?P<id>[^/?&#]+)                      ''' @@ -38,6 +40,22 @@ class XTubeIE(InfoExtractor):              'age_limit': 18,          }      }, { +        # FLV videos with duplicated formats +        'url': 'http://www.xtube.com/video-watch/A-Super-Run-Part-1-YT-9299752', +        'md5': 'a406963eb349dd43692ec54631efd88b', +        'info_dict': { +            'id': '9299752', +            'display_id': 'A-Super-Run-Part-1-YT', +            'ext': 'flv', +            'title': 'A Super Run - Part 1 (YT)', +            'description': 'md5:4cc3af1aa1b0413289babc88f0d4f616', +            'uploader': 'tshirtguy59', +            'duration': 579, +            'view_count': int, +            'comment_count': int, +            'age_limit': 18, +        }, +    }, {          # new URL schema          'url': 'http://www.xtube.com/video-watch/strange-erotica-625837',          'only_matching': True, @@ -47,6 +65,9 @@ class XTubeIE(InfoExtractor):      }, {          'url': 'xtube:kVTUy_G222_',          'only_matching': True, +    }, { +        'url': 'https://www.xtube.com/video-watch/embedded/milf-tara-and-teen-shared-and-cum-covered-extreme-bukkake-32203482?embedsize=big', +        'only_matching': True,      }]      def _real_extract(self, url): @@ -67,33 +88,86 @@ class XTubeIE(InfoExtractor):                  'Cookie': 'age_verified=1; cookiesAccepted=1',              }) -        sources = self._parse_json(self._search_regex( -            r'(["\'])sources\1\s*:\s*(?P<sources>{.+?}),', -            webpage, 'sources', group='sources'), video_id) +        title, thumbnail, duration, sources, media_definition = [None] * 5 + +        config = self._parse_json(self._search_regex( +            r'playerConf\s*=\s*({.+?})\s*,\s*(?:\n|loaderConf|playerWrapper)', webpage, 'config', +            default='{}'), video_id, transform_source=js_to_json, fatal=False) +        if config: +            config = config.get('mainRoll') +            if isinstance(config, dict): +                title = config.get('title') +                thumbnail = config.get('poster') +                duration = int_or_none(config.get('duration')) +                sources = config.get('sources') or config.get('format') +                media_definition = config.get('mediaDefinition') + +        if not isinstance(sources, dict) and not media_definition: +            sources = self._parse_json(self._search_regex( +                r'(["\'])?sources\1?\s*:\s*(?P<sources>{.+?}),', +                webpage, 'sources', group='sources'), video_id, +                transform_source=js_to_json)          formats = [] -        for format_id, format_url in sources.items(): -            formats.append({ -                'url': format_url, -                'format_id': format_id, -                'height': int_or_none(format_id), -            }) +        format_urls = set() + +        if isinstance(sources, dict): +            for format_id, format_url in sources.items(): +                format_url = url_or_none(format_url) +                if not format_url: +                    continue +                if format_url in format_urls: +                    continue +                format_urls.add(format_url) +                formats.append({ +                    'url': format_url, +                    'format_id': format_id, +                    'height': int_or_none(format_id), +                }) + +        if isinstance(media_definition, list): +            for media in media_definition: +                video_url = url_or_none(media.get('videoUrl')) +                if not video_url: +                    continue +                if video_url in format_urls: +                    continue +                format_urls.add(video_url) +                format_id = media.get('format') +                if format_id == 'hls': +                    formats.extend(self._extract_m3u8_formats( +                        video_url, video_id, 'mp4', entry_protocol='m3u8_native', +                        m3u8_id='hls', fatal=False)) +                elif format_id == 'mp4': +                    height = int_or_none(media.get('quality')) +                    formats.append({ +                        'url': video_url, +                        'format_id': '%s-%d' % (format_id, height) if height else format_id, +                        'height': height, +                    }) + +        self._remove_duplicate_formats(formats)          self._sort_formats(formats) -        title = self._search_regex( -            (r'<h1>\s*(?P<title>[^<]+?)\s*</h1>', r'videoTitle\s*:\s*(["\'])(?P<title>.+?)\1'), -            webpage, 'title', group='title') -        description = self._search_regex( +        if not title: +            title = self._search_regex( +                (r'<h1>\s*(?P<title>[^<]+?)\s*</h1>', r'videoTitle\s*:\s*(["\'])(?P<title>.+?)\1'), +                webpage, 'title', group='title') +        description = self._og_search_description( +            webpage, default=None) or self._html_search_meta( +            'twitter:description', webpage, default=None) or self._search_regex(              r'</h1>\s*<p>([^<]+)', webpage, 'description', fatal=False)          uploader = self._search_regex(              (r'<input[^>]+name="contentOwnerId"[^>]+value="([^"]+)"',               r'<span[^>]+class="nickname"[^>]*>([^<]+)'),              webpage, 'uploader', fatal=False) -        duration = parse_duration(self._search_regex( -            r'<dt>Runtime:?</dt>\s*<dd>([^<]+)</dd>', -            webpage, 'duration', fatal=False)) +        if not duration: +            duration = parse_duration(self._search_regex( +                r'<dt>Runtime:?</dt>\s*<dd>([^<]+)</dd>', +                webpage, 'duration', fatal=False))          view_count = str_to_int(self._search_regex( -            r'<dt>Views:?</dt>\s*<dd>([\d,\.]+)</dd>', +            (r'["\']viewsCount["\'][^>]*>(\d+)\s+views', +             r'<dt>Views:?</dt>\s*<dd>([\d,\.]+)</dd>'),              webpage, 'view count', fatal=False))          comment_count = str_to_int(self._html_search_regex(              r'>Comments? \(([\d,\.]+)\)<', @@ -104,6 +178,7 @@ class XTubeIE(InfoExtractor):              'display_id': display_id,              'title': title,              'description': description, +            'thumbnail': thumbnail,              'uploader': uploader,              'duration': duration,              'view_count': view_count, @@ -122,7 +197,7 @@ class XTubeUserIE(InfoExtractor):              'id': 'greenshowers-4056496',              'age_limit': 18,          }, -        'playlist_mincount': 155, +        'playlist_mincount': 154,      }      def _real_extract(self, url):  | 
