diff options
Diffstat (limited to 'youtube_dl/extractor/youtube.py')
| -rw-r--r-- | youtube_dl/extractor/youtube.py | 192 | 
1 files changed, 113 insertions, 79 deletions
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 1cba40387..7b6179a2a 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -14,23 +14,24 @@ from .common import InfoExtractor, SearchInfoExtractor  from .subtitles import SubtitlesInfoExtractor  from ..jsinterp import JSInterpreter  from ..swfinterp import SWFInterpreter -from ..utils import ( +from ..compat import (      compat_chr,      compat_parse_qs,      compat_urllib_parse,      compat_urllib_request,      compat_urlparse,      compat_str, - +) +from ..utils import (      clean_html, -    get_element_by_id, -    get_element_by_attribute,      ExtractorError, +    get_element_by_attribute, +    get_element_by_id,      int_or_none,      OnDemandPagedList, +    orderedSet,      unescapeHTML,      unified_strdate, -    orderedSet,      uppercase_escape,  ) @@ -44,9 +45,10 @@ class YoutubeBaseInfoExtractor(InfoExtractor):      _LOGIN_REQUIRED = False      def _set_language(self): -        self._set_cookie('.youtube.com', 'PREF', 'f1=50000000&hl=en', +        self._set_cookie( +            '.youtube.com', 'PREF', 'f1=50000000&hl=en',              # YouTube sets the expire time to about two months -            expire_time=time.time() + 60*24*3600) +            expire_time=time.time() + 2 * 30 * 24 * 3600)      def _login(self):          """ @@ -416,6 +418,38 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):                  'upload_date': '20140605',              },          }, +        # video_info is None (https://github.com/rg3/youtube-dl/issues/4421) +        { +            'url': '__2ABJjxzNo', +            'info_dict': { +                'id': '__2ABJjxzNo', +                'ext': 'mp4', +                'upload_date': '20100430', +                'uploader_id': 'deadmau5', +                'description': 'md5:12c56784b8032162bb936a5f76d55360', +                'uploader': 'deadmau5', +                'title': 'Deadmau5 - Some Chords (HD)', +            }, +            'expected_warnings': [ +                'DASH manifest missing', +            ] +        }, +        # Olympics (https://github.com/rg3/youtube-dl/issues/4431) +        { +            'url': 'lqQg6PlCWgI', +            'info_dict': { +                'id': 'lqQg6PlCWgI', +                'ext': 'mp4', +                'upload_date': '20120731', +                'uploader_id': 'olympic', +                'description': 'HO09  - Women -  GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games', +                'uploader': 'Olympics', +                'title': 'Hockey - Women -  GER-AUS - London 2012 Olympic Games', +            }, +            'params': { +                'skip_download': 'requires avconv', +            } +        },      ]      def __init__(self, *args, **kwargs): @@ -665,6 +699,46 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):          url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id          return self._download_webpage(url, video_id, note='Searching for annotations.', errnote='Unable to download video annotations.') +    def _parse_dash_manifest( +            self, video_id, dash_manifest_url, player_url, age_gate): +        def decrypt_sig(mobj): +            s = mobj.group(1) +            dec_s = self._decrypt_signature(s, video_id, player_url, age_gate) +            return '/signature/%s' % dec_s +        dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url) +        dash_doc = self._download_xml( +            dash_manifest_url, video_id, +            note='Downloading DASH manifest', +            errnote='Could not download DASH manifest') + +        formats = [] +        for r in dash_doc.findall('.//{urn:mpeg:DASH:schema:MPD:2011}Representation'): +            url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL') +            if url_el is None: +                continue +            format_id = r.attrib['id'] +            video_url = url_el.text +            filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength')) +            f = { +                'format_id': format_id, +                'url': video_url, +                'width': int_or_none(r.attrib.get('width')), +                'tbr': int_or_none(r.attrib.get('bandwidth'), 1000), +                'asr': int_or_none(r.attrib.get('audioSamplingRate')), +                'filesize': filesize, +                'fps': int_or_none(r.attrib.get('frameRate')), +            } +            try: +                existing_format = next( +                    fo for fo in formats +                    if fo['format_id'] == format_id) +            except StopIteration: +                f.update(self._formats.get(format_id, {})) +                formats.append(f) +            else: +                existing_format.update(f) +        return formats +      def _real_extract(self, url):          proto = (              'http' if self._downloader.params.get('prefer_insecure', False) @@ -722,9 +796,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):                  # We fallback to the get_video_info pages (used by the embed page)                  self.report_video_info_webpage_download(video_id)                  for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']: -                    video_info_url = (proto + '://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en' -                        % (video_id, el_type)) -                    video_info_webpage = self._download_webpage(video_info_url, +                    video_info_url = ( +                        '%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en' +                        % (proto, video_id, el_type)) +                    video_info_webpage = self._download_webpage( +                        video_info_url,                          video_id, note=False,                          errnote='unable to download video info webpage')                      video_info = compat_parse_qs(video_info_webpage) @@ -797,7 +873,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):          m_cat_container = self._search_regex(              r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>', -            video_webpage, 'categories', fatal=False) +            video_webpage, 'categories', default=None)          if m_cat_container:              category = self._html_search_regex(                  r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category', @@ -875,7 +951,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):                  'url': video_info['conn'][0],                  'player_url': player_url,              }] -        elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1: +        elif len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1:              encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]              if 'rtmpe%3Dyes' in encoded_url_map:                  raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True) @@ -940,51 +1016,17 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):          # Look for the DASH manifest          if self._downloader.params.get('youtube_include_dash_manifest', True): -            try: -                # The DASH manifest used needs to be the one from the original video_webpage. -                # The one found in get_video_info seems to be using different signatures. -                # However, in the case of an age restriction there won't be any embedded dashmpd in the video_webpage. -                # Luckily, it seems, this case uses some kind of default signature (len == 86), so the -                # combination of get_video_info and the _static_decrypt_signature() decryption fallback will work here. -                dash_manifest_url = video_info.get('dashmpd')[0] - -                def decrypt_sig(mobj): -                    s = mobj.group(1) -                    dec_s = self._decrypt_signature(s, video_id, player_url, age_gate) -                    return '/signature/%s' % dec_s -                dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url) -                dash_doc = self._download_xml( -                    dash_manifest_url, video_id, -                    note='Downloading DASH manifest', -                    errnote='Could not download DASH manifest') -                for r in dash_doc.findall('.//{urn:mpeg:DASH:schema:MPD:2011}Representation'): -                    url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL') -                    if url_el is None: -                        continue -                    format_id = r.attrib['id'] -                    video_url = url_el.text -                    filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength')) -                    f = { -                        'format_id': format_id, -                        'url': video_url, -                        'width': int_or_none(r.attrib.get('width')), -                        'tbr': int_or_none(r.attrib.get('bandwidth'), 1000), -                        'asr': int_or_none(r.attrib.get('audioSamplingRate')), -                        'filesize': filesize, -                        'fps': int_or_none(r.attrib.get('frameRate')), -                    } -                    try: -                        existing_format = next( -                            fo for fo in formats -                            if fo['format_id'] == format_id) -                    except StopIteration: -                        f.update(self._formats.get(format_id, {})) -                        formats.append(f) -                    else: -                        existing_format.update(f) - -            except (ExtractorError, KeyError) as e: -                self.report_warning('Skipping DASH manifest: %r' % e, video_id) +            dash_mpd = video_info.get('dashmpd') +            if dash_mpd: +                dash_manifest_url = dash_mpd[0] +                try: +                    dash_formats = self._parse_dash_manifest( +                        video_id, dash_manifest_url, player_url, age_gate) +                except (ExtractorError, KeyError) as e: +                    self.report_warning( +                        'Skipping DASH manifest: %r' % e, video_id) +                else: +                    formats.extend(dash_formats)          self._sort_formats(formats) @@ -1226,7 +1268,7 @@ class YoutubeTopListIE(YoutubePlaylistIE):  class YoutubeChannelIE(InfoExtractor):      IE_DESC = 'YouTube.com channels' -    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)" +    _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)'      _MORE_PAGES_INDICATOR = 'yt-uix-load-more'      _MORE_PAGES_URL = 'https://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'      IE_NAME = 'youtube:channel' @@ -1244,13 +1286,8 @@ class YoutubeChannelIE(InfoExtractor):          return ids_in_page      def _real_extract(self, url): -        # Extract channel id -        mobj = re.match(self._VALID_URL, url) -        if mobj is None: -            raise ExtractorError('Invalid URL: %s' % url) +        channel_id = self._match_id(url) -        # Download channel page -        channel_id = mobj.group(1)          video_ids = []          url = 'https://www.youtube.com/channel/%s/videos' % channel_id          channel_page = self._download_webpage(url, channel_id) @@ -1264,8 +1301,12 @@ class YoutubeChannelIE(InfoExtractor):              # The videos are contained in a single page              # the ajax pages can't be used, they are empty              video_ids = self.extract_videos_from_page(channel_page) -        else: -            # Download all channel pages using the json-based channel_ajax query +            entries = [ +                self.url_result(video_id, 'Youtube', video_id=video_id) +                for video_id in video_ids] +            return self.playlist_result(entries, channel_id) + +        def _entries():              for pagenum in itertools.count(1):                  url = self._MORE_PAGES_URL % (pagenum, channel_id)                  page = self._download_json( @@ -1273,21 +1314,19 @@ class YoutubeChannelIE(InfoExtractor):                      transform_source=uppercase_escape)                  ids_in_page = self.extract_videos_from_page(page['content_html']) -                video_ids.extend(ids_in_page) +                for video_id in ids_in_page: +                    yield self.url_result( +                        video_id, 'Youtube', video_id=video_id)                  if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:                      break -        self._downloader.to_screen('[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids))) - -        url_entries = [self.url_result(video_id, 'Youtube', video_id=video_id) -                       for video_id in video_ids] -        return self.playlist_result(url_entries, channel_id) +        return self.playlist_result(_entries(), channel_id)  class YoutubeUserIE(InfoExtractor):      IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)' -    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)' +    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)'      _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'      _GDATA_PAGE_SIZE = 50      _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json' @@ -1315,12 +1354,7 @@ class YoutubeUserIE(InfoExtractor):              return super(YoutubeUserIE, cls).suitable(url)      def _real_extract(self, url): -        # Extract username -        mobj = re.match(self._VALID_URL, url) -        if mobj is None: -            raise ExtractorError('Invalid URL: %s' % url) - -        username = mobj.group(1) +        username = self._match_id(url)          # Download video ids using YouTube Data API. Result size per          # query is limited (currently to 50 videos) so we need to query  | 
