diff options
Diffstat (limited to 'youtube_dl/extractor/youtube.py')
| -rw-r--r-- | youtube_dl/extractor/youtube.py | 139 | 
1 files changed, 66 insertions, 73 deletions
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index a1a4d896d..874429b78 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -7,7 +7,6 @@ import itertools  import json  import os.path  import re -import socket  import string  import struct  import traceback @@ -17,9 +16,7 @@ from .common import InfoExtractor, SearchInfoExtractor  from .subtitles import SubtitlesInfoExtractor  from ..utils import (      compat_chr, -    compat_http_client,      compat_parse_qs, -    compat_urllib_error,      compat_urllib_parse,      compat_urllib_request,      compat_urlparse, @@ -45,19 +42,11 @@ class YoutubeBaseInfoExtractor(InfoExtractor):      # If True it will raise an error if no login info is provided      _LOGIN_REQUIRED = False -    def report_lang(self): -        """Report attempt to set language.""" -        self.to_screen(u'Setting language') -      def _set_language(self): -        request = compat_urllib_request.Request(self._LANG_URL) -        try: -            self.report_lang() -            compat_urllib_request.urlopen(request).read() -        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: -            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err)) -            return False -        return True +        return bool(self._download_webpage( +            self._LANG_URL, None, +            note=u'Setting language', errnote='unable to set language', +            fatal=False))      def _login(self):          (username, password) = self._get_login_info() @@ -67,12 +56,12 @@ class YoutubeBaseInfoExtractor(InfoExtractor):                  raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)              return False -        request = compat_urllib_request.Request(self._LOGIN_URL) -        try: -            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8') -        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: -            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err)) -            return False +        login_page = self._download_webpage( +            self._LOGIN_URL, None, +            note=u'Downloading login page', +            errnote=u'unable to fetch login page', fatal=False) +        if login_page is False: +            return          galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',                                    login_page, u'Login GALX parameter') @@ -102,29 +91,28 @@ class YoutubeBaseInfoExtractor(InfoExtractor):          # chokes on unicode          login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())          login_data = compat_urllib_parse.urlencode(login_form).encode('ascii') -        request = compat_urllib_request.Request(self._LOGIN_URL, login_data) -        try: -            self.report_login() -            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8') -            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None: -                self._downloader.report_warning(u'unable to log in: bad username or password') -                return False -        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: -            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err)) + +        req = compat_urllib_request.Request(self._LOGIN_URL, login_data) +        login_results = self._download_webpage( +            req, None, +            note=u'Logging in', errnote=u'unable to log in', fatal=False) +        if login_results is False: +            return False +        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None: +            self._downloader.report_warning(u'unable to log in: bad username or password')              return False          return True      def _confirm_age(self):          age_form = { -                'next_url':     '/', -                'action_confirm':   'Confirm', -                } -        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form)) -        try: -            self.report_age_confirmation() -            compat_urllib_request.urlopen(request).read().decode('utf-8') -        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: -            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err)) +            'next_url': '/', +            'action_confirm': 'Confirm', +        } +        req = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form)) + +        self._download_webpage( +            req, None, +            note=u'Confirming age', errnote=u'Unable to confirm age')          return True      def _real_initialize(self): @@ -336,7 +324,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):                  u"uploader": u"Philipp Hagemeister",                  u"uploader_id": u"phihag",                  u"upload_date": u"20121002", -                u"description": u"test chars:  \"'/\\ä↭𝕐\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ." +                u"description": u"test chars:  \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."              }          },          { @@ -388,10 +376,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):          super(YoutubeIE, self).__init__(*args, **kwargs)          self._player_cache = {} -    def report_video_webpage_download(self, video_id): -        """Report attempt to download video webpage.""" -        self.to_screen(u'%s: Downloading video webpage' % video_id) -      def report_video_info_webpage_download(self, video_id):          """Report attempt to download video info webpage."""          self.to_screen(u'%s: Downloading video info webpage' % video_id) @@ -1258,15 +1242,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):          video_id = self._extract_id(url)          # Get video webpage -        self.report_video_webpage_download(video_id)          url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id -        request = compat_urllib_request.Request(url) -        try: -            video_webpage_bytes = compat_urllib_request.urlopen(request).read() -        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: -            raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err)) - -        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore') +        video_webpage = self._download_webpage(url, video_id)          # Attempt to extract SWF player URL          mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage) @@ -1366,6 +1343,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):          # description          video_description = get_element_by_id("eow-description", video_webpage)          if video_description: +            video_description = re.sub(r'''(?x) +                <a\s+ +                    (?:[a-zA-Z-]+="[^"]+"\s+)*? +                    title="([^"]+)"\s+ +                    (?:[a-zA-Z-]+="[^"]+"\s+)*? +                    class="yt-uix-redirect-link"\s*> +                [^<]+ +                </a> +            ''', r'\1', video_description)              video_description = clean_html(video_description)          else:              fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage) @@ -1374,6 +1360,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):              else:                  video_description = u'' +        def _extract_count(klass): +            count = self._search_regex(r'class="%s">([\d,]+)</span>' % re.escape(klass), video_webpage, klass, fatal=False) +            if count is not None: +                return int(count.replace(',', '')) +            return None +        like_count = _extract_count(u'likes-count') +        dislike_count = _extract_count(u'dislikes-count') +          # subtitles          video_subtitles = self.extract_subtitles(video_id, video_webpage) @@ -1506,6 +1500,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):                  'annotations':  video_annotations,                  'webpage_url': 'https://www.youtube.com/watch?v=%s' % video_id,                  'view_count': view_count, +                'like_count': like_count, +                'dislike_count': dislike_count,              })          return results @@ -1520,10 +1516,10 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):                             \? (?:.*?&)*? (?:p|a|list)=                          |  p/                          ) -                        ((?:PL|EC|UU|FL)?[0-9A-Za-z-_]{10,}) +                        ((?:PL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,})                          .*                       | -                        ((?:PL|EC|UU|FL)[0-9A-Za-z-_]{10,}) +                        ((?:PL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})                       )"""      _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s&page=%s'      _MORE_PAGES_INDICATOR = r'data-link-type="next"' @@ -1545,7 +1541,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):      def _extract_mix(self, playlist_id):          # The mixes are generated from a a single video          # the id of the playlist is just 'RD' + video_id -        url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[2:], playlist_id) +        url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)          webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix')          title_span = (get_element_by_attribute('class', 'title long-title', webpage) or              get_element_by_attribute('class', 'title ', webpage)) @@ -1573,7 +1569,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):              else:                  self.to_screen(u'Downloading playlist PL%s - add --no-playlist to just download video %s' % (playlist_id, video_id)) -        if len(playlist_id) == 13:  # 'RD' + 11 characters for the video id +        if playlist_id.startswith('RD'):              # Mixes require a custom extraction process              return self._extract_mix(playlist_id)          if playlist_id.startswith('TL'): @@ -1658,10 +1654,11 @@ class YoutubeChannelIE(InfoExtractor):          video_ids = []          url = 'https://www.youtube.com/channel/%s/videos' % channel_id          channel_page = self._download_webpage(url, channel_id) -        if re.search(r'channel-header-autogenerated-label', channel_page) is not None: -            autogenerated = True -        else: -            autogenerated = False +        autogenerated = re.search(r'''(?x) +                class="[^"]*?(?: +                    channel-header-autogenerated-label| +                    yt-channel-title-autogenerated +                )[^"]*"''', channel_page) is not None          if autogenerated:              # The videos are contained in a single page @@ -1763,10 +1760,6 @@ class YoutubeSearchIE(SearchInfoExtractor):      IE_NAME = u'youtube:search'      _SEARCH_KEY = 'ytsearch' -    def report_download_page(self, query, pagenum): -        """Report attempt to download search page with given number.""" -        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum)) -      def _get_n_results(self, query, n):          """Get a specified number of results for a query""" @@ -1775,16 +1768,15 @@ class YoutubeSearchIE(SearchInfoExtractor):          limit = n          while (50 * pagenum) < limit: -            self.report_download_page(query, pagenum+1)              result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1) -            request = compat_urllib_request.Request(result_url) -            try: -                data = compat_urllib_request.urlopen(request).read().decode('utf-8') -            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: -                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err)) -            api_response = json.loads(data)['data'] - -            if not 'items' in api_response: +            data_json = self._download_webpage( +                result_url, video_id=u'query "%s"' % query, +                note=u'Downloading page %s' % (pagenum + 1), +                errnote=u'Unable to download API page') +            data = json.loads(data_json) +            api_response = data['data'] + +            if 'items' not in api_response:                  raise ExtractorError(u'[youtube] No video results')              new_ids = list(video['id'] for video in api_response['items']) @@ -1800,6 +1792,7 @@ class YoutubeSearchIE(SearchInfoExtractor):          return self.playlist_result(videos, query)  class YoutubeSearchDateIE(YoutubeSearchIE): +    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'      _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'      _SEARCH_KEY = 'ytsearchdate'      IE_DESC = u'YouTube.com searches, newest videos first'  | 
