diff options
Diffstat (limited to 'youtube_dl/extractor/common.py')
| -rw-r--r-- | youtube_dl/extractor/common.py | 166 | 
1 files changed, 125 insertions, 41 deletions
| diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index cef4dce85..ba46a7bc7 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -4,11 +4,11 @@ import re  import socket  import sys  import netrc +import xml.etree.ElementTree  from ..utils import (      compat_http_client,      compat_urllib_error, -    compat_urllib_request,      compat_str,      clean_html, @@ -18,6 +18,8 @@ from ..utils import (      sanitize_filename,      unescapeHTML,  ) +_NO_DEFAULT = object() +  class InfoExtractor(object):      """Information Extractor class. @@ -33,15 +35,39 @@ class InfoExtractor(object):      The dictionaries must include the following fields:      id:             Video identifier. -    url:            Final video URL.      title:          Video title, unescaped. -    ext:            Video filename extension. -    Instead of url and ext, formats can also specified. +    Additionally, it must contain either a formats entry or url and ext: + +    formats:        A list of dictionaries for each format available, it must +                    be ordered from worst to best quality. Potential fields: +                    * url        Mandatory. The URL of the video file +                    * ext        Will be calculated from url if missing +                    * format     A human-readable description of the format +                                 ("mp4 container with h264/opus"). +                                 Calculated from the format_id, width, height. +                                 and format_note fields if missing. +                    * format_id  A short description of the format +                                 ("mp4_h264_opus" or "19") +                    * format_note Additional info about the format +                                 ("3D" or "DASH video") +                    * width      Width of the video, if known +                    * height     Height of the video, if known +                    * abr        Average audio bitrate in KBit/s +                    * acodec     Name of the audio codec in use +                    * vbr        Average video bitrate in KBit/s +                    * vcodec     Name of the video codec in use +                    * filesize   The number of bytes, if known in advance +                    * player_url SWF Player URL (used for rtmpdump). +    url:            Final video URL. +    ext:            Video filename extension. +    format:         The video format, defaults to ext (used for --get-format) +    player_url:     SWF Player URL (used for rtmpdump). +    urlhandle:      [internal] The urlHandle to be used to download the file, +                    like returned by urllib.request.urlopen      The following fields are optional: -    format:         The video format, defaults to ext (used for --get-format)      thumbnails:     A list of dictionaries (with the entries "resolution" and                      "url") for the varying thumbnails      thumbnail:      Full URL to a video thumbnail image. @@ -50,27 +76,17 @@ class InfoExtractor(object):      upload_date:    Video upload date (YYYYMMDD).      uploader_id:    Nickname or id of the video uploader.      location:       Physical location of the video. -    player_url:     SWF Player URL (used for rtmpdump).      subtitles:      The subtitle file contents as a dictionary in the format                      {language: subtitles}. +    duration:       Length of the video in seconds, as an integer.      view_count:     How many users have watched the video on the platform. -    urlhandle:      [internal] The urlHandle to be used to download the file, -                    like returned by urllib.request.urlopen +    like_count:     Number of positive ratings of the video +    dislike_count:  Number of negative ratings of the video +    comment_count:  Number of comments on the video      age_limit:      Age restriction for the video, as an integer (years) -    formats:        A list of dictionaries for each format available, it must -                    be ordered from worst to best quality. Potential fields: -                    * url       Mandatory. The URL of the video file -                    * ext       Will be calculated from url if missing -                    * format    A human-readable description of the format -                                ("mp4 container with h264/opus"). -                                Calculated from the format_id, width, height. -                                and format_note fields if missing. -                    * format_id A short description of the format -                                ("mp4_h264_opus" or "19") -                    * format_note Additional info about the format -                                ("3D" or "DASH video") -                    * width     Width of the video, if known -                    * height    Height of the video, if known +    webpage_url:    The url to the video webpage, if given to youtube-dl it +                    should allow to get the same result again. (It will be set +                    by YoutubeDL if it's missing)      Unless mentioned otherwise, the fields should be Unicode strings. @@ -142,27 +158,40 @@ class InfoExtractor(object):      def IE_NAME(self):          return type(self).__name__[:-2] -    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None): +    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):          """ Returns the response handle """          if note is None:              self.report_download_webpage(video_id)          elif note is not False: -            self.to_screen(u'%s: %s' % (video_id, note)) +            if video_id is None: +                self.to_screen(u'%s' % (note,)) +            else: +                self.to_screen(u'%s: %s' % (video_id, note))          try: -            return compat_urllib_request.urlopen(url_or_request) +            return self._downloader.urlopen(url_or_request)          except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: +            if errnote is False: +                return False              if errnote is None:                  errnote = u'Unable to download webpage' -            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2], cause=err) +            errmsg = u'%s: %s' % (errnote, compat_str(err)) +            if fatal: +                raise ExtractorError(errmsg, sys.exc_info()[2], cause=err) +            else: +                self._downloader.report_warning(errmsg) +                return False -    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None): +    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):          """ Returns a tuple (page content as string, URL handle) """          # Strip hashes from the URL (#1038)          if isinstance(url_or_request, (compat_str, str)):              url_or_request = url_or_request.partition('#')[0] -        urlh = self._request_webpage(url_or_request, video_id, note, errnote) +        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal) +        if urlh is False: +            assert not fatal +            return False          content_type = urlh.headers.get('Content-Type', '')          webpage_bytes = urlh.read()          m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type) @@ -197,9 +226,23 @@ class InfoExtractor(object):          content = webpage_bytes.decode(encoding, 'replace')          return (content, urlh) -    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None): +    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):          """ Returns the data of the page as a string """ -        return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0] +        res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal) +        if res is False: +            return res +        else: +            content, _ = res +            return content + +    def _download_xml(self, url_or_request, video_id, +                      note=u'Downloading XML', errnote=u'Unable to download XML', +                      transform_source=None): +        """Return the xml as an xml.etree.ElementTree.Element""" +        xml_string = self._download_webpage(url_or_request, video_id, note, errnote) +        if transform_source: +            xml_string = transform_source(xml_string) +        return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))      def to_screen(self, msg):          """Print msg to screen, prefixing it with '[ie_name]'""" @@ -222,14 +265,18 @@ class InfoExtractor(object):          self.to_screen(u'Logging in')      #Methods for following #608 -    def url_result(self, url, ie=None): +    @staticmethod +    def url_result(url, ie=None, video_id=None):          """Returns a url that points to a page that should be processed"""          #TODO: ie should be the class used for getting the info          video_info = {'_type': 'url',                        'url': url,                        'ie_key': ie} +        if video_id is not None: +            video_info['id'] = video_id          return video_info -    def playlist_result(self, entries, playlist_id=None, playlist_title=None): +    @staticmethod +    def playlist_result(entries, playlist_id=None, playlist_title=None):          """Returns a playlist"""          video_info = {'_type': 'playlist',                        'entries': entries} @@ -239,7 +286,7 @@ class InfoExtractor(object):              video_info['title'] = playlist_title          return video_info -    def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0): +    def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):          """          Perform a regex search on the given string, using a single or a list of          patterns returning the first matching group. @@ -253,7 +300,7 @@ class InfoExtractor(object):                  mobj = re.search(p, string, flags)                  if mobj: break -        if sys.stderr.isatty() and os.name != 'nt': +        if os.name != 'nt' and sys.stderr.isatty():              _name = u'\033[0;34m%s\033[0m' % name          else:              _name = name @@ -261,7 +308,7 @@ class InfoExtractor(object):          if mobj:              # return the first matching group              return next(g for g in mobj.groups() if g is not None) -        elif default is not None: +        elif default is not _NO_DEFAULT:              return default          elif fatal:              raise RegexNotFoundError(u'Unable to extract %s' % _name) @@ -270,7 +317,7 @@ class InfoExtractor(object):                  u'please report this issue on http://yt-dl.org/bug' % _name)              return None -    def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0): +    def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):          """          Like _search_regex, but strips HTML tags and unescapes entities.          """ @@ -312,13 +359,21 @@ class InfoExtractor(object):      # Helper functions for extracting OpenGraph info      @staticmethod -    def _og_regex(prop): -        return r'<meta.+?property=[\'"]og:%s[\'"].+?content=(?:"(.+?)"|\'(.+?)\')' % re.escape(prop) +    def _og_regexes(prop): +        content_re = r'content=(?:"([^>]+?)"|\'(.+?)\')' +        property_re = r'property=[\'"]og:%s[\'"]' % re.escape(prop) +        template = r'<meta[^>]+?%s[^>]+?%s' +        return [ +            template % (property_re, content_re), +            template % (content_re, property_re), +        ]      def _og_search_property(self, prop, html, name=None, **kargs):          if name is None:              name = 'OpenGraph %s' % prop -        escaped = self._search_regex(self._og_regex(prop), html, name, flags=re.DOTALL, **kargs) +        escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs) +        if escaped is None: +            return None          return unescapeHTML(escaped)      def _og_search_thumbnail(self, html, **kargs): @@ -331,10 +386,22 @@ class InfoExtractor(object):          return self._og_search_property('title', html, **kargs)      def _og_search_video_url(self, html, name='video url', secure=True, **kargs): -        regexes = [self._og_regex('video')] -        if secure: regexes.insert(0, self._og_regex('video:secure_url')) +        regexes = self._og_regexes('video') +        if secure: regexes = self._og_regexes('video:secure_url') + regexes          return self._html_search_regex(regexes, html, name, **kargs) +    def _html_search_meta(self, name, html, display_name=None): +        if display_name is None: +            display_name = name +        return self._html_search_regex( +            r'''(?ix)<meta +                    (?=[^>]+(?:itemprop|name|property)=["\']%s["\']) +                    [^>]+content=["\']([^"\']+)["\']''' % re.escape(name), +            html, display_name, fatal=False) + +    def _dc_search_uploader(self, html): +        return self._html_search_meta('dc.creator', html, 'uploader') +      def _rta_search(self, html):          # See http://www.rtalabel.org/index.php?content=howtofaq#single          if re.search(r'(?ix)<meta\s+name="rating"\s+' @@ -343,6 +410,23 @@ class InfoExtractor(object):              return 18          return 0 +    def _media_rating_search(self, html): +        # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/ +        rating = self._html_search_meta('rating', html) + +        if not rating: +            return None + +        RATING_TABLE = { +            'safe for kids': 0, +            'general': 8, +            '14 years': 14, +            'mature': 17, +            'restricted': 19, +        } +        return RATING_TABLE.get(rating.lower(), None) + +  class SearchInfoExtractor(InfoExtractor):      """ | 
