diff options
Diffstat (limited to 'youtube_dl/extractor/common.py')
| -rw-r--r-- | youtube_dl/extractor/common.py | 35 | 
1 files changed, 21 insertions, 14 deletions
| diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index a0c4af92f..6ab2d68d6 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -10,20 +10,18 @@ import re  import socket  import sys  import time -import xml.etree.ElementTree  from ..compat import (      compat_cookiejar,      compat_cookies,      compat_getpass, -    compat_HTTPError,      compat_http_client,      compat_urllib_error,      compat_urllib_parse,      compat_urllib_parse_urlparse, -    compat_urllib_request,      compat_urlparse,      compat_str, +    compat_etree_fromstring,  )  from ..utils import (      NO_DEFAULT, @@ -38,6 +36,7 @@ from ..utils import (      int_or_none,      RegexNotFoundError,      sanitize_filename, +    sanitized_Request,      unescapeHTML,      unified_strdate,      url_basename, @@ -168,10 +167,11 @@ class InfoExtractor(object):                      "ext" will be calculated from URL if missing      automatic_captions: Like 'subtitles', used by the YoutubeIE for                      automatically generated captions -    duration:       Length of the video in seconds, as an integer. +    duration:       Length of the video in seconds, as an integer or float.      view_count:     How many users have watched the video on the platform.      like_count:     Number of positive ratings of the video      dislike_count:  Number of negative ratings of the video +    repost_count:   Number of reposts of the video      average_rating: Average rating give by users, the scale used depends on the webpage      comment_count:  Number of comments on the video      comments:       A list of comments, each with one or more of the following @@ -310,11 +310,11 @@ class InfoExtractor(object):      @classmethod      def ie_key(cls):          """A string for getting the InfoExtractor with get_info_extractor""" -        return cls.__name__[:-2] +        return compat_str(cls.__name__[:-2])      @property      def IE_NAME(self): -        return type(self).__name__[:-2] +        return compat_str(type(self).__name__[:-2])      def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):          """ Returns the response handle """ @@ -461,7 +461,7 @@ class InfoExtractor(object):              return xml_string          if transform_source:              xml_string = transform_source(xml_string) -        return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8')) +        return compat_etree_fromstring(xml_string.encode('utf-8'))      def _download_json(self, url_or_request, video_id,                         note='Downloading JSON metadata', @@ -645,7 +645,7 @@ class InfoExtractor(object):      # Helper functions for extracting OpenGraph info      @staticmethod      def _og_regexes(prop): -        content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\'|\s*([^\s"\'=<>`]+?))' +        content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'          property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'                         % {'prop': re.escape(prop)})          template = r'<meta[^>]+?%s[^>]+?%s' @@ -841,7 +841,7 @@ class InfoExtractor(object):              self._request_webpage(url, video_id, 'Checking %s URL' % item)              return True          except ExtractorError as e: -            if isinstance(e.cause, compat_HTTPError): +            if isinstance(e.cause, compat_urllib_error.URLError):                  self.to_screen(                      '%s: %s URL is invalid, skipping' % (video_id, item))                  return False @@ -891,6 +891,11 @@ class InfoExtractor(object):          if not media_nodes:              manifest_version = '2.0'              media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media') +        base_url = xpath_text( +            manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'], +            'base URL', default=None) +        if base_url: +            base_url = base_url.strip()          for i, media_el in enumerate(media_nodes):              if manifest_version == '2.0':                  media_url = media_el.attrib.get('href') or media_el.attrib.get('url') @@ -898,7 +903,7 @@ class InfoExtractor(object):                      continue                  manifest_url = (                      media_url if media_url.startswith('http://') or media_url.startswith('https://') -                    else ('/'.join(manifest_url.split('/')[:-1]) + '/' + media_url)) +                    else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))                  # If media_url is itself a f4m manifest do the recursive extraction                  # since bitrates in parent manifest (this one) and media_url manifest                  # may differ leading to inability to resolve the format by requested @@ -943,13 +948,15 @@ class InfoExtractor(object):              if re.match(r'^https?://', u)              else compat_urlparse.urljoin(m3u8_url, u)) -        m3u8_doc = self._download_webpage( +        res = self._download_webpage_handle(              m3u8_url, video_id,              note=note or 'Downloading m3u8 information',              errnote=errnote or 'Failed to download m3u8 information',              fatal=fatal) -        if m3u8_doc is False: -            return m3u8_doc +        if res is False: +            return res +        m3u8_doc, urlh = res +        m3u8_url = urlh.geturl()          last_info = None          last_media = None          kv_rex = re.compile( @@ -1278,7 +1285,7 @@ class InfoExtractor(object):      def _get_cookies(self, url):          """ Return a compat_cookies.SimpleCookie with the cookies for the url """ -        req = compat_urllib_request.Request(url) +        req = sanitized_Request(url)          self._downloader.cookiejar.add_cookie_header(req)          return compat_cookies.SimpleCookie(req.get_header('Cookie')) | 
