diff options
Diffstat (limited to 'youtube_dl/extractor/common.py')
| -rw-r--r-- | youtube_dl/extractor/common.py | 198 | 
1 files changed, 154 insertions, 44 deletions
| diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index e68657314..929dd1e97 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals +  import base64  import hashlib  import json @@ -18,6 +20,7 @@ from ..utils import (      clean_html,      compiled_regex_type,      ExtractorError, +    int_or_none,      RegexNotFoundError,      sanitize_filename,      unescapeHTML, @@ -69,6 +72,7 @@ class InfoExtractor(object):                      * vcodec     Name of the video codec in use                      * container  Name of the container format                      * filesize   The number of bytes, if known in advance +                    * filesize_approx  An estimate for the number of bytes                      * player_url SWF Player URL (used for rtmpdump).                      * protocol   The protocol that will be used for the actual                                   download, lower-case. @@ -82,6 +86,12 @@ class InfoExtractor(object):                                   format, irrespective of the file format.                                   -1 for default (order by other properties),                                   -2 or smaller for less than default. +                    * http_referer  HTTP Referer header value to set. +                    * http_method  HTTP method to use for the download. +                    * http_headers  A dictionary of additional HTTP headers +                                 to add to the request. +                    * http_post_data  Additional data to send with a POST +                                 request.      url:            Final video URL.      ext:            Video filename extension.      format:         The video format, defaults to ext (used for --get-format) @@ -106,7 +116,7 @@ class InfoExtractor(object):      upload_date:    Video upload date (YYYYMMDD).                      If not explicitly set, calculated from timestamp.      uploader_id:    Nickname or id of the video uploader. -    location:       Physical location of the video. +    location:       Physical location where the video was filmed.      subtitles:      The subtitle file contents as a dictionary in the format                      {language: subtitles}.      duration:       Length of the video in seconds, as an integer. @@ -194,17 +204,17 @@ class InfoExtractor(object):              self.report_download_webpage(video_id)          elif note is not False:              if video_id is None: -                self.to_screen(u'%s' % (note,)) +                self.to_screen('%s' % (note,))              else: -                self.to_screen(u'%s: %s' % (video_id, note)) +                self.to_screen('%s: %s' % (video_id, note))          try:              return self._downloader.urlopen(url_or_request)          except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:              if errnote is False:                  return False              if errnote is None: -                errnote = u'Unable to download webpage' -            errmsg = u'%s: %s' % (errnote, compat_str(err)) +                errnote = 'Unable to download webpage' +            errmsg = '%s: %s' % (errnote, compat_str(err))              if fatal:                  raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)              else: @@ -241,7 +251,7 @@ class InfoExtractor(object):                  url = url_or_request.get_full_url()              except AttributeError:                  url = url_or_request -            self.to_screen(u'Dumping request to ' + url) +            self.to_screen('Dumping request to ' + url)              dump = base64.b64encode(webpage_bytes).decode('ascii')              self._downloader.to_screen(dump)          if self._downloader.params.get('write_pages', False): @@ -251,11 +261,11 @@ class InfoExtractor(object):                  url = url_or_request              basen = '%s_%s' % (video_id, url)              if len(basen) > 240: -                h = u'___' + hashlib.md5(basen.encode('utf-8')).hexdigest() +                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()                  basen = basen[:240 - len(h)] + h              raw_filename = basen + '.dump'              filename = sanitize_filename(raw_filename, restricted=True) -            self.to_screen(u'Saving request to ' + filename) +            self.to_screen('Saving request to ' + filename)              with open(filename, 'wb') as outf:                  outf.write(webpage_bytes) @@ -264,14 +274,14 @@ class InfoExtractor(object):          except LookupError:              content = webpage_bytes.decode('utf-8', 'replace') -        if (u'<title>Access to this site is blocked</title>' in content and -                u'Websense' in content[:512]): -            msg = u'Access to this webpage has been blocked by Websense filtering software in your network.' +        if ('<title>Access to this site is blocked</title>' in content and +                'Websense' in content[:512]): +            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'              blocked_iframe = self._html_search_regex(                  r'<iframe src="([^"]+)"', content, -                u'Websense information URL', default=None) +                'Websense information URL', default=None)              if blocked_iframe: -                msg += u' Visit %s for more details' % blocked_iframe +                msg += ' Visit %s for more details' % blocked_iframe              raise ExtractorError(msg, expected=True)          return (content, urlh) @@ -286,7 +296,7 @@ class InfoExtractor(object):              return content      def _download_xml(self, url_or_request, video_id, -                      note=u'Downloading XML', errnote=u'Unable to download XML', +                      note='Downloading XML', errnote='Unable to download XML',                        transform_source=None, fatal=True):          """Return the xml as an xml.etree.ElementTree.Element"""          xml_string = self._download_webpage( @@ -298,10 +308,14 @@ class InfoExtractor(object):          return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))      def _download_json(self, url_or_request, video_id, -                       note=u'Downloading JSON metadata', -                       errnote=u'Unable to download JSON metadata', -                       transform_source=None): -        json_string = self._download_webpage(url_or_request, video_id, note, errnote) +                       note='Downloading JSON metadata', +                       errnote='Unable to download JSON metadata', +                       transform_source=None, +                       fatal=True): +        json_string = self._download_webpage( +            url_or_request, video_id, note, errnote, fatal=fatal) +        if (not fatal) and json_string is False: +            return None          if transform_source:              json_string = transform_source(json_string)          try: @@ -310,29 +324,29 @@ class InfoExtractor(object):              raise ExtractorError('Failed to download JSON', cause=ve)      def report_warning(self, msg, video_id=None): -        idstr = u'' if video_id is None else u'%s: ' % video_id +        idstr = '' if video_id is None else '%s: ' % video_id          self._downloader.report_warning( -            u'[%s] %s%s' % (self.IE_NAME, idstr, msg)) +            '[%s] %s%s' % (self.IE_NAME, idstr, msg))      def to_screen(self, msg):          """Print msg to screen, prefixing it with '[ie_name]'""" -        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg)) +        self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))      def report_extraction(self, id_or_name):          """Report information extraction.""" -        self.to_screen(u'%s: Extracting information' % id_or_name) +        self.to_screen('%s: Extracting information' % id_or_name)      def report_download_webpage(self, video_id):          """Report webpage download.""" -        self.to_screen(u'%s: Downloading webpage' % video_id) +        self.to_screen('%s: Downloading webpage' % video_id)      def report_age_confirmation(self):          """Report attempt to confirm age.""" -        self.to_screen(u'Confirming age') +        self.to_screen('Confirming age')      def report_login(self):          """Report attempt to log in.""" -        self.to_screen(u'Logging in') +        self.to_screen('Logging in')      #Methods for following #608      @staticmethod @@ -368,10 +382,11 @@ class InfoExtractor(object):          else:              for p in pattern:                  mobj = re.search(p, string, flags) -                if mobj: break +                if mobj: +                    break          if os.name != 'nt' and sys.stderr.isatty(): -            _name = u'\033[0;34m%s\033[0m' % name +            _name = '\033[0;34m%s\033[0m' % name          else:              _name = name @@ -381,10 +396,10 @@ class InfoExtractor(object):          elif default is not _NO_DEFAULT:              return default          elif fatal: -            raise RegexNotFoundError(u'Unable to extract %s' % _name) +            raise RegexNotFoundError('Unable to extract %s' % _name)          else: -            self._downloader.report_warning(u'unable to extract %s; ' -                u'please report this issue on http://yt-dl.org/bug' % _name) +            self._downloader.report_warning('unable to extract %s; ' +                'please report this issue on http://yt-dl.org/bug' % _name)              return None      def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0): @@ -423,10 +438,26 @@ class InfoExtractor(object):                  else:                      raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)              except (IOError, netrc.NetrcParseError) as err: -                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err)) +                self._downloader.report_warning('parsing .netrc: %s' % compat_str(err))          return (username, password) +    def _get_tfa_info(self): +        """ +        Get the two-factor authentication info +        TODO - asking the user will be required for sms/phone verify +        currently just uses the command line option +        If there's no info available, return None +        """ +        if self._downloader is None: +            return None +        downloader_params = self._downloader.params + +        if downloader_params.get('twofactor', None) is not None: +            return downloader_params['twofactor'] + +        return None +      # Helper functions for extracting OpenGraph info      @staticmethod      def _og_regexes(prop): @@ -447,7 +478,7 @@ class InfoExtractor(object):          return unescapeHTML(escaped)      def _og_search_thumbnail(self, html, **kargs): -        return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs) +        return self._og_search_property('image', html, 'thumbnail url', fatal=False, **kargs)      def _og_search_description(self, html, **kargs):          return self._og_search_property('description', html, fatal=False, **kargs) @@ -456,8 +487,9 @@ class InfoExtractor(object):          return self._og_search_property('title', html, **kargs)      def _og_search_video_url(self, html, name='video url', secure=True, **kargs): -        regexes = self._og_regexes('video') -        if secure: regexes = self._og_regexes('video:secure_url') + regexes +        regexes = self._og_regexes('video') + self._og_regexes('video:url') +        if secure: +            regexes = self._og_regexes('video:secure_url') + regexes          return self._html_search_regex(regexes, html, name, **kargs)      def _og_search_url(self, html, **kargs): @@ -468,7 +500,7 @@ class InfoExtractor(object):              display_name = name          return self._html_search_regex(              r'''(?ix)<meta -                    (?=[^>]+(?:itemprop|name|property)=["\']%s["\']) +                    (?=[^>]+(?:itemprop|name|property)=["\']?%s["\']?)                      [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),              html, display_name, fatal=fatal, **kwargs) @@ -505,7 +537,7 @@ class InfoExtractor(object):      def _sort_formats(self, formats):          if not formats: -            raise ExtractorError(u'No video formats found') +            raise ExtractorError('No video formats found')          def _formats_key(f):              # TODO remove the following workaround @@ -525,9 +557,9 @@ class InfoExtractor(object):              if f.get('vcodec') == 'none':  # audio only                  if self._downloader.params.get('prefer_free_formats'): -                    ORDER = [u'aac', u'mp3', u'm4a', u'webm', u'ogg', u'opus'] +                    ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']                  else: -                    ORDER = [u'webm', u'opus', u'ogg', u'mp3', u'aac', u'm4a'] +                    ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']                  ext_preference = 0                  try:                      audio_ext_preference = ORDER.index(f['ext']) @@ -535,9 +567,9 @@ class InfoExtractor(object):                      audio_ext_preference = -1              else:                  if self._downloader.params.get('prefer_free_formats'): -                    ORDER = [u'flv', u'mp4', u'webm'] +                    ORDER = ['flv', 'mp4', 'webm']                  else: -                    ORDER = [u'webm', u'flv', u'mp4'] +                    ORDER = ['webm', 'flv', 'mp4']                  try:                      ext_preference = ORDER.index(f['ext'])                  except ValueError: @@ -555,6 +587,7 @@ class InfoExtractor(object):                  f.get('abr') if f.get('abr') is not None else -1,                  audio_ext_preference,                  f.get('filesize') if f.get('filesize') is not None else -1, +                f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,                  f.get('format_id'),              )          formats.sort(key=_formats_key) @@ -578,11 +611,88 @@ class InfoExtractor(object):      def _sleep(self, timeout, video_id, msg_template=None):          if msg_template is None: -            msg_template = u'%(video_id)s: Waiting for %(timeout)s seconds' +            msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'          msg = msg_template % {'video_id': video_id, 'timeout': timeout}          self.to_screen(msg)          time.sleep(timeout) +    def _extract_f4m_formats(self, manifest_url, video_id): +        manifest = self._download_xml( +            manifest_url, video_id, 'Downloading f4m manifest', +            'Unable to download f4m manifest') + +        formats = [] +        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media') +        for i, media_el in enumerate(media_nodes): +            tbr = int_or_none(media_el.attrib.get('bitrate')) +            format_id = 'f4m-%d' % (i if tbr is None else tbr) +            formats.append({ +                'format_id': format_id, +                'url': manifest_url, +                'ext': 'flv', +                'tbr': tbr, +                'width': int_or_none(media_el.attrib.get('width')), +                'height': int_or_none(media_el.attrib.get('height')), +            }) +        self._sort_formats(formats) + +        return formats + +    def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None): +        formats = [{ +            'format_id': 'm3u8-meta', +            'url': m3u8_url, +            'ext': ext, +            'protocol': 'm3u8', +            'preference': -1, +            'resolution': 'multiple', +            'format_note': 'Quality selection URL', +        }] + +        m3u8_doc = self._download_webpage(m3u8_url, video_id) +        last_info = None +        kv_rex = re.compile( +            r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)') +        for line in m3u8_doc.splitlines(): +            if line.startswith('#EXT-X-STREAM-INF:'): +                last_info = {} +                for m in kv_rex.finditer(line): +                    v = m.group('val') +                    if v.startswith('"'): +                        v = v[1:-1] +                    last_info[m.group('key')] = v +            elif line.startswith('#') or not line.strip(): +                continue +            else: +                if last_info is None: +                    formats.append({'url': line}) +                    continue +                tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000) + +                f = { +                    'format_id': 'm3u8-%d' % (tbr if tbr else len(formats)), +                    'url': line.strip(), +                    'tbr': tbr, +                    'ext': ext, +                } +                codecs = last_info.get('CODECS') +                if codecs: +                    # TODO: looks like video codec is not always necessarily goes first +                    va_codecs = codecs.split(',') +                    if va_codecs[0]: +                        f['vcodec'] = va_codecs[0].partition('.')[0] +                    if len(va_codecs) > 1 and va_codecs[1]: +                        f['acodec'] = va_codecs[1].partition('.')[0] +                resolution = last_info.get('RESOLUTION') +                if resolution: +                    width_str, height_str = resolution.split('x') +                    f['width'] = int(width_str) +                    f['height'] = int(height_str) +                formats.append(f) +                last_info = {} +        self._sort_formats(formats) +        return formats +  class SearchInfoExtractor(InfoExtractor):      """ @@ -602,7 +712,7 @@ class SearchInfoExtractor(InfoExtractor):      def _real_extract(self, query):          mobj = re.match(self._make_valid_url(), query)          if mobj is None: -            raise ExtractorError(u'Invalid search query "%s"' % query) +            raise ExtractorError('Invalid search query "%s"' % query)          prefix = mobj.group('prefix')          query = mobj.group('query') @@ -613,9 +723,9 @@ class SearchInfoExtractor(InfoExtractor):          else:              n = int(prefix)              if n <= 0: -                raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query)) +                raise ExtractorError('invalid download number %s for query "%s"' % (n, query))              elif n > self._MAX_RESULTS: -                self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n)) +                self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))                  n = self._MAX_RESULTS              return self._get_n_results(query, n) | 
