Diffstat (limited to 'youtube_dl/extractor/common.py')
-rw-r--r--   youtube_dl/extractor/common.py   198
1 file changed, 154 insertions(+), 44 deletions(-)
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index e68657314..929dd1e97 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -1,3 +1,5 @@
+from __future__ import unicode_literals
+
 import base64
 import hashlib
 import json
@@ -18,6 +20,7 @@ from ..utils import (
     clean_html,
     compiled_regex_type,
     ExtractorError,
+    int_or_none,
     RegexNotFoundError,
     sanitize_filename,
     unescapeHTML,
@@ -69,6 +72,7 @@ class InfoExtractor(object):
                     * vcodec     Name of the video codec in use
                     * container  Name of the container format
                     * filesize   The number of bytes, if known in advance
+                    * filesize_approx  An estimate for the number of bytes
                     * player_url SWF Player URL (used for rtmpdump).
                     * protocol   The protocol that will be used for the actual
                                  download, lower-case.
@@ -82,6 +86,12 @@ class InfoExtractor(object):
                                  format, irrespective of the file format.
                                  -1 for default (order by other properties),
                                  -2 or smaller for less than default.
+                    * http_referer  HTTP Referer header value to set.
+                    * http_method   HTTP method to use for the download.
+                    * http_headers  A dictionary of additional HTTP headers
+                                    to add to the request.
+                    * http_post_data  Additional data to send with a POST
+                                    request.
     url:            Final video URL.
     ext:            Video filename extension.
     format:         The video format, defaults to ext (used for --get-format)
@@ -106,7 +116,7 @@ class InfoExtractor(object):
     upload_date:    Video upload date (YYYYMMDD).
                     If not explicitly set, calculated from timestamp.
     uploader_id:    Nickname or id of the video uploader.
-    location:       Physical location of the video.
+    location:       Physical location where the video was filmed.
     subtitles:      The subtitle file contents as a dictionary in the format
                     {language: subtitles}.
     duration:       Length of the video in seconds, as an integer.
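For illustration, a single entry in an extractor's formats list could combine the newly documented fields as sketched below; every URL and value is invented, only the field names come from the docstring changes above.

# Hypothetical format entry; all values are made up for illustration.
example_format = {
    'format_id': 'http-720p',
    'url': 'https://videos.example.com/720p.mp4',
    'ext': 'mp4',
    'width': 1280,
    'height': 720,
    'filesize_approx': 50 * 1024 * 1024,   # an estimate, not an exact byte count
    'http_referer': 'https://www.example.com/watch?v=abc123',
    'http_method': 'POST',
    'http_headers': {'X-Requested-With': 'XMLHttpRequest'},
    'http_post_data': 'token=abc123',
}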
@@ -194,17 +204,17 @@ class InfoExtractor(object):
             self.report_download_webpage(video_id)
         elif note is not False:
             if video_id is None:
-                self.to_screen(u'%s' % (note,))
+                self.to_screen('%s' % (note,))
             else:
-                self.to_screen(u'%s: %s' % (video_id, note))
+                self.to_screen('%s: %s' % (video_id, note))
         try:
             return self._downloader.urlopen(url_or_request)
         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
             if errnote is False:
                 return False
             if errnote is None:
-                errnote = u'Unable to download webpage'
-            errmsg = u'%s: %s' % (errnote, compat_str(err))
+                errnote = 'Unable to download webpage'
+            errmsg = '%s: %s' % (errnote, compat_str(err))
             if fatal:
                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
             else:
@@ -241,7 +251,7 @@ class InfoExtractor(object):
                 url = url_or_request.get_full_url()
             except AttributeError:
                 url = url_or_request
-            self.to_screen(u'Dumping request to ' + url)
+            self.to_screen('Dumping request to ' + url)
             dump = base64.b64encode(webpage_bytes).decode('ascii')
             self._downloader.to_screen(dump)
         if self._downloader.params.get('write_pages', False):
@@ -251,11 +261,11 @@ class InfoExtractor(object):
                 url = url_or_request
             basen = '%s_%s' % (video_id, url)
             if len(basen) > 240:
-                h = u'___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
+                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                 basen = basen[:240 - len(h)] + h
             raw_filename = basen + '.dump'
             filename = sanitize_filename(raw_filename, restricted=True)
-            self.to_screen(u'Saving request to ' + filename)
+            self.to_screen('Saving request to ' + filename)
             with open(filename, 'wb') as outf:
                 outf.write(webpage_bytes)
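As context for the hunks above, this is the calling convention they support: an extractor subclass passes note/errnote (and optionally fatal=False) through _download_webpage(). The sketch below is hypothetical; the site, URL pattern and regexes are invented.

import re

from .common import InfoExtractor
from ..utils import ExtractorError


class ExampleIE(InfoExtractor):
    # Hypothetical extractor, for illustration only.
    _VALID_URL = r'https?://(?:www\.)?example\.com/watch/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')
        # note/errnote customize the status and error messages printed by
        # _request_webpage(); with fatal=False a failed request returns
        # False instead of raising ExtractorError.
        webpage = self._download_webpage(
            url, video_id, note='Downloading watch page',
            errnote='Unable to download watch page', fatal=False)
        if webpage is False:
            raise ExtractorError('The watch page could not be retrieved', expected=True)
        return {
            'id': video_id,
            'title': self._html_search_regex(
                r'<h1[^>]*>([^<]+)</h1>', webpage, 'title'),
            'url': self._search_regex(
                r'data-video-url="([^"]+)"', webpage, 'video URL'),
        }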
@@ -264,14 +274,14 @@ class InfoExtractor(object):
         except LookupError:
             content = webpage_bytes.decode('utf-8', 'replace')

-        if (u'<title>Access to this site is blocked</title>' in content and
-                u'Websense' in content[:512]):
-            msg = u'Access to this webpage has been blocked by Websense filtering software in your network.'
+        if ('<title>Access to this site is blocked</title>' in content and
+                'Websense' in content[:512]):
+            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
             blocked_iframe = self._html_search_regex(
                 r'<iframe src="([^"]+)"', content,
-                u'Websense information URL', default=None)
+                'Websense information URL', default=None)
             if blocked_iframe:
-                msg += u' Visit %s for more details' % blocked_iframe
+                msg += ' Visit %s for more details' % blocked_iframe
             raise ExtractorError(msg, expected=True)

         return (content, urlh)
@@ -286,7 +296,7 @@ class InfoExtractor(object):
         return content

     def _download_xml(self, url_or_request, video_id,
-                      note=u'Downloading XML', errnote=u'Unable to download XML',
+                      note='Downloading XML', errnote='Unable to download XML',
                       transform_source=None, fatal=True):
         """Return the xml as an xml.etree.ElementTree.Element"""
         xml_string = self._download_webpage(
@@ -298,10 +308,14 @@ class InfoExtractor(object):
         return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))

     def _download_json(self, url_or_request, video_id,
-                       note=u'Downloading JSON metadata',
-                       errnote=u'Unable to download JSON metadata',
-                       transform_source=None):
-        json_string = self._download_webpage(url_or_request, video_id, note, errnote)
+                       note='Downloading JSON metadata',
+                       errnote='Unable to download JSON metadata',
+                       transform_source=None,
+                       fatal=True):
+        json_string = self._download_webpage(
+            url_or_request, video_id, note, errnote, fatal=fatal)
+        if (not fatal) and json_string is False:
+            return None
         if transform_source:
             json_string = transform_source(json_string)
         try:
@@ -310,29 +324,29 @@ class InfoExtractor(object):
             raise ExtractorError('Failed to download JSON', cause=ve)

     def report_warning(self, msg, video_id=None):
-        idstr = u'' if video_id is None else u'%s: ' % video_id
+        idstr = '' if video_id is None else '%s: ' % video_id
         self._downloader.report_warning(
-            u'[%s] %s%s' % (self.IE_NAME, idstr, msg))
+            '[%s] %s%s' % (self.IE_NAME, idstr, msg))

     def to_screen(self, msg):
         """Print msg to screen, prefixing it with '[ie_name]'"""
-        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
+        self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))

     def report_extraction(self, id_or_name):
         """Report information extraction."""
-        self.to_screen(u'%s: Extracting information' % id_or_name)
+        self.to_screen('%s: Extracting information' % id_or_name)

     def report_download_webpage(self, video_id):
         """Report webpage download."""
-        self.to_screen(u'%s: Downloading webpage' % video_id)
+        self.to_screen('%s: Downloading webpage' % video_id)

     def report_age_confirmation(self):
         """Report attempt to confirm age."""
-        self.to_screen(u'Confirming age')
+        self.to_screen('Confirming age')

     def report_login(self):
         """Report attempt to log in."""
-        self.to_screen(u'Logging in')
+        self.to_screen('Logging in')

     #Methods for following #608
     @staticmethod
@@ -368,10 +382,11 @@ class InfoExtractor(object):
         else:
             for p in pattern:
                 mobj = re.search(p, string, flags)
-                if mobj: break
+                if mobj:
+                    break

         if os.name != 'nt' and sys.stderr.isatty():
-            _name = u'\033[0;34m%s\033[0m' % name
+            _name = '\033[0;34m%s\033[0m' % name
         else:
             _name = name

@@ -381,10 +396,10 @@ class InfoExtractor(object):
         elif default is not _NO_DEFAULT:
             return default
         elif fatal:
-            raise RegexNotFoundError(u'Unable to extract %s' % _name)
+            raise RegexNotFoundError('Unable to extract %s' % _name)
         else:
-            self._downloader.report_warning(u'unable to extract %s; '
-                u'please report this issue on http://yt-dl.org/bug' % _name)
+            self._downloader.report_warning('unable to extract %s; '
+                'please report this issue on http://yt-dl.org/bug' % _name)
             return None

     def _html_search_regex(self, pattern, string, name,
                            default=_NO_DEFAULT, fatal=True, flags=0):
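The new fatal parameter of _download_json() lets an extractor treat missing metadata as optional. A rough sketch under those assumptions follows; the API URL, class name and field names are hypothetical.

from .common import InfoExtractor


class ExampleMetaIE(InfoExtractor):
    # Hypothetical extractor; only the _download_json() call mirrors the hunk above.
    _VALID_URL = r'https?://api\.example\.com/clips/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        video_id = self._search_regex(r'/clips/([0-9]+)', url, 'video id')
        meta = self._download_json(
            'https://api.example.com/clips/%s.json' % video_id, video_id,
            note='Downloading clip metadata', fatal=False)
        if meta is None:
            # With fatal=False a failed request returns None instead of
            # raising, so the extractor can fall back to defaults.
            self.report_warning('clip metadata unavailable', video_id)
            meta = {}
        return {
            'id': video_id,
            'title': meta.get('title', video_id),
            'url': meta.get('download_url'),
            'duration': meta.get('duration'),
        }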
@@ -423,10 +438,26 @@ class InfoExtractor(object):
             else:
                 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
         except (IOError, netrc.NetrcParseError) as err:
-            self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
+            self._downloader.report_warning('parsing .netrc: %s' % compat_str(err))

         return (username, password)

+    def _get_tfa_info(self):
+        """
+        Get the two-factor authentication info
+        TODO - asking the user will be required for sms/phone verify
+        currently just uses the command line option
+        If there's no info available, return None
+        """
+        if self._downloader is None:
+            return None
+        downloader_params = self._downloader.params
+
+        if downloader_params.get('twofactor', None) is not None:
+            return downloader_params['twofactor']
+
+        return None
+
     # Helper functions for extracting OpenGraph info
     @staticmethod
     def _og_regexes(prop):
@@ -447,7 +478,7 @@ class InfoExtractor(object):
         return unescapeHTML(escaped)

     def _og_search_thumbnail(self, html, **kargs):
-        return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)
+        return self._og_search_property('image', html, 'thumbnail url', fatal=False, **kargs)

     def _og_search_description(self, html, **kargs):
         return self._og_search_property('description', html, fatal=False, **kargs)
@@ -456,8 +487,9 @@ class InfoExtractor(object):
         return self._og_search_property('title', html, **kargs)

     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
-        regexes = self._og_regexes('video')
-        if secure: regexes = self._og_regexes('video:secure_url') + regexes
+        regexes = self._og_regexes('video') + self._og_regexes('video:url')
+        if secure:
+            regexes = self._og_regexes('video:secure_url') + regexes
         return self._html_search_regex(regexes, html, name, **kargs)

     def _og_search_url(self, html, **kargs):
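A sketch of how an extractor's login code might consume the new _get_tfa_info() helper; the class, machine name and form fields are invented, only _get_login_info() and _get_tfa_info() come from common.py.

from .common import InfoExtractor


class ExampleLoginIE(InfoExtractor):
    # Hypothetical extractor used only to illustrate _get_tfa_info().
    _NETRC_MACHINE = 'example'

    def _build_login_form(self):
        username, password = self._get_login_info()
        if username is None:
            return None
        login_form = {'user': username, 'passwd': password}
        # _get_tfa_info() returns the value passed via --twofactor, or None.
        tfa_code = self._get_tfa_info()
        if tfa_code is not None:
            login_form['tfa_code'] = tfa_code
        return login_form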
@@ -468,7 +500,7 @@ class InfoExtractor(object):
             display_name = name
         return self._html_search_regex(
             r'''(?ix)<meta
-                    (?=[^>]+(?:itemprop|name|property)=["\']%s["\'])
+                    (?=[^>]+(?:itemprop|name|property)=["\']?%s["\']?)
                     [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
             html, display_name, fatal=fatal, **kwargs)

@@ -505,7 +537,7 @@ class InfoExtractor(object):

     def _sort_formats(self, formats):
         if not formats:
-            raise ExtractorError(u'No video formats found')
+            raise ExtractorError('No video formats found')

         def _formats_key(f):
             # TODO remove the following workaround
@@ -525,9 +557,9 @@ class InfoExtractor(object):
             if f.get('vcodec') == 'none':  # audio only
                 if self._downloader.params.get('prefer_free_formats'):
-                    ORDER = [u'aac', u'mp3', u'm4a', u'webm', u'ogg', u'opus']
+                    ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
                 else:
-                    ORDER = [u'webm', u'opus', u'ogg', u'mp3', u'aac', u'm4a']
+                    ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
                 ext_preference = 0
                 try:
                     audio_ext_preference = ORDER.index(f['ext'])
@@ -535,9 +567,9 @@ class InfoExtractor(object):
                     audio_ext_preference = -1
             else:
                 if self._downloader.params.get('prefer_free_formats'):
-                    ORDER = [u'flv', u'mp4', u'webm']
+                    ORDER = ['flv', 'mp4', 'webm']
                 else:
-                    ORDER = [u'webm', u'flv', u'mp4']
+                    ORDER = ['webm', 'flv', 'mp4']
                 try:
                     ext_preference = ORDER.index(f['ext'])
                 except ValueError:
@@ -555,6 +587,7 @@ class InfoExtractor(object):
             f.get('abr') if f.get('abr') is not None else -1,
             audio_ext_preference,
             f.get('filesize') if f.get('filesize') is not None else -1,
+            f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
             f.get('format_id'),
         )
         formats.sort(key=_formats_key)
@@ -578,11 +611,88 @@ class InfoExtractor(object):

     def _sleep(self, timeout, video_id, msg_template=None):
         if msg_template is None:
-            msg_template = u'%(video_id)s: Waiting for %(timeout)s seconds'
+            msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
         self.to_screen(msg)
         time.sleep(timeout)

+    def _extract_f4m_formats(self, manifest_url, video_id):
+        manifest = self._download_xml(
+            manifest_url, video_id, 'Downloading f4m manifest',
+            'Unable to download f4m manifest')
+
+        formats = []
+        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
+        for i, media_el in enumerate(media_nodes):
+            tbr = int_or_none(media_el.attrib.get('bitrate'))
+            format_id = 'f4m-%d' % (i if tbr is None else tbr)
+            formats.append({
+                'format_id': format_id,
+                'url': manifest_url,
+                'ext': 'flv',
+                'tbr': tbr,
+                'width': int_or_none(media_el.attrib.get('width')),
+                'height': int_or_none(media_el.attrib.get('height')),
+            })
+        self._sort_formats(formats)
+
+        return formats
+
+    def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None):
+        formats = [{
+            'format_id': 'm3u8-meta',
+            'url': m3u8_url,
+            'ext': ext,
+            'protocol': 'm3u8',
+            'preference': -1,
+            'resolution': 'multiple',
+            'format_note': 'Quality selection URL',
+        }]
+
+        m3u8_doc = self._download_webpage(m3u8_url, video_id)
+        last_info = None
+        kv_rex = re.compile(
+            r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
+        for line in m3u8_doc.splitlines():
+            if line.startswith('#EXT-X-STREAM-INF:'):
+                last_info = {}
+                for m in kv_rex.finditer(line):
+                    v = m.group('val')
+                    if v.startswith('"'):
+                        v = v[1:-1]
+                    last_info[m.group('key')] = v
+            elif line.startswith('#') or not line.strip():
+                continue
+            else:
+                if last_info is None:
+                    formats.append({'url': line})
+                    continue
+                tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
+
+                f = {
+                    'format_id': 'm3u8-%d' % (tbr if tbr else len(formats)),
+                    'url': line.strip(),
+                    'tbr': tbr,
+                    'ext': ext,
+                }
+                codecs = last_info.get('CODECS')
+                if codecs:
+                    # TODO: looks like video codec is not always necessarily goes first
+                    va_codecs = codecs.split(',')
+                    if va_codecs[0]:
+                        f['vcodec'] = va_codecs[0].partition('.')[0]
+                    if len(va_codecs) > 1 and va_codecs[1]:
+                        f['acodec'] = va_codecs[1].partition('.')[0]
+                resolution = last_info.get('RESOLUTION')
+                if resolution:
+                    width_str, height_str = resolution.split('x')
+                    f['width'] = int(width_str)
+                    f['height'] = int(height_str)
+                formats.append(f)
+                last_info = {}
+        self._sort_formats(formats)
+        return formats
+

 class SearchInfoExtractor(InfoExtractor):
     """
@@ -602,7 +712,7 @@ class SearchInfoExtractor(InfoExtractor):
     def _real_extract(self, query):
         mobj = re.match(self._make_valid_url(), query)
         if mobj is None:
-            raise ExtractorError(u'Invalid search query "%s"' % query)
+            raise ExtractorError('Invalid search query "%s"' % query)

         prefix = mobj.group('prefix')
         query = mobj.group('query')
@@ -613,9 +723,9 @@ class SearchInfoExtractor(InfoExtractor):
         else:
             n = int(prefix)
             if n <= 0:
-                raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
+                raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
             elif n > self._MAX_RESULTS:
-                self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
+                self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                 n = self._MAX_RESULTS
             return self._get_n_results(query, n)
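To make the attribute parsing in _extract_m3u8_formats() above concrete, here is the same key/value regex applied in isolation to an invented #EXT-X-STREAM-INF line; only the pattern and the quote-stripping loop come from the diff.

import re

# Same key/value pattern as in _extract_m3u8_formats above; the playlist
# line below is made up for illustration.
kv_rex = re.compile(
    r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')

line = '#EXT-X-STREAM-INF:BANDWIDTH=1280000,RESOLUTION=1280x720,CODECS="avc1.64001f,mp4a.40.2"'
last_info = {}
for m in kv_rex.finditer(line):
    v = m.group('val')
    if v.startswith('"'):
        v = v[1:-1]
    last_info[m.group('key')] = v

print(last_info['BANDWIDTH'])   # '1280000' -> tbr 1280 via int_or_none(..., scale=1000)
print(last_info['RESOLUTION'])  # '1280x720' -> width 1280, height 720
print(last_info['CODECS'])      # 'avc1.64001f,mp4a.40.2' -> vcodec 'avc1', acodec 'mp4a'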