diff options
Diffstat (limited to 'youtube_dl')
41 files changed, 647 insertions, 578 deletions
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 73a372df4..8732f3db4 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -22,13 +22,15 @@ import traceback if os.name == 'nt': import ctypes -from .utils import ( +from .compat import ( compat_cookiejar, compat_expanduser, compat_http_client, compat_str, compat_urllib_error, compat_urllib_request, +) +from .utils import ( escape_url, ContentTooShortError, date_from_str, @@ -62,6 +64,7 @@ from .utils import ( from .cache import Cache from .extractor import get_info_extractor, gen_extractors from .downloader import get_suitable_downloader +from .downloader.rtmp import rtmpdump_version from .postprocessor import FFmpegMergerPP, FFmpegPostProcessor from .version import __version__ @@ -1321,6 +1324,7 @@ class YoutubeDL(object): platform.python_version(), platform_name())) exe_versions = FFmpegPostProcessor.get_versions() + exe_versions['rtmpdump'] = rtmpdump_version() exe_str = ', '.join( '%s %s' % (exe, v) for exe, v in sorted(exe_versions.items()) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 3c968082c..685dd8e5e 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -13,10 +13,12 @@ import sys from .options import ( parseOpts, ) -from .utils import ( +from .compat import ( compat_expanduser, compat_getpass, compat_print, +) +from .utils import ( DateRange, DEFAULT_OUTTMPL, decodeOption, diff --git a/youtube_dl/cache.py b/youtube_dl/cache.py index ac5925d32..2d9b426cb 100644 --- a/youtube_dl/cache.py +++ b/youtube_dl/cache.py @@ -8,10 +8,8 @@ import re import shutil import traceback -from .utils import ( - compat_expanduser, - write_json_file, -) +from .compat import compat_expanduser +from .utils import write_json_file class Cache(object): diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py new file mode 100644 index 000000000..b3752634a --- /dev/null +++ b/youtube_dl/compat.py @@ -0,0 +1,317 @@ +from __future__ import unicode_literals + +import getpass +import os +import subprocess +import sys + + +try: + import urllib.request as compat_urllib_request +except ImportError: # Python 2 + import urllib2 as compat_urllib_request + +try: + import urllib.error as compat_urllib_error +except ImportError: # Python 2 + import urllib2 as compat_urllib_error + +try: + import urllib.parse as compat_urllib_parse +except ImportError: # Python 2 + import urllib as compat_urllib_parse + +try: + from urllib.parse import urlparse as compat_urllib_parse_urlparse +except ImportError: # Python 2 + from urlparse import urlparse as compat_urllib_parse_urlparse + +try: + import urllib.parse as compat_urlparse +except ImportError: # Python 2 + import urlparse as compat_urlparse + +try: + import http.cookiejar as compat_cookiejar +except ImportError: # Python 2 + import cookielib as compat_cookiejar + +try: + import html.entities as compat_html_entities +except ImportError: # Python 2 + import htmlentitydefs as compat_html_entities + +try: + import html.parser as compat_html_parser +except ImportError: # Python 2 + import HTMLParser as compat_html_parser + +try: + import http.client as compat_http_client +except ImportError: # Python 2 + import httplib as compat_http_client + +try: + from urllib.error import HTTPError as compat_HTTPError +except ImportError: # Python 2 + from urllib2 import HTTPError as compat_HTTPError + +try: + from urllib.request import urlretrieve as compat_urlretrieve +except ImportError: # Python 2 + from urllib import urlretrieve as compat_urlretrieve + + +try: + from subprocess import DEVNULL + compat_subprocess_get_DEVNULL = lambda: DEVNULL +except ImportError: + compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w') + +try: + from urllib.parse import unquote as compat_urllib_parse_unquote +except ImportError: + def compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'): + if string == '': + return string + res = string.split('%') + if len(res) == 1: + return string + if encoding is None: + encoding = 'utf-8' + if errors is None: + errors = 'replace' + # pct_sequence: contiguous sequence of percent-encoded bytes, decoded + pct_sequence = b'' + string = res[0] + for item in res[1:]: + try: + if not item: + raise ValueError + pct_sequence += item[:2].decode('hex') + rest = item[2:] + if not rest: + # This segment was just a single percent-encoded character. + # May be part of a sequence of code units, so delay decoding. + # (Stored in pct_sequence). + continue + except ValueError: + rest = '%' + item + # Encountered non-percent-encoded characters. Flush the current + # pct_sequence. + string += pct_sequence.decode(encoding, errors) + rest + pct_sequence = b'' + if pct_sequence: + # Flush the final pct_sequence + string += pct_sequence.decode(encoding, errors) + return string + + +try: + from urllib.parse import parse_qs as compat_parse_qs +except ImportError: # Python 2 + # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib. + # Python 2's version is apparently totally broken + + def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False, + encoding='utf-8', errors='replace'): + qs, _coerce_result = qs, unicode + pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')] + r = [] + for name_value in pairs: + if not name_value and not strict_parsing: + continue + nv = name_value.split('=', 1) + if len(nv) != 2: + if strict_parsing: + raise ValueError("bad query field: %r" % (name_value,)) + # Handle case of a control-name with no equal sign + if keep_blank_values: + nv.append('') + else: + continue + if len(nv[1]) or keep_blank_values: + name = nv[0].replace('+', ' ') + name = compat_urllib_parse_unquote( + name, encoding=encoding, errors=errors) + name = _coerce_result(name) + value = nv[1].replace('+', ' ') + value = compat_urllib_parse_unquote( + value, encoding=encoding, errors=errors) + value = _coerce_result(value) + r.append((name, value)) + return r + + def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False, + encoding='utf-8', errors='replace'): + parsed_result = {} + pairs = _parse_qsl(qs, keep_blank_values, strict_parsing, + encoding=encoding, errors=errors) + for name, value in pairs: + if name in parsed_result: + parsed_result[name].append(value) + else: + parsed_result[name] = [value] + return parsed_result + +try: + compat_str = unicode # Python 2 +except NameError: + compat_str = str + +try: + compat_chr = unichr # Python 2 +except NameError: + compat_chr = chr + +try: + from xml.etree.ElementTree import ParseError as compat_xml_parse_error +except ImportError: # Python 2.6 + from xml.parsers.expat import ExpatError as compat_xml_parse_error + +try: + from shlex import quote as shlex_quote +except ImportError: # Python < 3.3 + def shlex_quote(s): + return "'" + s.replace("'", "'\"'\"'") + "'" + + +def compat_ord(c): + if type(c) is int: return c + else: return ord(c) + + +if sys.version_info >= (3, 0): + compat_getenv = os.getenv + compat_expanduser = os.path.expanduser +else: + # Environment variables should be decoded with filesystem encoding. + # Otherwise it will fail if any non-ASCII characters present (see #3854 #3217 #2918) + + def compat_getenv(key, default=None): + from .utils import get_filesystem_encoding + env = os.getenv(key, default) + if env: + env = env.decode(get_filesystem_encoding()) + return env + + # HACK: The default implementations of os.path.expanduser from cpython do not decode + # environment variables with filesystem encoding. We will work around this by + # providing adjusted implementations. + # The following are os.path.expanduser implementations from cpython 2.7.8 stdlib + # for different platforms with correct environment variables decoding. + + if os.name == 'posix': + def compat_expanduser(path): + """Expand ~ and ~user constructions. If user or $HOME is unknown, + do nothing.""" + if not path.startswith('~'): + return path + i = path.find('/', 1) + if i < 0: + i = len(path) + if i == 1: + if 'HOME' not in os.environ: + import pwd + userhome = pwd.getpwuid(os.getuid()).pw_dir + else: + userhome = compat_getenv('HOME') + else: + import pwd + try: + pwent = pwd.getpwnam(path[1:i]) + except KeyError: + return path + userhome = pwent.pw_dir + userhome = userhome.rstrip('/') + return (userhome + path[i:]) or '/' + elif os.name == 'nt' or os.name == 'ce': + def compat_expanduser(path): + """Expand ~ and ~user constructs. + + If user or $HOME is unknown, do nothing.""" + if path[:1] != '~': + return path + i, n = 1, len(path) + while i < n and path[i] not in '/\\': + i = i + 1 + + if 'HOME' in os.environ: + userhome = compat_getenv('HOME') + elif 'USERPROFILE' in os.environ: + userhome = compat_getenv('USERPROFILE') + elif not 'HOMEPATH' in os.environ: + return path + else: + try: + drive = compat_getenv('HOMEDRIVE') + except KeyError: + drive = '' + userhome = os.path.join(drive, compat_getenv('HOMEPATH')) + + if i != 1: #~user + userhome = os.path.join(os.path.dirname(userhome), path[1:i]) + + return userhome + path[i:] + else: + compat_expanduser = os.path.expanduser + + +if sys.version_info < (3, 0): + def compat_print(s): + from .utils import preferredencoding + print(s.encode(preferredencoding(), 'xmlcharrefreplace')) +else: + def compat_print(s): + assert type(s) == type(u'') + print(s) + + +try: + subprocess_check_output = subprocess.check_output +except AttributeError: + def subprocess_check_output(*args, **kwargs): + assert 'input' not in kwargs + p = subprocess.Popen(*args, stdout=subprocess.PIPE, **kwargs) + output, _ = p.communicate() + ret = p.poll() + if ret: + raise subprocess.CalledProcessError(ret, p.args, output=output) + return output + +if sys.version_info < (3, 0) and sys.platform == 'win32': + def compat_getpass(prompt, *args, **kwargs): + if isinstance(prompt, compat_str): + from .utils import preferredencoding + prompt = prompt.encode(preferredencoding()) + return getpass.getpass(prompt, *args, **kwargs) +else: + compat_getpass = getpass.getpass + + +__all__ = [ + 'compat_HTTPError', + 'compat_chr', + 'compat_cookiejar', + 'compat_expanduser', + 'compat_getenv', + 'compat_getpass', + 'compat_html_entities', + 'compat_html_parser', + 'compat_http_client', + 'compat_ord', + 'compat_parse_qs', + 'compat_print', + 'compat_str', + 'compat_subprocess_get_DEVNULL', + 'compat_urllib_error', + 'compat_urllib_parse', + 'compat_urllib_parse_unquote', + 'compat_urllib_parse_urlparse', + 'compat_urllib_request', + 'compat_urlparse', + 'compat_urlretrieve', + 'compat_xml_parse_error', + 'shlex_quote', + 'subprocess_check_output', +] diff --git a/youtube_dl/downloader/rtmp.py b/youtube_dl/downloader/rtmp.py index 5eb108302..17d9631fa 100644 --- a/youtube_dl/downloader/rtmp.py +++ b/youtube_dl/downloader/rtmp.py @@ -12,9 +12,15 @@ from ..utils import ( compat_str, encodeFilename, format_bytes, + get_exe_version, ) +def rtmpdump_version(): + return get_exe_version( + 'rtmpdump', ['--help'], r'(?i)RTMPDump\s*v?([0-9a-zA-Z._-]+)') + + class RtmpFD(FileDownloader): def real_download(self, filename, info_dict): def run_rtmpdump(args): diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 3f85c99cd..3c1807f15 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -421,6 +421,7 @@ from .vesti import VestiIE from .vevo import VevoIE from .vgtv import VGTVIE from .vh1 import VH1IE +from .vice import ViceIE from .viddler import ViddlerIE from .videobam import VideoBamIE from .videodetective import VideoDetectiveIE diff --git a/youtube_dl/extractor/addanime.py b/youtube_dl/extractor/addanime.py index fcf296057..11f149f9e 100644 --- a/youtube_dl/extractor/addanime.py +++ b/youtube_dl/extractor/addanime.py @@ -3,12 +3,13 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import ( +from ..compat import ( compat_HTTPError, compat_str, compat_urllib_parse, compat_urllib_parse_urlparse, - +) +from ..utils import ( ExtractorError, ) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index ad22cbafd..a6920685e 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -14,6 +14,7 @@ from ..utils import ( compat_str, compat_urllib_request, compat_parse_qs, + compat_urllib_parse_urlparse, determine_ext, ExtractorError, @@ -23,7 +24,7 @@ from ..utils import ( class BrightcoveIE(InfoExtractor): - _VALID_URL = r'https?://.*brightcove\.com/(services|viewer).*\?(?P<query>.*)' + _VALID_URL = r'https?://.*brightcove\.com/(services|viewer).*?\?(?P<query>.*)' _FEDERATED_URL_TEMPLATE = 'http://c.brightcove.com/services/viewer/htmlFederated?%s' _TESTS = [ @@ -260,11 +261,19 @@ class BrightcoveIE(InfoExtractor): formats = [] for rend in renditions: url = rend['defaultURL'] + if not url: + continue if rend['remote']: - # This type of renditions are served through akamaihd.net, - # but they don't use f4m manifests - url = url.replace('control/', '') + '?&v=3.3.0&fp=13&r=FEEFJ&g=RTSJIMBMPFPB' - ext = 'flv' + url_comp = compat_urllib_parse_urlparse(url) + if url_comp.path.endswith('.m3u8'): + formats.extend( + self._extract_m3u8_formats(url, info['id'], 'mp4')) + continue + elif 'akamaihd.net' in url_comp.netloc: + # This type of renditions are served through + # akamaihd.net, but they don't use f4m manifests + url = url.replace('control/', '') + '?&v=3.3.0&fp=13&r=FEEFJ&g=RTSJIMBMPFPB' + ext = 'flv' else: ext = determine_ext(url) size = rend.get('size') diff --git a/youtube_dl/extractor/channel9.py b/youtube_dl/extractor/channel9.py index 4f000292b..16d800512 100644 --- a/youtube_dl/extractor/channel9.py +++ b/youtube_dl/extractor/channel9.py @@ -27,7 +27,7 @@ class Channel9IE(InfoExtractor): 'title': 'Developer Kick-Off Session: Stuff We Love', 'description': 'md5:c08d72240b7c87fcecafe2692f80e35f', 'duration': 4576, - 'thumbnail': 'http://media.ch9.ms/ch9/9d51/03902f2d-fc97-4d3c-b195-0bfe15a19d51/KOS002_220.jpg', + 'thumbnail': 'http://video.ch9.ms/ch9/9d51/03902f2d-fc97-4d3c-b195-0bfe15a19d51/KOS002_220.jpg', 'session_code': 'KOS002', 'session_day': 'Day 1', 'session_room': 'Arena 1A', @@ -43,7 +43,7 @@ class Channel9IE(InfoExtractor): 'title': 'Self-service BI with Power BI - nuclear testing', 'description': 'md5:d1e6ecaafa7fb52a2cacdf9599829f5b', 'duration': 1540, - 'thumbnail': 'http://media.ch9.ms/ch9/87e1/0300391f-a455-4c72-bec3-4422f19287e1/selfservicenuk_512.jpg', + 'thumbnail': 'http://video.ch9.ms/ch9/87e1/0300391f-a455-4c72-bec3-4422f19287e1/selfservicenuk_512.jpg', 'authors': [ 'Mike Wilmot' ], }, } @@ -94,7 +94,7 @@ class Channel9IE(InfoExtractor): def _extract_title(self, html): title = self._html_search_meta('title', html, 'title') - if title is None: + if title is None: title = self._og_search_title(html) TITLE_SUFFIX = ' (Channel 9)' if title is not None and title.endswith(TITLE_SUFFIX): @@ -115,7 +115,7 @@ class Channel9IE(InfoExtractor): return self._html_search_meta('description', html, 'description') def _extract_duration(self, html): - m = re.search(r'data-video_duration="(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html) + m = re.search(r'"length": *"(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html) return ((int(m.group('hours')) * 60 * 60) + (int(m.group('minutes')) * 60) + int(m.group('seconds'))) if m else None def _extract_slides(self, html): @@ -167,7 +167,7 @@ class Channel9IE(InfoExtractor): return re.findall(r'<a href="/Events/Speakers/[^"]+">([^<]+)</a>', html) def _extract_content(self, html, content_path): - # Look for downloadable content + # Look for downloadable content formats = self._formats_from_html(html) slides = self._extract_slides(html) zip_ = self._extract_zip(html) @@ -258,16 +258,17 @@ class Channel9IE(InfoExtractor): webpage = self._download_webpage(url, content_path, 'Downloading web page') - page_type_m = re.search(r'<meta name="Search.PageType" content="(?P<pagetype>[^"]+)"/>', webpage) - if page_type_m is None: - raise ExtractorError('Search.PageType not found, don\'t know how to process this page', expected=True) - - page_type = page_type_m.group('pagetype') - if page_type == 'List': # List page, may contain list of 'item'-like objects + page_type_m = re.search(r'<meta name="WT.entryid" content="(?P<pagetype>[^:]+)[^"]+"/>', webpage) + if page_type_m is not None: + page_type = page_type_m.group('pagetype') + if page_type == 'Entry': # Any 'item'-like page, may contain downloadable content + return self._extract_entry_item(webpage, content_path) + elif page_type == 'Session': # Event session page, may contain downloadable content + return self._extract_session(webpage, content_path) + elif page_type == 'Event': + return self._extract_list(content_path) + else: + raise ExtractorError('Unexpected WT.entryid %s' % page_type, expected=True) + + else: # Assuming list return self._extract_list(content_path) - elif page_type == 'Entry.Item': # Any 'item'-like page, may contain downloadable content - return self._extract_entry_item(webpage, content_path) - elif page_type == 'Session': # Event session page, may contain downloadable content - return self._extract_session(webpage, content_path) - else: - raise ExtractorError('Unexpected Search.PageType %s' % page_type, expected=True)
\ No newline at end of file diff --git a/youtube_dl/extractor/cinemassacre.py b/youtube_dl/extractor/cinemassacre.py index d064a28f9..31fe906b4 100644 --- a/youtube_dl/extractor/cinemassacre.py +++ b/youtube_dl/extractor/cinemassacre.py @@ -42,11 +42,12 @@ class CinemassacreIE(InfoExtractor): webpage = self._download_webpage(url, display_id) video_date = mobj.group('date_Y') + mobj.group('date_m') + mobj.group('date_d') - mobj = re.search(r'src="(?P<embed_url>http://player\.screenwavemedia\.com/play/[a-zA-Z]+\.php\?[^"]*\bid=(?:Cinemassacre-)?(?P<video_id>.+?))"', webpage) + mobj = re.search(r'src="(?P<embed_url>http://player\.screenwavemedia\.com/play/[a-zA-Z]+\.php\?[^"]*\bid=(?P<full_video_id>(?:Cinemassacre-)?(?P<video_id>.+?)))"', webpage) if not mobj: raise ExtractorError('Can\'t extract embed url and video id') playerdata_url = mobj.group('embed_url') video_id = mobj.group('video_id') + full_video_id = mobj.group('full_video_id') video_title = self._html_search_regex( r'<title>(?P<title>.+?)\|', webpage, 'title') @@ -59,41 +60,53 @@ class CinemassacreIE(InfoExtractor): vidurl = self._search_regex( r'\'vidurl\'\s*:\s*"([^\']+)"', playerdata, 'vidurl').replace('\\/', '/') - vidid = self._search_regex( - r'\'vidid\'\s*:\s*"([^\']+)"', playerdata, 'vidid') - videoserver = self._html_search_regex( - r"'videoserver'\s*:\s*'([^']+)'", playerdata, 'videoserver') - videolist_url = 'http://%s/vod/smil:%s.smil/jwplayer.smil' % (videoserver, vidid) - videolist = self._download_xml(videolist_url, video_id, 'Downloading videolist XML') + videolist_url = None - formats = [] - baseurl = vidurl[:vidurl.rfind('/')+1] - for video in videolist.findall('.//video'): - src = video.get('src') - if not src: - continue - file_ = src.partition(':')[-1] - width = int_or_none(video.get('width')) - height = int_or_none(video.get('height')) - bitrate = int_or_none(video.get('system-bitrate')) - format = { - 'url': baseurl + file_, - 'format_id': src.rpartition('.')[0].rpartition('_')[-1], - } - if width or height: - format.update({ - 'tbr': bitrate // 1000 if bitrate else None, - 'width': width, - 'height': height, - }) - else: - format.update({ - 'abr': bitrate // 1000 if bitrate else None, - 'vcodec': 'none', - }) - formats.append(format) - self._sort_formats(formats) + mobj = re.search(r"'videoserver'\s*:\s*'(?P<videoserver>[^']+)'", playerdata) + if mobj: + videoserver = mobj.group('videoserver') + mobj = re.search(r'\'vidid\'\s*:\s*"(?P<vidid>[^\']+)"', playerdata) + vidid = mobj.group('vidid') if mobj else full_video_id + videolist_url = 'http://%s/vod/smil:%s.smil/jwplayer.smil' % (videoserver, vidid) + else: + mobj = re.search(r"file\s*:\s*'(?P<smil>http.+?/jwplayer\.smil)'", playerdata) + if mobj: + videolist_url = mobj.group('smil') + + if videolist_url: + videolist = self._download_xml(videolist_url, video_id, 'Downloading videolist XML') + formats = [] + baseurl = vidurl[:vidurl.rfind('/')+1] + for video in videolist.findall('.//video'): + src = video.get('src') + if not src: + continue + file_ = src.partition(':')[-1] + width = int_or_none(video.get('width')) + height = int_or_none(video.get('height')) + bitrate = int_or_none(video.get('system-bitrate')) + format = { + 'url': baseurl + file_, + 'format_id': src.rpartition('.')[0].rpartition('_')[-1], + } + if width or height: + format.update({ + 'tbr': bitrate // 1000 if bitrate else None, + 'width': width, + 'height': height, + }) + else: + format.update({ + 'abr': bitrate // 1000 if bitrate else None, + 'vcodec': 'none', + }) + formats.append(format) + self._sort_formats(formats) + else: + formats = [{ + 'url': vidurl, + }] return { 'id': video_id, diff --git a/youtube_dl/extractor/cloudy.py b/youtube_dl/extractor/cloudy.py index 386f080d2..abf8cc280 100644 --- a/youtube_dl/extractor/cloudy.py +++ b/youtube_dl/extractor/cloudy.py @@ -4,14 +4,16 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import ( - ExtractorError, +from ..compat import ( compat_parse_qs, compat_urllib_parse, - remove_end, - HEADRequest, compat_HTTPError, ) +from ..utils import ( + ExtractorError, + HEADRequest, + remove_end, +) class CloudyIE(InfoExtractor): diff --git a/youtube_dl/extractor/cnn.py b/youtube_dl/extractor/cnn.py index 78877b1cf..3826ce7e1 100644 --- a/youtube_dl/extractor/cnn.py +++ b/youtube_dl/extractor/cnn.py @@ -16,9 +16,10 @@ class CNNIE(InfoExtractor): _TESTS = [{ 'url': 'http://edition.cnn.com/video/?/video/sports/2013/06/09/nadal-1-on-1.cnn', - 'file': 'sports_2013_06_09_nadal-1-on-1.cnn.mp4', 'md5': '3e6121ea48df7e2259fe73a0628605c4', 'info_dict': { + 'id': 'sports_2013_06_09_nadal-1-on-1.cnn', + 'ext': 'mp4', 'title': 'Nadal wins 8th French Open title', 'description': 'World Sport\'s Amanda Davies chats with 2013 French Open champion Rafael Nadal.', 'duration': 135, @@ -27,9 +28,10 @@ class CNNIE(InfoExtractor): }, { "url": "http://edition.cnn.com/video/?/video/us/2013/08/21/sot-student-gives-epic-speech.georgia-institute-of-technology&utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+rss%2Fcnn_topstories+%28RSS%3A+Top+Stories%29", - "file": "us_2013_08_21_sot-student-gives-epic-speech.georgia-institute-of-technology.mp4", "md5": "b5cc60c60a3477d185af8f19a2a26f4e", "info_dict": { + 'id': 'us/2013/08/21/sot-student-gives-epic-speech.georgia-institute-of-technology', + 'ext': 'mp4', "title": "Student's epic speech stuns new freshmen", "description": "A Georgia Tech student welcomes the incoming freshmen with an epic speech backed by music from \"2001: A Space Odyssey.\"", "upload_date": "20130821", diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 7e4113213..b77f0e519 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -12,13 +12,14 @@ import sys import time import xml.etree.ElementTree -from ..utils import ( +from ..compat import ( compat_http_client, compat_urllib_error, compat_urllib_parse_urlparse, compat_urlparse, compat_str, - +) +from ..utils import ( clean_html, compiled_regex_type, ExtractorError, @@ -403,7 +404,7 @@ class InfoExtractor(object): video_info['title'] = playlist_title return video_info - def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0): + def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None): """ Perform a regex search on the given string, using a single or a list of patterns returning the first matching group. @@ -424,8 +425,11 @@ class InfoExtractor(object): _name = name if mobj: - # return the first matching group - return next(g for g in mobj.groups() if g is not None) + if group is None: + # return the first matching group + return next(g for g in mobj.groups() if g is not None) + else: + return mobj.group(group) elif default is not _NO_DEFAULT: return default elif fatal: @@ -435,11 +439,11 @@ class InfoExtractor(object): 'please report this issue on http://yt-dl.org/bug' % _name) return None - def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0): + def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None): """ Like _search_regex, but strips HTML tags and unescapes entities. """ - res = self._search_regex(pattern, string, name, default, fatal, flags) + res = self._search_regex(pattern, string, name, default, fatal, flags, group) if res: return clean_html(res).strip() else: @@ -533,9 +537,9 @@ class InfoExtractor(object): display_name = name return self._html_search_regex( r'''(?ix)<meta - (?=[^>]+(?:itemprop|name|property)=["\']?%s["\']?) - [^>]+content=["\']([^"\']+)["\']''' % re.escape(name), - html, display_name, fatal=fatal, **kwargs) + (?=[^>]+(?:itemprop|name|property)=(["\']?)%s\1) + [^>]+content=(["\'])(?P<content>.*?)\1''' % re.escape(name), + html, display_name, fatal=fatal, group='content', **kwargs) def _dc_search_uploader(self, html): return self._html_search_meta('dc.creator', html, 'uploader') diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index cc612d08e..0bd0eccba 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -17,7 +17,6 @@ from ..utils import ( bytes_to_intlist, intlist_to_bytes, unified_strdate, - clean_html, urlencode_postdata, ) from ..aes import ( diff --git a/youtube_dl/extractor/dropbox.py b/youtube_dl/extractor/dropbox.py index 5f24ac721..aefca848a 100644 --- a/youtube_dl/extractor/dropbox.py +++ b/youtube_dl/extractor/dropbox.py @@ -5,7 +5,8 @@ import os.path import re from .common import InfoExtractor -from ..utils import compat_urllib_parse_unquote, url_basename +from ..compat import compat_urllib_parse_unquote +from ..utils import url_basename class DropboxIE(InfoExtractor): diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 3ad993751..104803563 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -5,12 +5,14 @@ import re import socket from .common import InfoExtractor -from ..utils import ( +from ..compat import ( compat_http_client, compat_str, compat_urllib_error, compat_urllib_parse, compat_urllib_request, +) +from ..utils import ( urlencode_postdata, ExtractorError, limit_length, diff --git a/youtube_dl/extractor/gamespot.py b/youtube_dl/extractor/gamespot.py index 3d67b9d60..d570e3f6a 100644 --- a/youtube_dl/extractor/gamespot.py +++ b/youtube_dl/extractor/gamespot.py @@ -8,12 +8,11 @@ from ..utils import ( compat_urllib_parse, compat_urlparse, unescapeHTML, - get_meta_content, ) class GameSpotIE(InfoExtractor): - _VALID_URL = r'(?:http://)?(?:www\.)?gamespot\.com/.*-(?P<page_id>\d+)/?' + _VALID_URL = r'(?:http://)?(?:www\.)?gamespot\.com/.*-(?P<id>\d+)/?' _TEST = { 'url': 'http://www.gamespot.com/videos/arma-3-community-guide-sitrep-i/2300-6410818/', 'md5': 'b2a30deaa8654fcccd43713a6b6a4825', @@ -26,10 +25,10 @@ class GameSpotIE(InfoExtractor): } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - page_id = mobj.group('page_id') + page_id = self._match_id(url) webpage = self._download_webpage(url, page_id) - data_video_json = self._search_regex(r'data-video=["\'](.*?)["\']', webpage, 'data video') + data_video_json = self._search_regex( + r'data-video=["\'](.*?)["\']', webpage, 'data video') data_video = json.loads(unescapeHTML(data_video_json)) # Transform the manifest url to a link to the mp4 files @@ -41,7 +40,8 @@ class GameSpotIE(InfoExtractor): http_path = f4m_path[1:].split('/', 1)[1] http_template = re.sub(QUALITIES_RE, r'%s', http_path) http_template = http_template.replace('.csmil/manifest.f4m', '') - http_template = compat_urlparse.urljoin('http://video.gamespotcdn.com/', http_template) + http_template = compat_urlparse.urljoin( + 'http://video.gamespotcdn.com/', http_template) formats = [] for q in qualities: formats.append({ @@ -52,8 +52,9 @@ class GameSpotIE(InfoExtractor): return { 'id': data_video['guid'], + 'display_id': page_id, 'title': compat_urllib_parse.unquote(data_video['title']), 'formats': formats, - 'description': get_meta_content('description', webpage), + 'description': self._html_search_meta('description', webpage), 'thumbnail': self._og_search_thumbnail(webpage), } diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 8abc340b4..01d6a57f8 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -7,11 +7,12 @@ import re from .common import InfoExtractor from .youtube import YoutubeIE -from ..utils import ( +from ..compat import ( compat_urllib_parse, compat_urlparse, compat_xml_parse_error, - +) +from ..utils import ( determine_ext, ExtractorError, float_or_none, @@ -99,6 +100,22 @@ class GenericIE(InfoExtractor): 'uploader': 'Championat', }, }, + { + # https://github.com/rg3/youtube-dl/issues/3541 + 'add_ie': ['Brightcove'], + 'url': 'http://www.kijk.nl/sbs6/leermijvrouwenkennen/videos/jqMiXKAYan2S/aflevering-1', + 'info_dict': { + 'id': '3866516442001', + 'ext': 'mp4', + 'title': 'Leer mij vrouwen kennen: Aflevering 1', + 'description': 'Leer mij vrouwen kennen: Aflevering 1', + 'uploader': 'SBS Broadcasting', + }, + 'skip': 'Restricted to Netherlands', + 'params': { + 'skip_download': True, # m3u8 download + }, + }, # Direct link to a video { 'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4', diff --git a/youtube_dl/extractor/globo.py b/youtube_dl/extractor/globo.py index 77c3ad4fc..66ca37918 100644 --- a/youtube_dl/extractor/globo.py +++ b/youtube_dl/extractor/globo.py @@ -5,13 +5,15 @@ import random import math from .common import InfoExtractor -from ..utils import ( - ExtractorError, - float_or_none, +from ..compat import ( compat_str, compat_chr, compat_ord, ) +from ..utils import ( + ExtractorError, + float_or_none, +) class GloboIE(InfoExtractor): diff --git a/youtube_dl/extractor/goshgay.py b/youtube_dl/extractor/goshgay.py index 7bca21ad0..18474cbb7 100644 --- a/youtube_dl/extractor/goshgay.py +++ b/youtube_dl/extractor/goshgay.py @@ -1,15 +1,11 @@ # -*- coding: utf-8 -*- from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..utils import ( compat_urlparse, - str_to_int, ExtractorError, ) -import json class GoshgayIE(InfoExtractor): @@ -27,36 +23,27 @@ class GoshgayIE(InfoExtractor): } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - title = self._search_regex(r'class="video-title"><h1>(.+?)<', webpage, 'title') + title = self._og_search_title(webpage) + thumbnail = self._og_search_thumbnail(webpage) + family_friendly = self._html_search_meta( + 'isFamilyFriendly', webpage, default='false') + config_url = self._search_regex( + r"'config'\s*:\s*'([^']+)'", webpage, 'config URL') - player_config = self._search_regex( - r'(?s)jwplayer\("player"\)\.setup\(({.+?})\)', webpage, 'config settings') - player_vars = json.loads(player_config.replace("'", '"')) - width = str_to_int(player_vars.get('width')) - height = str_to_int(player_vars.get('height')) - config_uri = player_vars.get('config') + config = self._download_xml( + config_url, video_id, 'Downloading player config XML') - if config_uri is None: - raise ExtractorError('Missing config URI') - node = self._download_xml(config_uri, video_id, 'Downloading player config XML', - errnote='Unable to download XML') - if node is None: + if config is None: raise ExtractorError('Missing config XML') - if node.tag != 'config': + if config.tag != 'config': raise ExtractorError('Missing config attribute') - fns = node.findall('file') - imgs = node.findall('image') - if len(fns) != 1: + fns = config.findall('file') + if len(fns) < 1: raise ExtractorError('Missing media URI') video_url = fns[0].text - if len(imgs) < 1: - thumbnail = None - else: - thumbnail = imgs[0].text url_comp = compat_urlparse.urlparse(url) ref = "%s://%s%s" % (url_comp[0], url_comp[1], url_comp[2]) @@ -65,9 +52,7 @@ class GoshgayIE(InfoExtractor): 'id': video_id, 'url': video_url, 'title': title, - 'width': width, - 'height': height, 'thumbnail': thumbnail, 'http_referer': ref, - 'age_limit': 18, + 'age_limit': 0 if family_friendly == 'true' else 18, } diff --git a/youtube_dl/extractor/heise.py b/youtube_dl/extractor/heise.py index d41c0413f..278d9f527 100644 --- a/youtube_dl/extractor/heise.py +++ b/youtube_dl/extractor/heise.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( - get_meta_content, + determine_ext, int_or_none, parse_iso8601, ) @@ -25,11 +25,11 @@ class HeiseIE(InfoExtractor): 'title': ( "Podcast: c't uplink 3.3 – Owncloud / Tastaturen / Peilsender Smartphone" ), - 'format_id': 'mp4_720', + 'format_id': 'mp4_720p', 'timestamp': 1411812600, 'upload_date': '20140927', 'description': 'In uplink-Episode 3.3 geht es darum, wie man sich von Cloud-Anbietern emanzipieren kann, worauf man beim Kauf einer Tastatur achten sollte und was Smartphones über uns verraten.', - 'thumbnail': 're:https?://.*\.jpg$', + 'thumbnail': 're:^https?://.*\.jpe?g$', } } @@ -49,11 +49,12 @@ class HeiseIE(InfoExtractor): info = { 'id': video_id, 'thumbnail': self._og_search_thumbnail(webpage), - 'timestamp': parse_iso8601(get_meta_content('date', webpage)), + 'timestamp': parse_iso8601( + self._html_search_meta('date', webpage)), 'description': self._og_search_description(webpage), } - title = get_meta_content('fulltitle', webpage) + title = self._html_search_meta('fulltitle', webpage) if title: info['title'] = title else: @@ -64,9 +65,12 @@ class HeiseIE(InfoExtractor): label = source_node.attrib['label'] height = int_or_none(self._search_regex( r'^(.*?_)?([0-9]+)p$', label, 'height', default=None)) + video_url = source_node.attrib['file'] + ext = determine_ext(video_url, '') formats.append({ - 'url': source_node.attrib['file'], + 'url': video_url, 'format_note': label, + 'format_id': '%s_%s' % (ext, label), 'height': height, }) self._sort_formats(formats) diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py index 4536db3bf..6108ed552 100644 --- a/youtube_dl/extractor/imdb.py +++ b/youtube_dl/extractor/imdb.py @@ -6,7 +6,6 @@ import json from .common import InfoExtractor from ..utils import ( compat_urlparse, - get_element_by_attribute, ) @@ -27,10 +26,11 @@ class ImdbIE(InfoExtractor): } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) webpage = self._download_webpage('http://www.imdb.com/video/imdb/vi%s' % video_id, video_id) - descr = get_element_by_attribute('itemprop', 'description', webpage) + descr = self._html_search_regex( + r'(?s)<span itemprop="description">(.*?)</span>', + webpage, 'description', fatal=False) available_formats = re.findall( r'case \'(?P<f_id>.*?)\' :$\s+url = \'(?P<path>.*?)\'', webpage, flags=re.MULTILINE) @@ -73,9 +73,7 @@ class ImdbListIE(InfoExtractor): } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - list_id = mobj.group('id') - + list_id = self._match_id(url) webpage = self._download_webpage(url, list_id) entries = [ self.url_result('http://www.imdb.com' + m, 'Imdb') diff --git a/youtube_dl/extractor/izlesene.py b/youtube_dl/extractor/izlesene.py index 07ef682ee..d16d483ee 100644 --- a/youtube_dl/extractor/izlesene.py +++ b/youtube_dl/extractor/izlesene.py @@ -5,11 +5,11 @@ import re from .common import InfoExtractor from ..utils import ( - get_element_by_id, - parse_iso8601, determine_ext, - int_or_none, float_or_none, + get_element_by_id, + int_or_none, + parse_iso8601, str_to_int, ) @@ -30,7 +30,7 @@ class IzleseneIE(InfoExtractor): 'description': 'md5:253753e2655dde93f59f74b572454f6d', 'thumbnail': 're:^http://.*\.jpg', 'uploader_id': 'pelikzzle', - 'timestamp': 1404298698, + 'timestamp': 1404302298, 'upload_date': '20140702', 'duration': 95.395, 'age_limit': 0, @@ -46,7 +46,7 @@ class IzleseneIE(InfoExtractor): 'description': 'Tarkan Dortmund 2006 Konseri', 'thumbnail': 're:^http://.*\.jpg', 'uploader_id': 'parlayankiz', - 'timestamp': 1163318593, + 'timestamp': 1163322193, 'upload_date': '20061112', 'duration': 253.666, 'age_limit': 0, @@ -55,10 +55,9 @@ class IzleseneIE(InfoExtractor): ] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - url = 'http://www.izlesene.com/video/%s' % video_id + video_id = self._match_id(url) + url = 'http://www.izlesene.com/video/%s' % video_id webpage = self._download_webpage(url, video_id) title = self._og_search_title(webpage) diff --git a/youtube_dl/extractor/laola1tv.py b/youtube_dl/extractor/laola1tv.py index 263f68773..102e29f7a 100644 --- a/youtube_dl/extractor/laola1tv.py +++ b/youtube_dl/extractor/laola1tv.py @@ -4,6 +4,7 @@ import random import re from .common import InfoExtractor +from ..utils import ExtractorError class Laola1TvIE(InfoExtractor): diff --git a/youtube_dl/extractor/myvideo.py b/youtube_dl/extractor/myvideo.py index ccb5959c4..a89153985 100644 --- a/youtube_dl/extractor/myvideo.py +++ b/youtube_dl/extractor/myvideo.py @@ -7,11 +7,12 @@ import re import json from .common import InfoExtractor -from ..utils import ( +from ..compat import ( compat_ord, compat_urllib_parse, compat_urllib_request, - +) +from ..utils import ( ExtractorError, ) diff --git a/youtube_dl/extractor/niconico.py b/youtube_dl/extractor/niconico.py index 62d5707fe..45cbd4ee9 100644 --- a/youtube_dl/extractor/niconico.py +++ b/youtube_dl/extractor/niconico.py @@ -12,6 +12,7 @@ from ..utils import ( unified_strdate, parse_duration, int_or_none, + ExtractorError, ) @@ -108,6 +109,9 @@ class NiconicoIE(InfoExtractor): flv_info_request, video_id, note='Downloading flv info', errnote='Unable to download flv info') + if 'deleted=' in flv_info_webpage: + raise ExtractorError('The video has been deleted.', + expected=True) video_real_url = compat_urlparse.parse_qs(flv_info_webpage)['url'][0] # Start extracting information diff --git a/youtube_dl/extractor/played.py b/youtube_dl/extractor/played.py index 645a1e06d..17880471d 100644 --- a/youtube_dl/extractor/played.py +++ b/youtube_dl/extractor/played.py @@ -6,6 +6,7 @@ import os.path from .common import InfoExtractor from ..utils import ( + ExtractorError, compat_urllib_parse, compat_urllib_request, ) @@ -29,6 +30,12 @@ class PlayedIE(InfoExtractor): video_id = self._match_id(url) orig_webpage = self._download_webpage(url, video_id) + + m_error = re.search( + r'(?s)Reason for deletion:.*?<b class="err"[^>]*>(?P<msg>[^<]+)</b>', orig_webpage) + if m_error: + raise ExtractorError(m_error.group('msg'), expected=True) + fields = re.findall( r'type="hidden" name="([^"]+)"\s+value="([^"]+)">', orig_webpage) data = dict(fields) diff --git a/youtube_dl/extractor/ro220.py b/youtube_dl/extractor/ro220.py index 0a3a71448..962b524e9 100644 --- a/youtube_dl/extractor/ro220.py +++ b/youtube_dl/extractor/ro220.py @@ -1,7 +1,7 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import compat_urllib_parse_unquote +from ..compat import compat_urllib_parse_unquote class Ro220IE(InfoExtractor): diff --git a/youtube_dl/extractor/trutube.py b/youtube_dl/extractor/trutube.py index a73f3c43a..e7b79243a 100644 --- a/youtube_dl/extractor/trutube.py +++ b/youtube_dl/extractor/trutube.py @@ -29,7 +29,7 @@ class TruTubeIE(InfoExtractor): # filehd is always 404 video_url = xpath_text(config, './file', 'video URL', fatal=True) - title = xpath_text(config, './title', 'title') + title = xpath_text(config, './title', 'title').strip() thumbnail = xpath_text(config, './image', ' thumbnail') return { diff --git a/youtube_dl/extractor/ustream.py b/youtube_dl/extractor/ustream.py index cee1ea8f6..875450908 100644 --- a/youtube_dl/extractor/ustream.py +++ b/youtube_dl/extractor/ustream.py @@ -5,7 +5,6 @@ import re from .common import InfoExtractor from ..utils import ( compat_urlparse, - get_meta_content, ) @@ -79,7 +78,7 @@ class UstreamChannelIE(InfoExtractor): m = re.match(self._VALID_URL, url) display_id = m.group('slug') webpage = self._download_webpage(url, display_id) - channel_id = get_meta_content('ustream:channel_id', webpage) + channel_id = self._html_search_meta('ustream:channel_id', webpage) BASE = 'http://www.ustream.tv' next_url = '/ajax/socialstream/videos/%s/1.json' % channel_id diff --git a/youtube_dl/extractor/vice.py b/youtube_dl/extractor/vice.py new file mode 100644 index 000000000..f11ca8217 --- /dev/null +++ b/youtube_dl/extractor/vice.py @@ -0,0 +1,38 @@ +from __future__ import unicode_literals +import re + +from .common import InfoExtractor +from .ooyala import OoyalaIE +from ..utils import ExtractorError + + +class ViceIE(InfoExtractor): + _VALID_URL = r'http://www\.vice\.com/.*?/(?P<name>.+)' + + _TEST = { + 'url': 'http://www.vice.com/Fringes/cowboy-capitalists-part-1', + 'info_dict': { + 'id': '43cW1mYzpia9IlestBjVpd23Yu3afAfp', + 'ext': 'mp4', + 'title': 'VICE_COWBOYCAPITALISTS_PART01_v1_VICE_WM_1080p.mov', + }, + 'params': { + # Requires ffmpeg (m3u8 manifest) + 'skip_download': True, + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + name = mobj.group('name') + webpage = self._download_webpage(url, name) + try: + embed_code = self._search_regex( + r'embedCode=([^&\'"]+)', webpage, + 'ooyala embed code') + ooyala_url = OoyalaIE._url_for_embed_code(embed_code) + print(ooyala_url) + except ExtractorError: + raise ExtractorError('The page doesn\'t contain a video', expected=True) + return self.url_result(ooyala_url, ie='Ooyala') + diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index d9cad0ea5..c744d4f04 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -7,11 +7,13 @@ import itertools from .common import InfoExtractor from .subtitles import SubtitlesInfoExtractor -from ..utils import ( +from ..compat import ( compat_HTTPError, compat_urllib_parse, compat_urllib_request, compat_urlparse, +) +from ..utils import ( ExtractorError, InAdvancePagedList, int_or_none, diff --git a/youtube_dl/extractor/wimp.py b/youtube_dl/extractor/wimp.py index 3377a543e..d6dec25ca 100644 --- a/youtube_dl/extractor/wimp.py +++ b/youtube_dl/extractor/wimp.py @@ -37,7 +37,7 @@ class WimpIE(InfoExtractor): video_id = mobj.group(1) webpage = self._download_webpage(url, video_id) video_url = self._search_regex( - r"'file'\s*:\s*'([^']+)'", webpage, 'video URL') + r"[\"']file[\"']\s*[:,]\s*[\"'](.+?)[\"']", webpage, 'video URL') if YoutubeIE.suitable(video_url): self.to_screen('Found YouTube video') return { diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index aad8ffbf4..c77d4056f 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -684,7 +684,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): # Get video info self.report_video_info_webpage_download(video_id) if re.search(r'player-age-gate-content">', video_webpage) is not None: - self.report_age_confirmation() age_gate = True # We simulate the access to the video from www.youtube.com/v/{video_id} # this can be viewed without login into Youtube @@ -692,12 +691,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): 'video_id': video_id, 'eurl': 'https://youtube.googleapis.com/v/' + video_id, 'sts': self._search_regex( - r'"sts"\s*:\s*(\d+)', video_webpage, 'sts'), + r'"sts"\s*:\s*(\d+)', video_webpage, 'sts', default=''), }) video_info_url = proto + '://www.youtube.com/get_video_info?' + data - video_info_webpage = self._download_webpage(video_info_url, video_id, - note=False, - errnote='unable to download video info webpage') + video_info_webpage = self._download_webpage( + video_info_url, video_id, + note='Refetching age-gated info webpage', + errnote='unable to download video info webpage') video_info = compat_parse_qs(video_info_webpage) else: age_gate = False diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 98e20d549..997e92ad7 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -5,9 +5,11 @@ import optparse import shlex import sys -from .utils import ( +from .compat import ( compat_expanduser, compat_getenv, +) +from .utils import ( get_term_width, write_string, ) diff --git a/youtube_dl/postprocessor/atomicparsley.py b/youtube_dl/postprocessor/atomicparsley.py index 765b2d9ee..448ccc5f3 100644 --- a/youtube_dl/postprocessor/atomicparsley.py +++ b/youtube_dl/postprocessor/atomicparsley.py @@ -6,10 +6,11 @@ import os import subprocess from .common import PostProcessor - +from ..compat import ( + compat_urlretrieve, +) from ..utils import ( check_executable, - compat_urlretrieve, encodeFilename, PostProcessingError, prepend_extension, diff --git a/youtube_dl/postprocessor/execafterdownload.py b/youtube_dl/postprocessor/execafterdownload.py index 08419a3d4..baf1b1945 100644 --- a/youtube_dl/postprocessor/execafterdownload.py +++ b/youtube_dl/postprocessor/execafterdownload.py @@ -3,10 +3,8 @@ from __future__ import unicode_literals import subprocess from .common import PostProcessor -from ..utils import ( - shlex_quote, - PostProcessingError, -) +from ..compat import shlex_quote +from ..utils import PostProcessingError class ExecAfterDownloadPP(PostProcessor): diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index 083c79592..f3f2743c0 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -1,5 +1,4 @@ import os -import re import subprocess import sys import time @@ -7,10 +6,13 @@ import time from .common import AudioConversionError, PostProcessor -from ..utils import ( +from ..compat import ( compat_subprocess_get_DEVNULL, +) +from ..utils import ( encodeArgument, encodeFilename, + get_exe_version, is_outdated_version, PostProcessingError, prepend_extension, @@ -19,23 +21,6 @@ from ..utils import ( ) -def get_version(executable): - """ Returns the version of the specified executable, - or False if the executable is not present """ - try: - out, err = subprocess.Popen( - [executable, '-version'], - stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate() - except OSError: - return False - firstline = out.partition(b'\n')[0].decode('ascii', 'ignore') - m = re.search(r'version\s+([0-9._-a-zA-Z]+)', firstline) - if not m: - return u'present' - else: - return m.group(1) - - class FFmpegPostProcessorError(PostProcessingError): pass @@ -61,7 +46,7 @@ class FFmpegPostProcessor(PostProcessor): @staticmethod def get_versions(): programs = ['avprobe', 'avconv', 'ffmpeg', 'ffprobe'] - return dict((program, get_version(program)) for program in programs) + return dict((p, get_exe_version(p, args=['-version'])) for p in programs) @property def _executable(self): diff --git a/youtube_dl/postprocessor/xattrpp.py b/youtube_dl/postprocessor/xattrpp.py index f6940940b..b5cae41c8 100644 --- a/youtube_dl/postprocessor/xattrpp.py +++ b/youtube_dl/postprocessor/xattrpp.py @@ -3,10 +3,12 @@ import subprocess import sys from .common import PostProcessor +from ..compat import ( + subprocess_check_output +) from ..utils import ( check_executable, hyphenate_date, - subprocess_check_output ) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 2864e5142..7c0fb1592 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1,6 +1,8 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- +from __future__ import unicode_literals + import calendar import codecs import contextlib @@ -8,7 +10,6 @@ import ctypes import datetime import email.utils import errno -import getpass import gzip import itertools import io @@ -29,254 +30,19 @@ import traceback import xml.etree.ElementTree import zlib -try: - import urllib.request as compat_urllib_request -except ImportError: # Python 2 - import urllib2 as compat_urllib_request - -try: - import urllib.error as compat_urllib_error -except ImportError: # Python 2 - import urllib2 as compat_urllib_error - -try: - import urllib.parse as compat_urllib_parse -except ImportError: # Python 2 - import urllib as compat_urllib_parse - -try: - from urllib.parse import urlparse as compat_urllib_parse_urlparse -except ImportError: # Python 2 - from urlparse import urlparse as compat_urllib_parse_urlparse - -try: - import urllib.parse as compat_urlparse -except ImportError: # Python 2 - import urlparse as compat_urlparse - -try: - import http.cookiejar as compat_cookiejar -except ImportError: # Python 2 - import cookielib as compat_cookiejar - -try: - import html.entities as compat_html_entities -except ImportError: # Python 2 - import htmlentitydefs as compat_html_entities - -try: - import html.parser as compat_html_parser -except ImportError: # Python 2 - import HTMLParser as compat_html_parser - -try: - import http.client as compat_http_client -except ImportError: # Python 2 - import httplib as compat_http_client - -try: - from urllib.error import HTTPError as compat_HTTPError -except ImportError: # Python 2 - from urllib2 import HTTPError as compat_HTTPError - -try: - from urllib.request import urlretrieve as compat_urlretrieve -except ImportError: # Python 2 - from urllib import urlretrieve as compat_urlretrieve - - -try: - from subprocess import DEVNULL - compat_subprocess_get_DEVNULL = lambda: DEVNULL -except ImportError: - compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w') - -try: - from urllib.parse import unquote as compat_urllib_parse_unquote -except ImportError: - def compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'): - if string == '': - return string - res = string.split('%') - if len(res) == 1: - return string - if encoding is None: - encoding = 'utf-8' - if errors is None: - errors = 'replace' - # pct_sequence: contiguous sequence of percent-encoded bytes, decoded - pct_sequence = b'' - string = res[0] - for item in res[1:]: - try: - if not item: - raise ValueError - pct_sequence += item[:2].decode('hex') - rest = item[2:] - if not rest: - # This segment was just a single percent-encoded character. - # May be part of a sequence of code units, so delay decoding. - # (Stored in pct_sequence). - continue - except ValueError: - rest = '%' + item - # Encountered non-percent-encoded characters. Flush the current - # pct_sequence. - string += pct_sequence.decode(encoding, errors) + rest - pct_sequence = b'' - if pct_sequence: - # Flush the final pct_sequence - string += pct_sequence.decode(encoding, errors) - return string - - -try: - from urllib.parse import parse_qs as compat_parse_qs -except ImportError: # Python 2 - # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib. - # Python 2's version is apparently totally broken - - def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False, - encoding='utf-8', errors='replace'): - qs, _coerce_result = qs, unicode - pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')] - r = [] - for name_value in pairs: - if not name_value and not strict_parsing: - continue - nv = name_value.split('=', 1) - if len(nv) != 2: - if strict_parsing: - raise ValueError("bad query field: %r" % (name_value,)) - # Handle case of a control-name with no equal sign - if keep_blank_values: - nv.append('') - else: - continue - if len(nv[1]) or keep_blank_values: - name = nv[0].replace('+', ' ') - name = compat_urllib_parse_unquote( - name, encoding=encoding, errors=errors) - name = _coerce_result(name) - value = nv[1].replace('+', ' ') - value = compat_urllib_parse_unquote( - value, encoding=encoding, errors=errors) - value = _coerce_result(value) - r.append((name, value)) - return r - - def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False, - encoding='utf-8', errors='replace'): - parsed_result = {} - pairs = _parse_qsl(qs, keep_blank_values, strict_parsing, - encoding=encoding, errors=errors) - for name, value in pairs: - if name in parsed_result: - parsed_result[name].append(value) - else: - parsed_result[name] = [value] - return parsed_result - -try: - compat_str = unicode # Python 2 -except NameError: - compat_str = str - -try: - compat_chr = unichr # Python 2 -except NameError: - compat_chr = chr - -try: - from xml.etree.ElementTree import ParseError as compat_xml_parse_error -except ImportError: # Python 2.6 - from xml.parsers.expat import ExpatError as compat_xml_parse_error - -try: - from shlex import quote as shlex_quote -except ImportError: # Python < 3.3 - def shlex_quote(s): - return "'" + s.replace("'", "'\"'\"'") + "'" - - -def compat_ord(c): - if type(c) is int: return c - else: return ord(c) - - -if sys.version_info >= (3, 0): - compat_getenv = os.getenv - compat_expanduser = os.path.expanduser -else: - # Environment variables should be decoded with filesystem encoding. - # Otherwise it will fail if any non-ASCII characters present (see #3854 #3217 #2918) - - def compat_getenv(key, default=None): - env = os.getenv(key, default) - if env: - env = env.decode(get_filesystem_encoding()) - return env - - # HACK: The default implementations of os.path.expanduser from cpython do not decode - # environment variables with filesystem encoding. We will work around this by - # providing adjusted implementations. - # The following are os.path.expanduser implementations from cpython 2.7.8 stdlib - # for different platforms with correct environment variables decoding. - - if os.name == 'posix': - def compat_expanduser(path): - """Expand ~ and ~user constructions. If user or $HOME is unknown, - do nothing.""" - if not path.startswith('~'): - return path - i = path.find('/', 1) - if i < 0: - i = len(path) - if i == 1: - if 'HOME' not in os.environ: - import pwd - userhome = pwd.getpwuid(os.getuid()).pw_dir - else: - userhome = compat_getenv('HOME') - else: - import pwd - try: - pwent = pwd.getpwnam(path[1:i]) - except KeyError: - return path - userhome = pwent.pw_dir - userhome = userhome.rstrip('/') - return (userhome + path[i:]) or '/' - elif os.name == 'nt' or os.name == 'ce': - def compat_expanduser(path): - """Expand ~ and ~user constructs. - - If user or $HOME is unknown, do nothing.""" - if path[:1] != '~': - return path - i, n = 1, len(path) - while i < n and path[i] not in '/\\': - i = i + 1 - - if 'HOME' in os.environ: - userhome = compat_getenv('HOME') - elif 'USERPROFILE' in os.environ: - userhome = compat_getenv('USERPROFILE') - elif not 'HOMEPATH' in os.environ: - return path - else: - try: - drive = compat_getenv('HOMEDRIVE') - except KeyError: - drive = '' - userhome = os.path.join(drive, compat_getenv('HOMEPATH')) - - if i != 1: #~user - userhome = os.path.join(os.path.dirname(userhome), path[1:i]) - - return userhome + path[i:] - else: - compat_expanduser = os.path.expanduser +from .compat import ( + compat_chr, + compat_getenv, + compat_html_entities, + compat_html_parser, + compat_parse_qs, + compat_str, + compat_urllib_error, + compat_urllib_parse, + compat_urllib_parse_urlparse, + compat_urllib_request, + compat_urlparse, +) # This is not clearly defined otherwise @@ -304,14 +70,6 @@ def preferredencoding(): return pref -if sys.version_info < (3,0): - def compat_print(s): - print(s.encode(preferredencoding(), 'xmlcharrefreplace')) -else: - def compat_print(s): - assert type(s) == type(u'') - print(s) - def write_json_file(obj, fn): """ Encode obj as JSON and write it to fn, atomically """ @@ -394,127 +152,32 @@ def xpath_text(node, xpath, name=None, fatal=False): return n.text -compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix -class BaseHTMLParser(compat_html_parser.HTMLParser): - def __init(self): - compat_html_parser.HTMLParser.__init__(self) - self.html = None - - def loads(self, html): - self.html = html - self.feed(html) - self.close() - -class AttrParser(BaseHTMLParser): - """Modified HTMLParser that isolates a tag with the specified attribute""" - def __init__(self, attribute, value): - self.attribute = attribute - self.value = value - self.result = None - self.started = False - self.depth = {} - self.watch_startpos = False - self.error_count = 0 - BaseHTMLParser.__init__(self) - - def error(self, message): - if self.error_count > 10 or self.started: - raise compat_html_parser.HTMLParseError(message, self.getpos()) - self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line - self.error_count += 1 - self.goahead(1) - - def handle_starttag(self, tag, attrs): - attrs = dict(attrs) - if self.started: - self.find_startpos(None) - if self.attribute in attrs and attrs[self.attribute] == self.value: - self.result = [tag] - self.started = True - self.watch_startpos = True - if self.started: - if not tag in self.depth: self.depth[tag] = 0 - self.depth[tag] += 1 - - def handle_endtag(self, tag): - if self.started: - if tag in self.depth: self.depth[tag] -= 1 - if self.depth[self.result[0]] == 0: - self.started = False - self.result.append(self.getpos()) - - def find_startpos(self, x): - """Needed to put the start position of the result (self.result[1]) - after the opening tag with the requested id""" - if self.watch_startpos: - self.watch_startpos = False - self.result.append(self.getpos()) - handle_entityref = handle_charref = handle_data = handle_comment = \ - handle_decl = handle_pi = unknown_decl = find_startpos - - def get_result(self): - if self.result is None: - return None - if len(self.result) != 3: - return None - lines = self.html.split('\n') - lines = lines[self.result[1][0]-1:self.result[2][0]] - lines[0] = lines[0][self.result[1][1]:] - if len(lines) == 1: - lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]] - lines[-1] = lines[-1][:self.result[2][1]] - return '\n'.join(lines).strip() -# Hack for https://github.com/rg3/youtube-dl/issues/662 -if sys.version_info < (2, 7, 3): - AttrParser.parse_endtag = (lambda self, i: - i + len("</scr'+'ipt>") - if self.rawdata[i:].startswith("</scr'+'ipt>") - else compat_html_parser.HTMLParser.parse_endtag(self, i)) - def get_element_by_id(id, html): """Return the content of the tag with the specified ID in the passed HTML document""" return get_element_by_attribute("id", id, html) + def get_element_by_attribute(attribute, value, html): """Return the content of the tag with the specified attribute in the passed HTML document""" - parser = AttrParser(attribute, value) - try: - parser.loads(html) - except compat_html_parser.HTMLParseError: - pass - return parser.get_result() -class MetaParser(BaseHTMLParser): - """ - Modified HTMLParser that isolates a meta tag with the specified name - attribute. - """ - def __init__(self, name): - BaseHTMLParser.__init__(self) - self.name = name - self.content = None - self.result = None - - def handle_starttag(self, tag, attrs): - if tag != 'meta': - return - attrs = dict(attrs) - if attrs.get('name') == self.name: - self.result = attrs.get('content') + m = re.search(r'''(?xs) + <([a-zA-Z0-9:._-]+) + (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*? + \s+%s=['"]?%s['"]? + (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*? + \s*> + (?P<content>.*?) + </\1> + ''' % (re.escape(attribute), re.escape(value)), html) - def get_result(self): - return self.result + if not m: + return None + res = m.group('content') -def get_meta_content(name, html): - """ - Return the content attribute from the meta tag with the given name attribute. - """ - parser = MetaParser(name) - try: - parser.loads(html) - except compat_html_parser.HTMLParseError: - pass - return parser.get_result() + if res.startswith('"') or res.startswith("'"): + res = res[1:-1] + + return unescapeHTML(res) def clean_html(html): @@ -1472,6 +1135,25 @@ def check_executable(exe, args=[]): return exe +def get_exe_version(exe, args=['--version'], + version_re=r'version\s+([0-9._-a-zA-Z]+)', + unrecognized=u'present'): + """ Returns the version of the specified executable, + or False if the executable is not present """ + try: + out, err = subprocess.Popen( + [exe] + args, + stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate() + except OSError: + return False + firstline = out.partition(b'\n')[0].decode('ascii', 'ignore') + m = re.search(version_re, firstline) + if m: + return m.group(1) + else: + return unrecognized + + class PagedList(object): def __len__(self): # This is only useful for tests @@ -1562,7 +1244,7 @@ def escape_rfc3986(s): """Escape non-ASCII characters as suggested by RFC 3986""" if sys.version_info < (3, 0) and isinstance(s, unicode): s = s.encode('utf-8') - return compat_urllib_parse.quote(s, "%/;:@&=+$,!~*'()?#[]") + return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]") def escape_url(url): @@ -1636,15 +1318,6 @@ def parse_xml(s): return tree -if sys.version_info < (3, 0) and sys.platform == 'win32': - def compat_getpass(prompt, *args, **kwargs): - if isinstance(prompt, compat_str): - prompt = prompt.encode(preferredencoding()) - return getpass.getpass(prompt, *args, **kwargs) -else: - compat_getpass = getpass.getpass - - US_RATINGS = { 'G': 0, 'PG': 10, @@ -1702,18 +1375,6 @@ def qualities(quality_ids): DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s' -try: - subprocess_check_output = subprocess.check_output -except AttributeError: - def subprocess_check_output(*args, **kwargs): - assert 'input' not in kwargs - p = subprocess.Popen(*args, stdout=subprocess.PIPE, **kwargs) - output, _ = p.communicate() - ret = p.poll() - if ret: - raise subprocess.CalledProcessError(ret, p.args, output=output) - return output - def limit_length(s, length): """ Add ellipses to overly long strings """ diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 87d56c164..07cdb22ad 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2014.10.30' +__version__ = '2014.11.04' |