diff options
-rw-r--r-- | test/test_all_urls.py | 8 | ||||
-rw-r--r-- | youtube_dl/FileDownloader.py | 20 | ||||
-rw-r--r-- | youtube_dl/YoutubeDL.py | 22 | ||||
-rw-r--r-- | youtube_dl/extractor/__init__.py | 2 | ||||
-rw-r--r-- | youtube_dl/extractor/comedycentral.py | 33 | ||||
-rw-r--r-- | youtube_dl/extractor/common.py | 1 | ||||
-rw-r--r-- | youtube_dl/extractor/viki.py | 22 | ||||
-rw-r--r-- | youtube_dl/extractor/zdf.py | 124 | ||||
-rw-r--r-- | youtube_dl/utils.py | 21 |
9 files changed, 171 insertions, 82 deletions
diff --git a/test/test_all_urls.py b/test/test_all_urls.py index 42813da1a..1f1adb6b4 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -101,10 +101,10 @@ class TestAllURLsMatching(unittest.TestCase): self.assertMatch(':ytsubs', ['youtube:subscriptions']) self.assertMatch(':ytsubscriptions', ['youtube:subscriptions']) self.assertMatch(':ythistory', ['youtube:history']) - self.assertMatch(':thedailyshow', ['ComedyCentral']) - self.assertMatch(':tds', ['ComedyCentral']) - self.assertMatch(':colbertreport', ['ComedyCentral']) - self.assertMatch(':cr', ['ComedyCentral']) + self.assertMatch(':thedailyshow', ['ComedyCentralShows']) + self.assertMatch(':tds', ['ComedyCentralShows']) + self.assertMatch(':colbertreport', ['ComedyCentralShows']) + self.assertMatch(':cr', ['ComedyCentralShows']) if __name__ == '__main__': diff --git a/youtube_dl/FileDownloader.py b/youtube_dl/FileDownloader.py index e5a542ed5..2b4fb0b31 100644 --- a/youtube_dl/FileDownloader.py +++ b/youtube_dl/FileDownloader.py @@ -1,4 +1,3 @@ -import math import os import re import subprocess @@ -11,6 +10,7 @@ from .utils import ( ContentTooShortError, determine_ext, encodeFilename, + format_bytes, sanitize_open, timeconvert, ) @@ -54,20 +54,6 @@ class FileDownloader(object): self.params = params @staticmethod - def format_bytes(bytes): - if bytes is None: - return 'N/A' - if type(bytes) is str: - bytes = float(bytes) - if bytes == 0.0: - exponent = 0 - else: - exponent = int(math.log(bytes, 1024.0)) - suffix = ['B','KiB','MiB','GiB','TiB','PiB','EiB','ZiB','YiB'][exponent] - converted = float(bytes) / float(1024 ** exponent) - return '%.2f%s' % (converted, suffix) - - @staticmethod def format_seconds(seconds): (mins, secs) = divmod(seconds, 60) (hours, mins) = divmod(mins, 60) @@ -117,7 +103,7 @@ class FileDownloader(object): def format_speed(speed): if speed is None: return '%10s' % '---b/s' - return '%10s' % ('%s/s' % FileDownloader.format_bytes(speed)) + return '%10s' % ('%s/s' % 
format_bytes(speed)) @staticmethod def best_block_size(elapsed_time, bytes): @@ -525,7 +511,7 @@ class FileDownloader(object): self.to_screen(u'\r[download] File is larger than max-filesize (%s bytes > %s bytes). Aborting.' % (data_len, max_data_len)) return False - data_len_str = self.format_bytes(data_len) + data_len_str = format_bytes(data_len) byte_counter = 0 + resume_len block_size = self.params.get('buffersize', 1024) start = time.time() diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 87eb1a0b3..30ba94666 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -33,6 +33,7 @@ from .utils import ( DownloadError, encodeFilename, ExtractorError, + format_bytes, locked_file, make_HTTPS_handler, MaxDownloadsReached, @@ -882,9 +883,9 @@ class YoutubeDL(object): def list_formats(self, info_dict): def format_note(fdict): - if fdict.get('format_note') is not None: - return fdict['format_note'] res = u'' + if fdict.get('format_note') is not None: + res += fdict['format_note'] + u' ' if fdict.get('vcodec') is not None: res += u'%-5s' % fdict['vcodec'] elif fdict.get('vbr') is not None: @@ -901,26 +902,31 @@ class YoutubeDL(object): res += 'audio' if fdict.get('abr') is not None: res += u'@%3dk' % fdict['abr'] + if fdict.get('filesize') is not None: + if res: + res += u', ' + res += format_bytes(fdict['filesize']) return res - def line(format): - return (u'%-20s%-10s%-12s%s' % ( + def line(format, idlen=20): + return ((u'%-' + compat_str(idlen + 1) + u's%-10s%-12s%s') % ( format['format_id'], format['ext'], self.format_resolution(format), format_note(format), - ) - ) + )) formats = info_dict.get('formats', [info_dict]) - formats_s = list(map(line, formats)) + idlen = max(len(u'format code'), + max(len(f['format_id']) for f in formats)) + formats_s = [line(f, idlen) for f in formats] if len(formats) > 1: formats_s[0] += (' ' if format_note(formats[0]) else '') + '(worst)' formats_s[-1] += (' ' if format_note(formats[-1]) else '') + 
'(best)' header_line = line({ 'format_id': u'format code', 'ext': u'extension', - '_resolution': u'resolution', 'format_note': u'note'}) + '_resolution': u'resolution', 'format_note': u'note'}, idlen=idlen) self.to_screen(u'[info] Available formats for %s:\n%s\n%s' % (info_dict['id'], header_line, u"\n".join(formats_s))) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 1fbd10bc5..0b4d086b7 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -23,7 +23,7 @@ from .cinemassacre import CinemassacreIE from .clipfish import ClipfishIE from .cnn import CNNIE from .collegehumor import CollegeHumorIE -from .comedycentral import ComedyCentralIE +from .comedycentral import ComedyCentralIE, ComedyCentralShowsIE from .condenast import CondeNastIE from .criterion import CriterionIE from .cspan import CSpanIE diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py index 69b2beece..725849d2e 100644 --- a/youtube_dl/extractor/comedycentral.py +++ b/youtube_dl/extractor/comedycentral.py @@ -2,6 +2,7 @@ import re import xml.etree.ElementTree from .common import InfoExtractor +from .mtv import MTVIE, _media_xml_tag from ..utils import ( compat_str, compat_urllib_parse, @@ -11,7 +12,37 @@ from ..utils import ( ) -class ComedyCentralIE(InfoExtractor): +class ComedyCentralIE(MTVIE): + _VALID_URL = r'http://www.comedycentral.com/(video-clips|episodes|cc-studios)/(?P<title>.*)' + _FEED_URL = u'http://comedycentral.com/feeds/mrss/' + + _TEST = { + u'url': u'http://www.comedycentral.com/video-clips/kllhuv/stand-up-greg-fitzsimmons--uncensored---too-good-of-a-mother', + u'md5': u'4167875aae411f903b751a21f357f1ee', + u'info_dict': { + u'id': u'cef0cbb3-e776-4bc9-b62e-8016deccb354', + u'ext': u'mp4', + u'title': u'Uncensored - Greg Fitzsimmons - Too Good of a Mother', + u'description': u'After a certain point, breastfeeding becomes c**kblocking.', + }, + } + # Overwrite MTVIE properties 
we don't want + _TESTS = [] + + def _get_thumbnail_url(self, uri, itemdoc): + search_path = '%s/%s' % (_media_xml_tag('group'), _media_xml_tag('thumbnail')) + return itemdoc.find(search_path).attrib['url'] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + title = mobj.group('title') + webpage = self._download_webpage(url, title) + mgid = self._search_regex(r'data-mgid="(?P<mgid>mgid:.*?)"', + webpage, u'mgid') + return self._get_videos_info(mgid) + + +class ComedyCentralShowsIE(InfoExtractor): IE_DESC = u'The Daily Show / Colbert Report' # urls can be abbreviations like :thedailyshow or :colbert # urls for episodes like: diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 6ec835f8a..5656445a3 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -76,6 +76,7 @@ class InfoExtractor(object): * acodec Name of the audio codec in use * vbr Average video bitrate in KBit/s * vcodec Name of the video codec in use + * filesize The number of bytes, if known in advance webpage_url: The url to the video webpage, if given to youtube-dl it should allow to get the same result again. 
(It will be set by YoutubeDL if it's missing) diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py index 78d03c079..cd986a749 100644 --- a/youtube_dl/extractor/viki.py +++ b/youtube_dl/extractor/viki.py @@ -1,6 +1,7 @@ import re from ..utils import ( + ExtractorError, unified_strdate, ) from .subtitles import SubtitlesInfoExtractor @@ -20,7 +21,8 @@ class VikiIE(SubtitlesInfoExtractor): u'description': u'md5:c4b17b9626dd4b143dcc4d855ba3474e', u'upload_date': u'20131121', u'age_limit': 13, - } + }, + u'skip': u'Blocked in the US', } def _real_extract(self, url): @@ -32,11 +34,12 @@ class VikiIE(SubtitlesInfoExtractor): description = self._og_search_description(webpage) thumbnail = self._og_search_thumbnail(webpage) - uploader = self._html_search_regex( - r'<strong>Broadcast Network: </strong>\s*([^<]*)<', webpage, - u'uploader') - if uploader is not None: - uploader = uploader.strip() + uploader_m = re.search( + r'<strong>Broadcast Network: </strong>\s*([^<]*)<', webpage) + if uploader_m is None: + uploader = None + else: + uploader = uploader_m.group(1).strip() rating_str = self._html_search_regex( r'<strong>Rating: </strong>\s*([^<]*)<', webpage, @@ -51,7 +54,12 @@ class VikiIE(SubtitlesInfoExtractor): age_limit = RATINGS.get(rating_str) info_url = 'http://www.viki.com/player5_fragment/%s?action=show&controller=videos' % video_id - info_webpage = self._download_webpage(info_url, video_id) + info_webpage = self._download_webpage( + info_url, video_id, note=u'Downloading info page') + if re.match(r'\s*<div\s+class="video-error', info_webpage): + raise ExtractorError( + u'Video %s is blocked from your location.' 
% video_id, + expected=True) video_url = self._html_search_regex( r'<source[^>]+src="([^"]+)"', info_webpage, u'video URL') diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py index c6a9d06f2..07f830e80 100644 --- a/youtube_dl/extractor/zdf.py +++ b/youtube_dl/extractor/zdf.py @@ -1,75 +1,111 @@ +import operator import re from .common import InfoExtractor from ..utils import ( - determine_ext, - ExtractorError, + parse_xml_doc, + unified_strdate, ) class ZDFIE(InfoExtractor): _VALID_URL = r'^http://www\.zdf\.de\/ZDFmediathek(?P<hash>#)?\/(.*beitrag\/video\/)(?P<video_id>[^/\?]+)(?:\?.*)?' - _MEDIA_STREAM = r'<a href="(?P<video_url>.+(?P<media_type>.streaming).+/zdf/(?P<quality>[^\/]+)/[^"]*)".+class="play".+>' def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) video_id = mobj.group('video_id') - if mobj.group('hash'): - url = url.replace(u'#', u'', 1) + xml_url = u'http://www.zdf.de/ZDFmediathek/xmlservice/web/beitragsDetails?ak=web&id=%s' % video_id + info_xml = self._download_webpage( + xml_url, video_id, note=u'Downloading video info') + doc = parse_xml_doc(info_xml) - html = self._download_webpage(url, video_id) - streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)] - if streams is None: - raise ExtractorError(u'No media url found.') + title = doc.find('.//information/title').text + description = doc.find('.//information/detail').text + uploader_node = doc.find('.//details/originChannelTitle') + uploader = None if uploader_node is None else uploader_node.text + duration_str = doc.find('.//details/length').text + duration_m = re.match(r'''(?x)^ + (?P<hours>[0-9]{2}) + :(?P<minutes>[0-9]{2}) + :(?P<seconds>[0-9]{2}) + (?:\.(?P<ms>[0-9]+)?) 
+ ''', duration_str) + duration = ( + ( + (int(duration_m.group('hours')) * 60 * 60) + + (int(duration_m.group('minutes')) * 60) + + int(duration_m.group('seconds')) + ) + if duration_m + else None + ) + upload_date = unified_strdate(doc.find('.//details/airtime').text) - # s['media_type'] == 'wstreaming' -> use 'Windows Media Player' and mms url - # s['media_type'] == 'hstreaming' -> use 'Quicktime' and rtsp url - # choose first/default media type and highest quality for now - def stream_pref(s): - TYPE_ORDER = ['ostreaming', 'hstreaming', 'wstreaming'] + def xml_to_format(fnode): + video_url = fnode.find('url').text + is_available = u'http://www.metafilegenerator' not in video_url + + format_id = fnode.attrib['basetype'] + format_m = re.match(r'''(?x) + (?P<vcodec>[^_]+)_(?P<acodec>[^_]+)_(?P<container>[^_]+)_ + (?P<proto>[^_]+)_(?P<index>[^_]+)_(?P<indexproto>[^_]+) + ''', format_id) + + ext = format_m.group('container') + is_supported = ext != 'f4f' + + PROTO_ORDER = ['http', 'rtmp', 'rtsp'] try: - type_pref = TYPE_ORDER.index(s['media_type']) + proto_pref = -PROTO_ORDER.index(format_m.group('proto')) except ValueError: - type_pref = 999 + proto_pref = 999 - QUALITY_ORDER = ['veryhigh', '300'] + quality = fnode.find('./quality').text + QUALITY_ORDER = ['veryhigh', '300', 'high', 'med', 'low'] try: - quality_pref = QUALITY_ORDER.index(s['quality']) + quality_pref = -QUALITY_ORDER.index(quality) except ValueError: quality_pref = 999 - return (type_pref, quality_pref) - - sorted_streams = sorted(streams, key=stream_pref) - if not sorted_streams: - raise ExtractorError(u'No stream found.') - stream = sorted_streams[0] - - media_link = self._download_webpage( - stream['video_url'], - video_id, - u'Get stream URL') + abr = int(fnode.find('./audioBitrate').text) // 1000 + vbr = int(fnode.find('./videoBitrate').text) // 1000 + pref = (is_available, is_supported, + proto_pref, quality_pref, vbr, abr) - #MMS_STREAM = r'href="(?P<video_url>mms://[^"]*)"' - RTSP_STREAM = 
r'(?P<video_url>rtsp://[^"]*.mp4)' + format_note = u'' + if not is_supported: + format_note += u'(unsupported)' + if not format_note: + format_note = None - mobj = re.search(self._MEDIA_STREAM, media_link) - if mobj is None: - mobj = re.search(RTSP_STREAM, media_link) - if mobj is None: - raise ExtractorError(u'Cannot extract mms:// or rtsp:// URL') - video_url = mobj.group('video_url') + return { + 'format_id': format_id + u'-' + quality, + 'url': video_url, + 'ext': ext, + 'acodec': format_m.group('acodec'), + 'vcodec': format_m.group('vcodec'), + 'abr': abr, + 'vbr': vbr, + 'width': int(fnode.find('./width').text), + 'height': int(fnode.find('./height').text), + 'filesize': int(fnode.find('./filesize').text), + 'format_note': format_note, + '_pref': pref, + '_available': is_available, + } - title = self._html_search_regex( - r'<h1(?: class="beitragHeadline")?>(.*?)</h1>', - html, u'title') + format_nodes = doc.findall('.//formitaeten/formitaet') + formats = sorted(filter(lambda f: f['_available'], + map(xml_to_format, format_nodes)), + key=operator.itemgetter('_pref')) return { 'id': video_id, - 'url': video_url, 'title': title, - 'ext': determine_ext(video_url) + 'formats': formats, + 'description': description, + 'uploader': uploader, + 'duration': duration, + 'upload_date': upload_date, } diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 317aee2b5..caec00e37 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -8,6 +8,7 @@ import gzip import io import json import locale +import math import os import pipes import platform @@ -16,6 +17,7 @@ import ssl import socket import sys import traceback +import xml.etree.ElementTree import zlib try: @@ -1005,3 +1007,22 @@ def unsmuggle_url(smug_url): jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0] data = json.loads(jsond) return url, data + + +def parse_xml_doc(s): + assert isinstance(s, type(u'')) + return xml.etree.ElementTree.fromstring(s.encode('utf-8')) + + +def format_bytes(bytes): + if 
bytes is None: + return u'N/A' + if type(bytes) is str: + bytes = float(bytes) + if bytes == 0.0: + exponent = 0 + else: + exponent = int(math.log(bytes, 1024.0)) + suffix = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB'][exponent] + converted = float(bytes) / float(1024 ** exponent) + return u'%.2f%s' % (converted, suffix) |