diff options
Diffstat (limited to 'youtube_dl')
-rwxr-xr-x | youtube_dl/YoutubeDL.py | 19 | ||||
-rw-r--r-- | youtube_dl/__init__.py | 9 | ||||
-rw-r--r-- | youtube_dl/extractor/__init__.py | 6 | ||||
-rw-r--r-- | youtube_dl/extractor/bild.py | 39 | ||||
-rw-r--r-- | youtube_dl/extractor/cinemassacre.py | 17 | ||||
-rw-r--r-- | youtube_dl/extractor/common.py | 7 | ||||
-rw-r--r-- | youtube_dl/extractor/crunchyroll.py | 11 | ||||
-rw-r--r-- | youtube_dl/extractor/francetv.py | 2 | ||||
-rw-r--r-- | youtube_dl/extractor/funnyordie.py | 2 | ||||
-rw-r--r-- | youtube_dl/extractor/generic.py | 17 | ||||
-rw-r--r-- | youtube_dl/extractor/glide.py | 40 | ||||
-rw-r--r-- | youtube_dl/extractor/hark.py | 48 | ||||
-rw-r--r-- | youtube_dl/extractor/mitele.py | 13 | ||||
-rw-r--r-- | youtube_dl/extractor/motherless.py | 56 | ||||
-rw-r--r-- | youtube_dl/extractor/pbs.py | 20 | ||||
-rw-r--r-- | youtube_dl/extractor/telecinco.py | 19 | ||||
-rw-r--r-- | youtube_dl/extractor/viddler.py | 108 | ||||
-rw-r--r-- | youtube_dl/extractor/vidzi.py | 33 | ||||
-rw-r--r-- | youtube_dl/options.py | 9 | ||||
-rw-r--r-- | youtube_dl/version.py | 2 |
20 files changed, 366 insertions, 111 deletions
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index dec0e20e7..75461f19d 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -107,6 +107,8 @@ class YoutubeDL(object): forcefilename: Force printing final filename. forceduration: Force printing duration. forcejson: Force printing info_dict as JSON. + dump_single_json: Force printing the info_dict of the whole playlist + (or video) as a single JSON line. simulate: Do not download the video files. format: Video format code. format_limit: Highest quality format to try. @@ -165,6 +167,8 @@ class YoutubeDL(object): 'auto' for elaborate guessing encoding: Use this encoding instead of the system-specified. extract_flat: Do not resolve URLs, return the immediate result. + Pass in 'in_playlist' to only show this behavior for + playlist items. The following parameters are not used by YoutubeDL itself, they are used by the FileDownloader: @@ -568,8 +572,12 @@ class YoutubeDL(object): result_type = ie_result.get('_type', 'video') - if self.params.get('extract_flat', False): - if result_type in ('url', 'url_transparent'): + if result_type in ('url', 'url_transparent'): + extract_flat = self.params.get('extract_flat', False) + if ((extract_flat == 'in_playlist' and 'playlist' in extra_info) or + extract_flat is True): + if self.params.get('forcejson', False): + self.to_stdout(json.dumps(ie_result)) return ie_result if result_type == 'video': @@ -897,6 +905,8 @@ class YoutubeDL(object): if self.params.get('forcejson', False): info_dict['_filename'] = filename self.to_stdout(json.dumps(info_dict)) + if self.params.get('dump_single_json', False): + info_dict['_filename'] = filename # Do nothing else if in simulate mode if self.params.get('simulate', False): @@ -1064,12 +1074,15 @@ class YoutubeDL(object): for url in url_list: try: #It also downloads the videos - self.extract_info(url) + res = self.extract_info(url) except UnavailableVideoError: self.report_error('unable to download video') except MaxDownloadsReached: self.to_screen('[info] Maximum number of downloaded files reached.') raise + else: + if self.params.get('dump_single_json', False): + self.to_stdout(json.dumps(res)) return self._download_retcode diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 78cdf14df..4f5ce604f 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -79,6 +79,9 @@ __authors__ = ( 'Carlos Ramos', '5moufl', 'lenaten', + 'Dennis Scheiba', + 'Damon Timm', + 'winwon', 'Xavier Beynon' ) @@ -256,8 +259,6 @@ def _real_main(argv=None): date = DateRange.day(opts.date) else: date = DateRange(opts.dateafter, opts.datebefore) - if opts.default_search not in ('auto', 'auto_warning', 'error', 'fixup_error', None) and ':' not in opts.default_search: - parser.error(u'--default-search invalid; did you forget a colon (:) at the end?') # Do not download videos when there are audio-only formats if opts.extractaudio and not opts.keepvideo and opts.format is None: @@ -285,7 +286,7 @@ def _real_main(argv=None): u' file! Use "{0}.%(ext)s" instead of "{0}" as the output' u' template'.format(outtmpl)) - any_printing = opts.geturl or opts.gettitle or opts.getid or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat or opts.getduration or opts.dumpjson + any_printing = opts.geturl or opts.gettitle or opts.getid or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat or opts.getduration or opts.dumpjson or opts.dump_single_json download_archive_fn = os.path.expanduser(opts.download_archive) if opts.download_archive is not None else opts.download_archive ydl_opts = { @@ -305,6 +306,7 @@ def _real_main(argv=None): 'forcefilename': opts.getfilename, 'forceformat': opts.getformat, 'forcejson': opts.dumpjson, + 'dump_single_json': opts.dump_single_json, 'simulate': opts.simulate, 'skip_download': (opts.skip_download or opts.simulate or any_printing), 'format': opts.format, @@ -370,6 +372,7 @@ def _real_main(argv=None): 'youtube_include_dash_manifest': opts.youtube_include_dash_manifest, 'encoding': opts.encoding, 'exec_cmd': opts.exec_cmd, + 'extract_flat': opts.extract_flat, } with YoutubeDL(ydl_opts) as ydl: diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 29f32cdef..691fef5ca 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -27,6 +27,7 @@ from .bandcamp import BandcampIE, BandcampAlbumIE from .bbccouk import BBCCoUkIE from .beeg import BeegIE from .behindkink import BehindKinkIE +from .bild import BildIE from .bilibili import BiliBiliIE from .blinkx import BlinkxIE from .bliptv import BlipTVIE, BlipTVUserIE @@ -135,6 +136,7 @@ from .gamestar import GameStarIE from .gametrailers import GametrailersIE from .gdcvault import GDCVaultIE from .generic import GenericIE +from .glide import GlideIE from .globo import GloboIE from .godtube import GodTubeIE from .golem import GolemIE @@ -368,6 +370,7 @@ from .teachingchannel import TeachingChannelIE from .teamcoco import TeamcocoIE from .techtalks import TechTalksIE from .ted import TEDIE +from .telecinco import TelecincoIE from .telemb import TeleMBIE from .tenplay import TenPlayIE from .testurl import TestURLIE @@ -422,6 +425,7 @@ from .videopremium import VideoPremiumIE from .videott import VideoTtIE from .videoweed import VideoWeedIE from .vidme import VidmeIE +from .vidzi import VidziIE from .vimeo import ( VimeoIE, VimeoAlbumIE, @@ -490,10 +494,8 @@ from .youtube import ( YoutubeUserIE, YoutubeWatchLaterIE, ) - from .zdf import ZDFIE - _ALL_CLASSES = [ klass for name, klass in globals().items() diff --git a/youtube_dl/extractor/bild.py b/youtube_dl/extractor/bild.py new file mode 100644 index 000000000..0269d1174 --- /dev/null +++ b/youtube_dl/extractor/bild.py @@ -0,0 +1,39 @@ +#coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import int_or_none + + +class BildIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?bild\.de/(?:[^/]+/)+(?P<display_id>[^/]+)-(?P<id>\d+)(?:,auto=true)?\.bild\.html' + IE_DESC = 'Bild.de' + _TEST = { + 'url': 'http://www.bild.de/video/clip/apple-ipad-air/das-koennen-die-neuen-ipads-38184146.bild.html', + 'md5': 'dd495cbd99f2413502a1713a1156ac8a', + 'info_dict': { + 'id': '38184146', + 'ext': 'mp4', + 'title': 'BILD hat sie getestet', + 'thumbnail': 'http://bilder.bild.de/fotos/stand-das-koennen-die-neuen-ipads-38184138/Bild/1.bild.jpg', + 'duration': 196, + 'description': 'Mit dem iPad Air 2 und dem iPad Mini 3 hat Apple zwei neue Tablet-Modelle präsentiert. BILD-Reporter Sven Stein durfte die Geräte bereits testen. ', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + xml_url = url.split(".bild.html")[0] + ",view=xml.bild.xml" + doc = self._download_xml(xml_url, video_id) + + duration = int_or_none(doc.attrib.get('duration'), scale=1000) + + return { + 'id': video_id, + 'title': doc.attrib['ueberschrift'], + 'description': doc.attrib.get('text'), + 'url': doc.attrib['src'], + 'thumbnail': doc.attrib.get('img'), + 'duration': duration, + } diff --git a/youtube_dl/extractor/cinemassacre.py b/youtube_dl/extractor/cinemassacre.py index 496271be4..d064a28f9 100644 --- a/youtube_dl/extractor/cinemassacre.py +++ b/youtube_dl/extractor/cinemassacre.py @@ -42,7 +42,7 @@ class CinemassacreIE(InfoExtractor): webpage = self._download_webpage(url, display_id) video_date = mobj.group('date_Y') + mobj.group('date_m') + mobj.group('date_d') - mobj = re.search(r'src="(?P<embed_url>http://player\.screenwavemedia\.com/play/[a-zA-Z]+\.php\?id=(?:Cinemassacre-)?(?P<video_id>.+?))"', webpage) + mobj = re.search(r'src="(?P<embed_url>http://player\.screenwavemedia\.com/play/[a-zA-Z]+\.php\?[^"]*\bid=(?:Cinemassacre-)?(?P<video_id>.+?))"', webpage) if not mobj: raise ExtractorError('Can\'t extract embed url and video id') playerdata_url = mobj.group('embed_url') @@ -53,17 +53,22 @@ class CinemassacreIE(InfoExtractor): video_description = self._html_search_regex( r'<div class="entry-content">(?P<description>.+?)</div>', webpage, 'description', flags=re.DOTALL, fatal=False) + video_thumbnail = self._og_search_thumbnail(webpage) playerdata = self._download_webpage(playerdata_url, video_id, 'Downloading player webpage') - video_thumbnail = self._search_regex( - r'image: \'(?P<thumbnail>[^\']+)\'', playerdata, 'thumbnail', fatal=False) - sd_url = self._search_regex(r'file: \'([^\']+)\', label: \'SD\'', playerdata, 'sd_file') - videolist_url = self._search_regex(r'file: \'([^\']+\.smil)\'}', playerdata, 'videolist_url') + vidurl = self._search_regex( + r'\'vidurl\'\s*:\s*"([^\']+)"', playerdata, 'vidurl').replace('\\/', '/') + vidid = self._search_regex( + r'\'vidid\'\s*:\s*"([^\']+)"', playerdata, 'vidid') + videoserver = self._html_search_regex( + r"'videoserver'\s*:\s*'([^']+)'", playerdata, 'videoserver') + + videolist_url = 'http://%s/vod/smil:%s.smil/jwplayer.smil' % (videoserver, vidid) videolist = self._download_xml(videolist_url, video_id, 'Downloading videolist XML') formats = [] - baseurl = sd_url[:sd_url.rfind('/')+1] + baseurl = vidurl[:vidurl.rfind('/')+1] for video in videolist.findall('.//video'): src = video.get('src') if not src: diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index e8366f7f9..cf3781cd6 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -89,6 +89,10 @@ class InfoExtractor(object): format, irrespective of the file format. -1 for default (order by other properties), -2 or smaller for less than default. + * source_preference Order number for this video source + (quality takes higher priority) + -1 for default (order by other properties), + -2 or smaller for less than default. * http_referer HTTP Referer header value to set. * http_method HTTP method to use for the download. * http_headers A dictionary of additional HTTP headers @@ -613,12 +617,13 @@ class InfoExtractor(object): audio_ext_preference, f.get('filesize') if f.get('filesize') is not None else -1, f.get('filesize_approx') if f.get('filesize_approx') is not None else -1, + f.get('source_preference') if f.get('source_preference') is not None else -1, f.get('format_id'), ) formats.sort(key=_formats_key) def http_scheme(self): - """ Either "https:" or "https:", depending on the user's preferences """ + """ Either "http:" or "https:", depending on the user's preferences """ return ( 'http:' if self._downloader.params.get('prefer_insecure', False) diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index f99888ecc..e3057d900 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -39,6 +39,7 @@ class CrunchyrollIE(SubtitlesInfoExtractor): 'thumbnail': 'http://img1.ak.crunchyroll.com/i/spire1-tmb/20c6b5e10f1a47b10516877d3c039cae1380951166_full.jpg', 'uploader': 'Yomiuri Telecasting Corporation (YTV)', 'upload_date': '20131013', + 'url': 're:(?!.*&)', }, 'params': { # rtmp @@ -237,12 +238,14 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text streamdata_req.data = 'req=RpcApiVideoEncode%5FGetStreamInfo&video%5Fencode%5Fquality='+stream_quality+'&media%5Fid='+stream_id+'&video%5Fformat='+stream_format streamdata_req.add_header('Content-Type', 'application/x-www-form-urlencoded') streamdata_req.add_header('Content-Length', str(len(streamdata_req.data))) - streamdata = self._download_webpage(streamdata_req, video_id, note='Downloading media info for '+video_format) - video_url = self._search_regex(r'<host>([^<]+)', streamdata, 'video_url') - video_play_path = self._search_regex(r'<file>([^<]+)', streamdata, 'video_play_path') + streamdata = self._download_xml( + streamdata_req, video_id, + note='Downloading media info for %s' % video_format) + video_url = streamdata.find('.//host').text + video_play_path = streamdata.find('.//file').text formats.append({ 'url': video_url, - 'play_path': video_play_path, + 'play_path': video_play_path, 'ext': 'flv', 'format': video_format, 'format_id': video_format, diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index 07165e330..566e20d76 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -58,7 +58,7 @@ class FranceTVBaseInfoExtractor(InfoExtractor): formats.append({ 'url': video_url, 'format_id': format_id, - 'preference': 2, + 'preference': -1, }) self._sort_formats(formats) diff --git a/youtube_dl/extractor/funnyordie.py b/youtube_dl/extractor/funnyordie.py index d966e8403..ec6d96ada 100644 --- a/youtube_dl/extractor/funnyordie.py +++ b/youtube_dl/extractor/funnyordie.py @@ -37,7 +37,7 @@ class FunnyOrDieIE(InfoExtractor): video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) - links = re.findall(r'<source src="([^"]+/v)\d+\.([^"]+)" type=\'video', webpage) + links = re.findall(r'<source src="([^"]+/v)[^"]+\.([^"]+)" type=\'video', webpage) if not links: raise ExtractorError('No media links available for %s' % video_id) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 9057a6beb..9b6498894 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -380,6 +380,17 @@ class GenericIE(InfoExtractor): 'uploader': 'education-portal.com', }, }, + { + 'url': 'http://thoughtworks.wistia.com/medias/uxjb0lwrcz', + 'md5': 'baf49c2baa8a7de5f3fc145a8506dcd4', + 'info_dict': { + 'id': 'uxjb0lwrcz', + 'ext': 'mp4', + 'title': 'Conversation about Hexagonal Rails Part 1 - ThoughtWorks', + 'duration': 1715.0, + 'uploader': 'thoughtworks.wistia.com', + }, + }, ] def report_following_redirect(self, new_url): @@ -476,7 +487,8 @@ class GenericIE(InfoExtractor): 'Set --default-search "ytsearch" (or run youtube-dl "ytsearch:%s" ) to search YouTube' ) % (url, url), expected=True) else: - assert ':' in default_search + if ':' not in default_search: + default_search += ':' return self.url_result(default_search + url) url, smuggled_data = unsmuggle_url(url) @@ -652,7 +664,7 @@ class GenericIE(InfoExtractor): # Look for embedded Wistia player match = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage) + r'<(?:meta[^>]+?content|iframe[^>]+?src)=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage) if match: embed_url = self._proto_relative_url( unescapeHTML(match.group('url'))) @@ -664,6 +676,7 @@ class GenericIE(InfoExtractor): 'title': video_title, 'id': video_id, } + match = re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage) if match: return { diff --git a/youtube_dl/extractor/glide.py b/youtube_dl/extractor/glide.py new file mode 100644 index 000000000..9561ed5fb --- /dev/null +++ b/youtube_dl/extractor/glide.py @@ -0,0 +1,40 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class GlideIE(InfoExtractor): + IE_DESC = 'Glide mobile video messages (glide.me)' + _VALID_URL = r'https?://share\.glide\.me/(?P<id>[A-Za-z0-9\-=_+]+)' + _TEST = { + 'url': 'http://share.glide.me/UZF8zlmuQbe4mr+7dCiQ0w==', + 'md5': '4466372687352851af2d131cfaa8a4c7', + 'info_dict': { + 'id': 'UZF8zlmuQbe4mr+7dCiQ0w==', + 'ext': 'mp4', + 'title': 'Damon Timm\'s Glide message', + 'thumbnail': 're:^https?://.*?\.cloudfront\.net/.*\.jpg$', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + title = self._html_search_regex( + r'<title>(.*?)</title>', webpage, 'title') + video_url = self.http_scheme() + self._search_regex( + r'<source src="(.*?)" type="video/mp4">', webpage, 'video URL') + thumbnail_url = self._search_regex( + r'<img id="video-thumbnail" src="(.*?)"', + webpage, 'thumbnail url', fatal=False) + thumbnail = ( + thumbnail_url if thumbnail_url is None + else self.http_scheme() + thumbnail_url) + + return { + 'id': video_id, + 'title': title, + 'url': video_url, + 'thumbnail': thumbnail, + } diff --git a/youtube_dl/extractor/hark.py b/youtube_dl/extractor/hark.py index 5bdd08afa..b6cc15b6f 100644 --- a/youtube_dl/extractor/hark.py +++ b/youtube_dl/extractor/hark.py @@ -1,37 +1,33 @@ # -*- coding: utf-8 -*- - -import re -import json +from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import determine_ext + class HarkIE(InfoExtractor): - _VALID_URL = r'https?://www\.hark\.com/clips/(.+?)-.+' + _VALID_URL = r'https?://www\.hark\.com/clips/(?P<id>.+?)-.+' _TEST = { - u'url': u'http://www.hark.com/clips/mmbzyhkgny-obama-beyond-the-afghan-theater-we-only-target-al-qaeda-on-may-23-2013', - u'file': u'mmbzyhkgny.mp3', - u'md5': u'6783a58491b47b92c7c1af5a77d4cbee', - u'info_dict': { - u'title': u"Obama: 'Beyond The Afghan Theater, We Only Target Al Qaeda' on May 23, 2013", - u'description': u'President Barack Obama addressed the nation live on May 23, 2013 in a speech aimed at addressing counter-terrorism policies including the use of drone strikes, detainees at Guantanamo Bay prison facility, and American citizens who are terrorists.', - u'duration': 11, + 'url': 'http://www.hark.com/clips/mmbzyhkgny-obama-beyond-the-afghan-theater-we-only-target-al-qaeda-on-may-23-2013', + 'md5': '6783a58491b47b92c7c1af5a77d4cbee', + 'info_dict': { + 'id': 'mmbzyhkgny', + 'ext': 'mp3', + 'title': 'Obama: \'Beyond The Afghan Theater, We Only Target Al Qaeda\' on May 23, 2013', + 'description': 'President Barack Obama addressed the nation live on May 23, 2013 in a speech aimed at addressing counter-terrorism policies including the use of drone strikes, detainees at Guantanamo Bay prison facility, and American citizens who are terrorists.', + 'duration': 11, } } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group(1) - json_url = "http://www.hark.com/clips/%s.json" %(video_id) - info_json = self._download_webpage(json_url, video_id) - info = json.loads(info_json) - final_url = info['url'] + video_id = self._match_id(url) + data = self._download_json( + 'http://www.hark.com/clips/%s.json' % video_id, video_id) - return {'id': video_id, - 'url' : final_url, - 'title': info['name'], - 'ext': determine_ext(final_url), - 'description': info['description'], - 'thumbnail': info['image_original'], - 'duration': info['duration'], - } + return { + 'id': video_id, + 'url': data['url'], + 'title': data['name'], + 'description': data.get('description'), + 'thumbnail': data.get('image_original'), + 'duration': data.get('duration'), + } diff --git a/youtube_dl/extractor/mitele.py b/youtube_dl/extractor/mitele.py index 979f3d692..6691521e5 100644 --- a/youtube_dl/extractor/mitele.py +++ b/youtube_dl/extractor/mitele.py @@ -6,6 +6,7 @@ import json from .common import InfoExtractor from ..utils import ( compat_urllib_parse, + compat_urlparse, get_element_by_attribute, parse_duration, strip_jsonp, @@ -39,13 +40,21 @@ class MiTeleIE(InfoExtractor): ).replace('\'', '"') embed_data = json.loads(embed_data_json) - info_url = embed_data['flashvars']['host'] + domain = embed_data['mediaUrl'] + if not domain.startswith('http'): + # only happens in telecinco.es videos + domain = 'http://' + domain + info_url = compat_urlparse.urljoin( + domain, + compat_urllib_parse.unquote(embed_data['flashvars']['host']) + ) info_el = self._download_xml(info_url, episode).find('./video/info') video_link = info_el.find('videoUrl/link').text token_query = compat_urllib_parse.urlencode({'id': video_link}) token_info = self._download_json( - 'http://token.mitele.es/?' + token_query, episode, + embed_data['flashvars']['ov_tk'] + '?' + token_query, + episode, transform_source=strip_jsonp ) diff --git a/youtube_dl/extractor/motherless.py b/youtube_dl/extractor/motherless.py index 6229b2173..3621ff99e 100644 --- a/youtube_dl/extractor/motherless.py +++ b/youtube_dl/extractor/motherless.py @@ -5,20 +5,20 @@ import re from .common import InfoExtractor from ..utils import ( - int_or_none, + str_to_int, unified_strdate, ) class MotherlessIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?motherless\.com/(?P<id>[A-Z0-9]+)' + _VALID_URL = r'http://(?:www\.)?motherless\.com/(?:g/[a-z0-9_]+/)?(?P<id>[A-Z0-9]+)' _TESTS = [ { 'url': 'http://motherless.com/AC3FFE1', - 'md5': '5527fef81d2e529215dad3c2d744a7d9', + 'md5': '310f62e325a9fafe64f68c0bccb6e75f', 'info_dict': { 'id': 'AC3FFE1', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Fucked in the ass while playing PS3', 'categories': ['Gaming', 'anal', 'reluctant', 'rough', 'Wife'], 'upload_date': '20100913', @@ -40,33 +40,51 @@ class MotherlessIE(InfoExtractor): 'thumbnail': 're:http://.*\.jpg', 'age_limit': 18, } + }, + { + 'url': 'http://motherless.com/g/cosplay/633979F', + 'md5': '0b2a43f447a49c3e649c93ad1fafa4a0', + 'info_dict': { + 'id': '633979F', + 'ext': 'mp4', + 'title': 'Turtlette', + 'categories': ['superheroine heroine superher'], + 'upload_date': '20140827', + 'uploader_id': 'shade0230', + 'thumbnail': 're:http://.*\.jpg', + 'age_limit': 18, + } } ] - def _real_extract(self,url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - + def _real_extract(self, url): + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - title = self._html_search_regex(r'id="view-upload-title">\s+([^<]+)<', webpage, 'title') - - video_url = self._html_search_regex(r'setup\(\{\s+"file".+: "([^"]+)",', webpage, 'video_url') + title = self._html_search_regex( + r'id="view-upload-title">\s+([^<]+)<', webpage, 'title') + video_url = self._html_search_regex( + r'setup\(\{\s+"file".+: "([^"]+)",', webpage, 'video URL') age_limit = self._rta_search(webpage) - - view_count = self._html_search_regex(r'<strong>Views</strong>\s+([^<]+)<', webpage, 'view_count') + view_count = str_to_int(self._html_search_regex( + r'<strong>Views</strong>\s+([^<]+)<', + webpage, 'view count', fatal=False)) + like_count = str_to_int(self._html_search_regex( + r'<strong>Favorited</strong>\s+([^<]+)<', + webpage, 'like count', fatal=False)) - upload_date = self._html_search_regex(r'<strong>Uploaded</strong>\s+([^<]+)<', webpage, 'upload_date') + upload_date = self._html_search_regex( + r'<strong>Uploaded</strong>\s+([^<]+)<', webpage, 'upload date') if 'Ago' in upload_date: days = int(re.search(r'([0-9]+)', upload_date).group(1)) upload_date = (datetime.datetime.now() - datetime.timedelta(days=days)).strftime('%Y%m%d') else: upload_date = unified_strdate(upload_date) - like_count = self._html_search_regex(r'<strong>Favorited</strong>\s+([^<]+)<', webpage, 'like_count') - comment_count = webpage.count('class="media-comment-contents"') - uploader_id = self._html_search_regex(r'"thumb-member-username">\s+<a href="/m/([^"]+)"', webpage, 'uploader_id') + uploader_id = self._html_search_regex( + r'"thumb-member-username">\s+<a href="/m/([^"]+)"', + webpage, 'uploader_id') categories = self._html_search_meta('keywords', webpage) if categories: @@ -79,8 +97,8 @@ class MotherlessIE(InfoExtractor): 'uploader_id': uploader_id, 'thumbnail': self._og_search_thumbnail(webpage), 'categories': categories, - 'view_count': int_or_none(view_count.replace(',', '')), - 'like_count': int_or_none(like_count.replace(',', '')), + 'view_count': view_count, + 'like_count': like_count, 'comment_count': comment_count, 'age_limit': age_limit, 'url': video_url, diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index 8f140d626..6118ed5c2 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -80,8 +80,14 @@ class PBSIE(InfoExtractor): 'thumbnail': 're:^https?://.*\.jpg$', 'upload_date': '20140122', } + }, + { + 'url': 'http://www.pbs.org/wgbh/pages/frontline/united-states-of-secrets/', + 'info_dict': { + 'id': 'united-states-of-secrets', + }, + 'playlist_count': 2, } - ] def _extract_webpage(self, url): @@ -96,6 +102,12 @@ class PBSIE(InfoExtractor): r'<input type="hidden" id="air_date_[0-9]+" value="([^"]+)"', webpage, 'upload date', default=None)) + # tabbed frontline videos + tabbed_videos = re.findall( + r'<div[^>]+class="videotab[^"]*"[^>]+vid="(\d+)"', webpage) + if tabbed_videos: + return tabbed_videos, presumptive_id, upload_date + MEDIA_ID_REGEXES = [ r"div\s*:\s*'videoembed'\s*,\s*mediaid\s*:\s*'(\d+)'", # frontline video embed r'class="coveplayerid">([^<]+)<', # coveplayer @@ -130,6 +142,12 @@ class PBSIE(InfoExtractor): def _real_extract(self, url): video_id, display_id, upload_date = self._extract_webpage(url) + if isinstance(video_id, list): + entries = [self.url_result( + 'http://video.pbs.org/video/%s' % vid_id, 'PBS', vid_id) + for vid_id in video_id] + return self.playlist_result(entries, display_id) + info_url = 'http://video.pbs.org/videoInfo/%s?format=json' % video_id info = self._download_json(info_url, display_id) diff --git a/youtube_dl/extractor/telecinco.py b/youtube_dl/extractor/telecinco.py new file mode 100644 index 000000000..db9788c18 --- /dev/null +++ b/youtube_dl/extractor/telecinco.py @@ -0,0 +1,19 @@ +#coding: utf-8 +from __future__ import unicode_literals + +from .mitele import MiTeleIE + + +class TelecincoIE(MiTeleIE): + IE_NAME = 'telecinco.es' + _VALID_URL = r'https?://www\.telecinco\.es/[^/]+/[^/]+/[^/]+/(?P<episode>.*?)\.html' + + _TEST = { + 'url': 'http://www.telecinco.es/robinfood/temporada-01/t01xp14/Bacalao-cocochas-pil-pil_0_1876350223.html', + 'info_dict': { + 'id': 'MDSVID20141015_0058', + 'ext': 'mp4', + 'title': 'Con Martín Berasategui, hacer un bacalao al ...', + 'duration': 662, + }, + } diff --git a/youtube_dl/extractor/viddler.py b/youtube_dl/extractor/viddler.py index 9328ef4a2..0faa729c6 100644 --- a/youtube_dl/extractor/viddler.py +++ b/youtube_dl/extractor/viddler.py @@ -1,55 +1,85 @@ -import json -import re +from __future__ import unicode_literals from .common import InfoExtractor +from ..utils import ( + float_or_none, + int_or_none, +) class ViddlerIE(InfoExtractor): - _VALID_URL = r'(?P<domain>https?://(?:www\.)?viddler\.com)/(?:v|embed|player)/(?P<id>[a-z0-9]+)' + _VALID_URL = r'https?://(?:www\.)?viddler\.com/(?:v|embed|player)/(?P<id>[a-z0-9]+)' _TEST = { - u"url": u"http://www.viddler.com/v/43903784", - u'file': u'43903784.mp4', - u'md5': u'fbbaedf7813e514eb7ca30410f439ac9', - u'info_dict': { - u"title": u"Video Made Easy", - u"uploader": u"viddler", - u"duration": 100.89, + "url": "http://www.viddler.com/v/43903784", + 'md5': 'ae43ad7cb59431ce043f0ff7fa13cbf4', + 'info_dict': { + 'id': '43903784', + 'ext': 'mp4', + "title": "Video Made Easy", + 'description': 'You don\'t need to be a professional to make high-quality video content. Viddler provides some quick and easy tips on how to produce great video content with limited resources. ', + "uploader": "viddler", + 'timestamp': 1335371429, + 'upload_date': '20120425', + "duration": 100.89, + 'thumbnail': 're:^https?://.*\.jpg$', + 'view_count': int, + 'categories': ['video content', 'high quality video', 'video made easy', 'how to produce video with limited resources', 'viddler'], } } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - - embed_url = mobj.group('domain') + u'/embed/' + video_id - webpage = self._download_webpage(embed_url, video_id) - - video_sources_code = self._search_regex( - r"(?ms)sources\s*:\s*(\{.*?\})", webpage, u'video URLs') - video_sources = json.loads(video_sources_code.replace("'", '"')) - - formats = [{ - 'url': video_url, - 'format': format_id, - } for video_url, format_id in video_sources.items()] - - title = self._html_search_regex( - r"title\s*:\s*'([^']*)'", webpage, u'title') - uploader = self._html_search_regex( - r"authorName\s*:\s*'([^']*)'", webpage, u'uploader', fatal=False) - duration_s = self._html_search_regex( - r"duration\s*:\s*([0-9.]*)", webpage, u'duration', fatal=False) - duration = float(duration_s) if duration_s else None - thumbnail = self._html_search_regex( - r"thumbnail\s*:\s*'([^']*)'", - webpage, u'thumbnail', fatal=False) + video_id = self._match_id(url) + + json_url = ( + 'http://api.viddler.com/api/v2/viddler.videos.getPlaybackDetails.json?video_id=%s&key=v0vhrt7bg2xq1vyxhkct' % + video_id) + data = self._download_json(json_url, video_id)['video'] + + formats = [] + for filed in data['files']: + if filed.get('status', 'ready') != 'ready': + continue + f = { + 'format_id': filed['profile_id'], + 'format_note': filed['profile_name'], + 'url': self._proto_relative_url(filed['url']), + 'width': int_or_none(filed.get('width')), + 'height': int_or_none(filed.get('height')), + 'filesize': int_or_none(filed.get('size')), + 'ext': filed.get('ext'), + 'source_preference': -1, + } + formats.append(f) + + if filed.get('cdn_url'): + f = f.copy() + f['url'] = self._proto_relative_url(filed['cdn_url']) + f['format_id'] = filed['profile_id'] + '-cdn' + f['source_preference'] = 1 + formats.append(f) + + if filed.get('html5_video_source'): + f = f.copy() + f['url'] = self._proto_relative_url( + filed['html5_video_source']) + f['format_id'] = filed['profile_id'] + '-html5' + f['source_preference'] = 0 + formats.append(f) + self._sort_formats(formats) + + categories = [ + t.get('text') for t in data.get('tags', []) if 'text' in t] return { '_type': 'video', 'id': video_id, - 'title': title, - 'thumbnail': thumbnail, - 'uploader': uploader, - 'duration': duration, + 'title': data['title'], 'formats': formats, + 'description': data.get('description'), + 'timestamp': int_or_none(data.get('upload_time')), + 'thumbnail': self._proto_relative_url(data.get('thumbnail_url')), + 'uploader': data.get('author'), + 'duration': float_or_none(data.get('length')), + 'view_count': int_or_none(data.get('view_count')), + 'categories': categories, } diff --git a/youtube_dl/extractor/vidzi.py b/youtube_dl/extractor/vidzi.py new file mode 100644 index 000000000..669979e13 --- /dev/null +++ b/youtube_dl/extractor/vidzi.py @@ -0,0 +1,33 @@ +#coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class VidziIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?vidzi\.tv/(?P<id>\w+)' + _TEST = { + 'url': 'http://vidzi.tv/cghql9yq6emu.html', + 'md5': '4f16c71ca0c8c8635ab6932b5f3f1660', + 'info_dict': { + 'id': 'cghql9yq6emu', + 'ext': 'mp4', + 'title': 'youtube-dl test video 1\\\\2\'3/4<5\\\\6ä7↭', + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + video_url = self._html_search_regex( + r'{\s*file\s*:\s*"([^"]+)"\s*}', webpage, 'video url') + title = self._html_search_regex( + r'(?s)<h2 class="video-title">(.*?)</h2>', webpage, 'title') + + return { + 'id': video_id, + 'title': title, + 'url': video_url, + } +
\ No newline at end of file diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 649361bde..2ccc63fc5 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -159,6 +159,11 @@ def parseOpts(overrideArguments=None): '--ignore-config', action='store_true', help='Do not read configuration files. When given in the global configuration file /etc/youtube-dl.conf: do not read the user configuration in ~/.config/youtube-dl.conf (%APPDATA%/youtube-dl/config.txt on Windows)') + general.add_option( + '--flat-playlist', + action='store_const', dest='extract_flat', const='in_playlist', + default=False, + help='Do not extract the videos of a playlist, only list them.') selection = optparse.OptionGroup(parser, 'Video Selection') selection.add_option( @@ -413,6 +418,10 @@ def parseOpts(overrideArguments=None): action='store_true', dest='dumpjson', default=False, help='simulate, quiet but print JSON information. See --output for a description of available keys.') verbosity.add_option( + '-J', '--dump-single-json', + action='store_true', dest='dump_single_json', default=False, + help='simulate, quiet but print JSON information for each command-line argument. If the URL refers to a playlist, dump the whole playlist information in a single line.') + verbosity.add_option( '--newline', action='store_true', dest='progress_with_newline', default=False, help='output progress bar as new lines') diff --git a/youtube_dl/version.py b/youtube_dl/version.py index e7f6adef1..d822ae330 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2014.10.18' +__version__ = '2014.10.25' |