diff options
Diffstat (limited to 'youtube_dl')
-rw-r--r-- | youtube_dl/__init__.py | 24 | ||||
-rw-r--r-- | youtube_dl/extractor/__init__.py | 3 | ||||
-rw-r--r-- | youtube_dl/extractor/arte.py | 63 | ||||
-rw-r--r-- | youtube_dl/extractor/common.py | 1 | ||||
-rw-r--r-- | youtube_dl/extractor/statigram.py | 4 | ||||
-rw-r--r-- | youtube_dl/extractor/ted.py | 2 | ||||
-rw-r--r-- | youtube_dl/extractor/tf1.py | 35 | ||||
-rw-r--r-- | youtube_dl/extractor/tumblr.py | 8 | ||||
-rw-r--r-- | youtube_dl/extractor/tutv.py | 41 | ||||
-rw-r--r-- | youtube_dl/extractor/vimeo.py | 2 | ||||
-rw-r--r-- | youtube_dl/extractor/wat.py | 84 | ||||
-rw-r--r-- | youtube_dl/extractor/youtube.py | 5 | ||||
-rw-r--r-- | youtube_dl/update.py | 2 | ||||
-rw-r--r-- | youtube_dl/version.py | 2 |
14 files changed, 247 insertions, 29 deletions
diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 6334ce3c4..98388a9f3 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -118,6 +118,7 @@ def parseOpts(overrideArguments=None): selection = optparse.OptionGroup(parser, 'Video Selection') authentication = optparse.OptionGroup(parser, 'Authentication Options') video_format = optparse.OptionGroup(parser, 'Video Format Options') + downloader = optparse.OptionGroup(parser, 'Download Options') postproc = optparse.OptionGroup(parser, 'Post-processing Options') filesystem = optparse.OptionGroup(parser, 'Filesystem Options') verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options') @@ -130,15 +131,6 @@ def parseOpts(overrideArguments=None): action='store_true', dest='update_self', help='update this program to latest version') general.add_option('-i', '--ignore-errors', action='store_true', dest='ignoreerrors', help='continue on download errors', default=False) - general.add_option('-r', '--rate-limit', - dest='ratelimit', metavar='LIMIT', help='maximum download rate (e.g. 50k or 44.6m)') - general.add_option('-R', '--retries', - dest='retries', metavar='RETRIES', help='number of retries (default is %default)', default=10) - general.add_option('--buffer-size', - dest='buffersize', metavar='SIZE', help='size of download buffer (e.g. 1024 or 16k) (default is %default)', default="1024") - general.add_option('--no-resize-buffer', - action='store_true', dest='noresizebuffer', - help='do not automatically adjust the buffer size. By default, the buffer size is automatically resized from an initial value of SIZE.', default=False) general.add_option('--dump-user-agent', action='store_true', dest='dump_user_agent', help='display the current browser identification', default=False) @@ -152,7 +144,7 @@ def parseOpts(overrideArguments=None): help='List all supported extractors and the URLs they would handle', default=False) general.add_option('--proxy', dest='proxy', default=None, help='Use the specified HTTP/HTTPS proxy', metavar='URL') general.add_option('--no-check-certificate', action='store_true', dest='no_check_certificate', default=False, help='Suppress HTTPS certificate validation.') - general.add_option('--test', action='store_true', dest='test', default=False, help=optparse.SUPPRESS_HELP) + selection.add_option('--playlist-start', dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is %default)', default=1) @@ -211,6 +203,17 @@ def parseOpts(overrideArguments=None): action='store', dest='subtitleslang', metavar='LANG', help='language of the subtitles to download (optional) use IETF language tags like \'en\'') + downloader.add_option('-r', '--rate-limit', + dest='ratelimit', metavar='LIMIT', help='maximum download rate (e.g. 50k or 44.6m)') + downloader.add_option('-R', '--retries', + dest='retries', metavar='RETRIES', help='number of retries (default is %default)', default=10) + downloader.add_option('--buffer-size', + dest='buffersize', metavar='SIZE', help='size of download buffer (e.g. 1024 or 16k) (default is %default)', default="1024") + downloader.add_option('--no-resize-buffer', + action='store_true', dest='noresizebuffer', + help='do not automatically adjust the buffer size. By default, the buffer size is automatically resized from an initial value of SIZE.', default=False) + downloader.add_option('--test', action='store_true', dest='test', default=False, help=optparse.SUPPRESS_HELP) + verbosity.add_option('-q', '--quiet', action='store_true', dest='quiet', help='activates quiet mode', default=False) verbosity.add_option('-s', '--simulate', @@ -317,6 +320,7 @@ def parseOpts(overrideArguments=None): parser.add_option_group(general) parser.add_option_group(selection) + parser.add_option_group(downloader) parser.add_option_group(filesystem) parser.add_option_group(verbosity) parser.add_option_group(video_format) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index ec800d9fb..ba0e86713 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -47,14 +47,17 @@ from .statigram import StatigramIE from .steam import SteamIE from .teamcoco import TeamcocoIE from .ted import TEDIE +from .tf1 import TF1IE from .traileraddict import TrailerAddictIE from .tudou import TudouIE from .tumblr import TumblrIE +from .tutv import TutvIE from .ustream import UstreamIE from .vbox7 import Vbox7IE from .vevo import VevoIE from .vimeo import VimeoIE from .vine import VineIE +from .wat import WatIE from .wimp import WimpIE from .worldstarhiphop import WorldStarHipHopIE from .xhamster import XHamsterIE diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index b061b9566..183274eb7 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -11,11 +11,21 @@ from ..utils import ( ) class ArteTvIE(InfoExtractor): - _VALID_URL = r'(?:http://)?www\.arte.tv/guide/(?:fr|de)/(?:(?:sendungen|emissions)/)?(?P<id>.*?)/(?P<name>.*?)(\?.*)?' + """ + There are two sources of video in arte.tv: videos.arte.tv and + www.arte.tv/guide, the extraction process is different for each one. + The videos expire in 7 days, so we can't add tests. + """ + _EMISSION_URL = r'(?:http://)?www\.arte.tv/guide/(?:fr|de)/(?:(?:sendungen|emissions)/)?(?P<id>.*?)/(?P<name>.*?)(\?.*)?' + _VIDEOS_URL = r'(?:http://)?videos.arte.tv/(?:fr|de)/.*-(?P<id>.*?).html' _LIVE_URL = r'index-[0-9]+\.html$' IE_NAME = u'arte.tv' + @classmethod + def suitable(cls, url): + return any(re.match(regex, url) for regex in (cls._EMISSION_URL, cls._VIDEOS_URL)) + # TODO implement Live Stream # def extractLiveStream(self, url): # video_lang = url.split('/')[-4] @@ -44,17 +54,26 @@ class ArteTvIE(InfoExtractor): # video_url = u'%s/%s' % (info.get('url'), info.get('path')) def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - name = mobj.group('name') - # This is not a real id, it can be for example AJT for the news - # http://www.arte.tv/guide/fr/emissions/AJT/arte-journal - video_id = mobj.group('id') + mobj = re.match(self._EMISSION_URL, url) + if mobj is not None: + name = mobj.group('name') + # This is not a real id, it can be for example AJT for the news + # http://www.arte.tv/guide/fr/emissions/AJT/arte-journal + video_id = mobj.group('id') + return self._extract_emission(url, video_id) + + mobj = re.match(self._VIDEOS_URL, url) + if mobj is not None: + id = mobj.group('id') + return self._extract_video(url, id) if re.search(self._LIVE_URL, video_id) is not None: raise ExtractorError(u'Arte live streams are not yet supported, sorry') # self.extractLiveStream(url) # return + def _extract_emission(self, url, video_id): + """Extract from www.arte.tv/guide""" webpage = self._download_webpage(url, video_id) json_url = self._html_search_regex(r'arte_vp_url="(.*?)"', webpage, 'json url') @@ -68,6 +87,7 @@ class ArteTvIE(InfoExtractor): 'description': player_info['VDE'], 'upload_date': unified_strdate(player_info['VDA'].split(' ')[0]), 'thumbnail': player_info['programImage'], + 'ext': 'flv', } formats = player_info['VSR'].values() @@ -78,9 +98,36 @@ class ArteTvIE(InfoExtractor): if format_info['mediaType'] == u'rtmp': info_dict['url'] = format_info['streamer'] info_dict['play_path'] = 'mp4:' + format_info['url'] - info_dict['ext'] = 'mp4' else: info_dict['url'] = format_info['url'] - info_dict['ext'] = 'mp4' return info_dict + + def _extract_video(self, url, video_id): + """Extract from videos.arte.tv""" + config_xml_url = url.replace('/videos/', '/do_delegate/videos/') + config_xml_url = config_xml_url.replace('.html', ',view,asPlayerXml.xml') + config_xml = self._download_webpage(config_xml_url, video_id) + config_xml_url = self._html_search_regex(r'<video lang=".*?" ref="(.*?)"', config_xml, 'config xml url') + config_xml = self._download_webpage(config_xml_url, video_id) + + video_urls = list(re.finditer(r'<url quality="(?P<quality>.*?)">(?P<url>.*?)</url>', config_xml)) + def _key(m): + quality = m.group('quality') + if quality == 'hd': + return 2 + else: + return 1 + # We pick the best quality + video_urls = sorted(video_urls, key=_key) + video_url = list(video_urls)[-1].group('url') + + title = self._html_search_regex(r'<name>(.*?)</name>', config_xml, 'title') + thumbnail = self._html_search_regex(r'<firstThumbnailUrl>(.*?)</firstThumbnailUrl>', + config_xml, 'thumbnail') + return {'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'url': video_url, + 'ext': 'flv', + } diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 64d63e109..5c6fd7945 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -44,6 +44,7 @@ class InfoExtractor(object): location: Physical location of the video. player_url: SWF Player URL (used for rtmpdump). subtitles: The subtitle file contents. + view_count: How many users have watched the video on the platform. urlhandle: [internal] The urlHandle to be used to download the file, like returned by urllib.request.urlopen diff --git a/youtube_dl/extractor/statigram.py b/youtube_dl/extractor/statigram.py index 25bf5b85c..ae9a63e8b 100644 --- a/youtube_dl/extractor/statigram.py +++ b/youtube_dl/extractor/statigram.py @@ -10,7 +10,7 @@ class StatigramIE(InfoExtractor): u'md5': u'deda4ff333abe2e118740321e992605b', u'info_dict': { u"uploader_id": u"videoseconds", - u"title": u"Instagram photo by @videoseconds (Videos)" + u"title": u"Instagram photo by @videoseconds" } } @@ -27,7 +27,7 @@ class StatigramIE(InfoExtractor): html_title = self._html_search_regex( r'<title>(.+?)</title>', webpage, u'title') - title = html_title.rpartition(u' | Statigram')[0] + title = re.sub(r'(?: *\(Videos?\))? \| Statigram$', '', html_title) uploader_id = self._html_search_regex( r'@([^ ]+)', title, u'uploader name', fatal=False) ext = 'mp4' diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py index 46b66582c..8b73b8340 100644 --- a/youtube_dl/extractor/ted.py +++ b/youtube_dl/extractor/ted.py @@ -17,7 +17,7 @@ class TEDIE(InfoExtractor): _TEST = { u'url': u'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html', u'file': u'102.mp4', - u'md5': u'8cd9dfa41ee000ce658fd48fb5d89a61', + u'md5': u'2d76ee1576672e0bd8f187513267adf6', u'info_dict': { u"description": u"md5:c6fa72e6eedbd938c9caf6b2702f5922", u"title": u"Dan Dennett: The illusion of consciousness" diff --git a/youtube_dl/extractor/tf1.py b/youtube_dl/extractor/tf1.py new file mode 100644 index 000000000..e0ffeced5 --- /dev/null +++ b/youtube_dl/extractor/tf1.py @@ -0,0 +1,35 @@ +# coding: utf-8 + +import json +import re + +from .common import InfoExtractor + +class TF1IE(InfoExtractor): + """ + TF1 uses the wat.tv player, currently it can only download videos with the + html5 player enabled, it cannot download HD videos. + """ + _VALID_URL = r'http://videos.tf1.fr/.*-(.*?).html' + _TEST = { + u'url': u'http://videos.tf1.fr/auto-moto/citroen-grand-c4-picasso-2013-presentation-officielle-8062060.html', + u'file': u'10635995.mp4', + u'md5': u'66789d3e91278d332f75e1feb7aea327', + u'info_dict': { + u'title': u'Citroën Grand C4 Picasso 2013 : présentation officielle', + u'description': u'Vidéo officielle du nouveau Citroën Grand C4 Picasso, lancé à l\'automne 2013.', + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + id = mobj.group(1) + webpage = self._download_webpage(url, id) + embed_url = self._html_search_regex(r'"(https://www.wat.tv/embedframe/.*?)"', + webpage, 'embed url') + embed_page = self._download_webpage(embed_url, id, u'Downloading embed player page') + wat_id = self._search_regex(r'UVID=(.*?)&', embed_page, 'wat id') + wat_info = self._download_webpage('http://www.wat.tv/interface/contentv3/%s' % wat_id, id, u'Downloading Wat info') + wat_info = json.loads(wat_info)['media'] + wat_url = wat_info['url'] + return self.url_result(wat_url, 'Wat') diff --git a/youtube_dl/extractor/tumblr.py b/youtube_dl/extractor/tumblr.py index 243f04bff..ad5840ca2 100644 --- a/youtube_dl/extractor/tumblr.py +++ b/youtube_dl/extractor/tumblr.py @@ -9,11 +9,11 @@ from ..utils import ( class TumblrIE(InfoExtractor): _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)' _TEST = { - u'url': u'http://resigno.tumblr.com/post/53364321212/e-de-extrema-importancia-que-esse-video-seja', - u'file': u'53364321212.mp4', - u'md5': u'0716d3dd51baf68a28b40fdf1251494e', + u'url': u'http://tatianamaslanydaily.tumblr.com/post/54196191430/orphan-black-dvd-extra-behind-the-scenes', + u'file': u'54196191430.mp4', + u'md5': u'479bb068e5b16462f5176a6828829767', u'info_dict': { - u"title": u"Rafael Lemos" + u"title": u"tatiana maslany news" } } diff --git a/youtube_dl/extractor/tutv.py b/youtube_dl/extractor/tutv.py new file mode 100644 index 000000000..fcaa6ac01 --- /dev/null +++ b/youtube_dl/extractor/tutv.py @@ -0,0 +1,41 @@ +import base64 +import re + +from .common import InfoExtractor +from ..utils import ( + compat_parse_qs, +) + +class TutvIE(InfoExtractor): + _VALID_URL=r'https?://(?:www\.)?tu\.tv/videos/(?P<id>[^/?]+)' + _TEST = { + u'url': u'http://tu.tv/videos/noah-en-pabellon-cuahutemoc', + u'file': u'2742556.flv', + u'md5': u'5eb766671f69b82e528dc1e7769c5cb2', + u'info_dict': { + u"title": u"Noah en pabellon cuahutemoc" + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + webpage = self._download_webpage(url, video_id) + title = self._html_search_regex( + r'<meta property="og:title" content="(.*?)">', webpage, u'title') + internal_id = self._search_regex(r'codVideo=([0-9]+)', webpage, u'internal video ID') + + data_url = u'http://tu.tv/flvurl.php?codVideo=' + str(internal_id) + data_content = self._download_webpage(data_url, video_id, note=u'Downloading video info') + data = compat_parse_qs(data_content) + video_url = base64.b64decode(data['kpt'][0]).decode('utf-8') + ext = video_url.partition(u'?')[0].rpartition(u'.')[2] + + info = { + 'id': internal_id, + 'url': video_url, + 'ext': ext, + 'title': title, + } + return [info] diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 11741e27d..7c4562790 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -16,7 +16,7 @@ class VimeoIE(InfoExtractor): """Information extractor for vimeo.com.""" # _VALID_URL matches Vimeo URLs - _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)' + _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)(?:[?].*)?$' IE_NAME = u'vimeo' _TEST = { u'url': u'http://vimeo.com/56015672', diff --git a/youtube_dl/extractor/wat.py b/youtube_dl/extractor/wat.py new file mode 100644 index 000000000..0d1302cd2 --- /dev/null +++ b/youtube_dl/extractor/wat.py @@ -0,0 +1,84 @@ +# coding: utf-8 + +import json +import re + +from .common import InfoExtractor + +from ..utils import ( + compat_urllib_parse, + unified_strdate, +) + + +class WatIE(InfoExtractor): + _VALID_URL=r'http://www.wat.tv/.*-(?P<shortID>.*?)_.*?.html' + IE_NAME = 'wat.tv' + _TEST = { + u'url': u'http://www.wat.tv/video/world-war-philadelphia-vost-6bv55_2fjr7_.html', + u'file': u'10631273.mp4', + u'md5': u'0a4fe7870f31eaeabb5e25fd8da8414a', + u'info_dict': { + u'title': u'World War Z - Philadelphia VOST', + u'description': u'La menace est partout. Que se passe-t-il à Philadelphia ?\r\nWORLD WAR Z, avec Brad Pitt, au cinéma le 3 juillet.\r\nhttp://www.worldwarz.fr', + } + } + + def download_video_info(self, real_id): + # 'contentv4' is used in the website, but it also returns the related + # videos, we don't need them + info = self._download_webpage('http://www.wat.tv/interface/contentv3/' + real_id, real_id, 'Downloading video info') + info = json.loads(info) + return info['media'] + + + def _real_extract(self, url): + def real_id_for_chapter(chapter): + return chapter['tc_start'].split('-')[0] + mobj = re.match(self._VALID_URL, url) + short_id = mobj.group('shortID') + webpage = self._download_webpage(url, short_id) + real_id = self._search_regex(r'xtpage = ".*-(.*?)";', webpage, 'real id') + + video_info = self.download_video_info(real_id) + chapters = video_info['chapters'] + first_chapter = chapters[0] + + if real_id_for_chapter(first_chapter) != real_id: + self.to_screen('Multipart video detected') + chapter_urls = [] + for chapter in chapters: + chapter_id = real_id_for_chapter(chapter) + # Yes, when we this chapter is processed by WatIE, + # it will download the info again + chapter_info = self.download_video_info(chapter_id) + chapter_urls.append(chapter_info['url']) + entries = [self.url_result(chapter_url) for chapter_url in chapter_urls] + return self.playlist_result(entries, real_id, video_info['title']) + + # Otherwise we can continue and extract just one part, we have to use + # the short id for getting the video url + player_data = compat_urllib_parse.urlencode({'shortVideoId': short_id, + 'html5': '1'}) + player_info = self._download_webpage('http://www.wat.tv/player?' + player_data, + real_id, u'Downloading player info') + player = json.loads(player_info)['player'] + html5_player = self._html_search_regex(r'iframe src="(.*?)"', player, + 'html5 player') + player_webpage = self._download_webpage(html5_player, real_id, + u'Downloading player webpage') + + video_url = self._search_regex(r'urlhtml5 : "(.*?)"', player_webpage, + 'video url') + info = {'id': real_id, + 'url': video_url, + 'ext': 'mp4', + 'title': first_chapter['title'], + 'thumbnail': first_chapter['preview'], + 'description': first_chapter['description'], + 'view_count': video_info['views'], + } + if 'date_diffusion' in first_chapter: + info['upload_date'] = unified_strdate(first_chapter['date_diffusion']) + + return info diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 96d8257d9..109c8a93f 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -168,7 +168,7 @@ class YoutubeIE(InfoExtractor): self.to_screen(u'RTMP download detected') def _decrypt_signature(self, s): - """Decrypt the key""" + """Turn the encrypted s field into a working signature""" if len(s) == 88: return s[48] + s[81:67:-1] + s[82] + s[66:62:-1] + s[85] + s[61:48:-1] + s[67] + s[47:12:-1] + s[3] + s[11:3:-1] + s[2] + s[12] @@ -402,6 +402,9 @@ class YoutubeIE(InfoExtractor): return video_id def _real_extract(self, url): + if re.match(r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$', url): + self._downloader.report_warning(u'Did you forget to quote the URL? Remember that & is a meta-character in most shells, so you want to put the URL in quotes, like youtube-dl \'http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc\' (or simply youtube-dl BaW_jenozKc ).') + # Extract original video URL from URL with redirection, like age verification, using next_url parameter mobj = re.search(self._NEXT_URL_RE, url) if mobj: diff --git a/youtube_dl/update.py b/youtube_dl/update.py index eab8417a5..ccab6f27f 100644 --- a/youtube_dl/update.py +++ b/youtube_dl/update.py @@ -44,7 +44,7 @@ def update_self(to_screen, verbose, filename): if not isinstance(globals().get('__loader__'), zipimporter) and not hasattr(sys, "frozen"): - to_screen(u'It looks like you installed youtube-dl with pip, setup.py or a tarball. Please use that to update.') + to_screen(u'It looks like you installed youtube-dl with a package manager, pip, setup.py or a tarball. Please use that to update.') return # Check if there is a new version diff --git a/youtube_dl/version.py b/youtube_dl/version.py index af2af2d8b..7bba3a883 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.06.34.2' +__version__ = '2013.06.34.4' |