diff options
Diffstat (limited to 'youtube_dl')
26 files changed, 369 insertions, 166 deletions
diff --git a/youtube_dl/FileDownloader.py b/youtube_dl/FileDownloader.py index 8ecabab1a..088f59586 100644 --- a/youtube_dl/FileDownloader.py +++ b/youtube_dl/FileDownloader.py @@ -4,12 +4,19 @@ import re import subprocess import sys import time -import traceback if os.name == 'nt': import ctypes -from .utils import * +from .utils import ( + compat_urllib_error, + compat_urllib_request, + ContentTooShortError, + determine_ext, + encodeFilename, + sanitize_open, + timeconvert, +) class FileDownloader(object): @@ -194,7 +201,7 @@ class FileDownloader(object): if old_filename == new_filename: return os.rename(encodeFilename(old_filename), encodeFilename(new_filename)) - except (IOError, OSError) as err: + except (IOError, OSError): self.report_error(u'unable to rename file') def try_utime(self, filename, last_modified_hdr): @@ -227,8 +234,14 @@ class FileDownloader(object): if self.params.get('noprogress', False): return clear_line = (u'\x1b[K' if sys.stderr.isatty() and os.name != 'nt' else u'') - eta_str = self.format_eta(eta) - percent_str = self.format_percent(percent) + if eta is not None: + eta_str = self.format_eta(eta) + else: + eta_str = 'Unknown ETA' + if percent is not None: + percent_str = self.format_percent(percent) + else: + percent_str = 'Unknown %' speed_str = self.format_speed(speed) if self.params.get('progress_with_newline', False): self.to_screen(u'[download] %s of %s at %s ETA %s' % @@ -251,7 +264,7 @@ class FileDownloader(object): """Report file has already been fully downloaded.""" try: self.to_screen(u'[download] %s has already been downloaded' % file_name) - except (UnicodeEncodeError) as err: + except UnicodeEncodeError: self.to_screen(u'[download] The file has already been downloaded') def report_unable_to_resume(self): @@ -267,7 +280,7 @@ class FileDownloader(object): self.to_screen(u'\r%s[download] 100%% of %s in %s' % (clear_line, data_len_str, self.format_seconds(tot_time))) - def _download_with_rtmpdump(self, filename, url, player_url, page_url, play_path, tc_url): + def _download_with_rtmpdump(self, filename, url, player_url, page_url, play_path, tc_url, live): self.report_destination(filename) tmpfilename = self.temp_name(filename) test = self.params.get('test', False) @@ -294,6 +307,8 @@ class FileDownloader(object): basic_args += ['--tcUrl', url] if test: basic_args += ['--stop', '1'] + if live: + basic_args += ['--live'] args = basic_args + [[], ['--resume', '--skip', '1']][self.params.get('continuedl', False)] if self.params.get('verbose', False): try: @@ -366,15 +381,20 @@ class FileDownloader(object): self.report_destination(filename) tmpfilename = self.temp_name(filename) - args = ['ffmpeg', '-y', '-i', url, '-f', 'mp4', tmpfilename] - # Check for ffmpeg first - try: - subprocess.call(['ffmpeg', '-h'], stdout=(open(os.path.devnull, 'w')), stderr=subprocess.STDOUT) - except (OSError, IOError): - self.report_error(u'm3u8 download detected but "%s" could not be run' % args[0] ) - return False + args = ['-y', '-i', url, '-f', 'mp4', '-c', 'copy', + '-bsf:a', 'aac_adtstoasc', tmpfilename] - retval = subprocess.call(args) + for program in ['avconv', 'ffmpeg']: + try: + subprocess.call([program, '-version'], stdout=(open(os.path.devnull, 'w')), stderr=subprocess.STDOUT) + break + except (OSError, IOError): + pass + else: + self.report_error(u'm3u8 download detected but ffmpeg or avconv could not be found') + cmd = [program] + args + + retval = subprocess.call(cmd) if retval == 0: fsize = os.path.getsize(encodeFilename(tmpfilename)) self.to_screen(u'\r[%s] %s bytes' % (args[0], fsize)) @@ -411,7 +431,8 @@ class FileDownloader(object): info_dict.get('player_url', None), info_dict.get('page_url', None), info_dict.get('play_path', None), - info_dict.get('tc_url', None)) + info_dict.get('tc_url', None), + info_dict.get('rtmp_live', False)) # Attempt to download using mplayer if url.startswith('mms') or url.startswith('rtsp'): @@ -550,12 +571,11 @@ class FileDownloader(object): # Progress message speed = self.calc_speed(start, time.time(), byte_counter - resume_len) if data_len is None: - self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA') - eta = None + eta = percent = None else: percent = self.calc_percent(byte_counter, data_len) eta = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len) - self.report_progress(percent, data_len_str, speed, eta) + self.report_progress(percent, data_len_str, speed, eta) self._hook_progress({ 'downloaded_bytes': byte_counter, diff --git a/youtube_dl/PostProcessor.py b/youtube_dl/PostProcessor.py index 13b56ede5..69aedf87a 100644 --- a/youtube_dl/PostProcessor.py +++ b/youtube_dl/PostProcessor.py @@ -501,7 +501,7 @@ class FFmpegMetadataPP(FFmpegPostProcessor): options = ['-c', 'copy'] for (name, value) in metadata.items(): - options.extend(['-metadata', '%s="%s"' % (name, value)]) + options.extend(['-metadata', '%s=%s' % (name, value)]) options.extend(['-f', ext]) self._downloader.to_screen(u'[ffmpeg] Adding metadata to \'%s\'' % filename) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 86a6fd043..5253c39e1 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -782,7 +782,7 @@ class YoutubeDL(object): def list_formats(self, info_dict): def line(format): - return (u'%-15s%-10s%-12s%s' % ( + return (u'%-20s%-10s%-12s%s' % ( format['format_id'], format['ext'], self.format_resolution(format), diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 48ffcbf8e..254fcd39c 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -32,6 +32,7 @@ __authors__ = ( 'Ismael Mejía', 'Steffan \'Ruirize\' James', 'Andras Elso', + 'Jelle van der Waa', ) __license__ = 'Public Domain' @@ -349,7 +350,7 @@ def parseOpts(overrideArguments=None): 'for example with -o \'/my/downloads/%(uploader)s/%(title)s-%(id)s.%(ext)s\' .')) filesystem.add_option('--autonumber-size', dest='autonumber_size', metavar='NUMBER', - help='Specifies the number of digits in %(autonumber)s when it is present in output filename template or --autonumber option is given') + help='Specifies the number of digits in %(autonumber)s when it is present in output filename template or --auto-number option is given') filesystem.add_option('--restrict-filenames', action='store_true', dest='restrictfilenames', help='Restrict filenames to only ASCII characters, and avoid "&" and spaces in filenames', default=False) @@ -358,7 +359,7 @@ def parseOpts(overrideArguments=None): filesystem.add_option('-w', '--no-overwrites', action='store_true', dest='nooverwrites', help='do not overwrite files', default=False) filesystem.add_option('-c', '--continue', - action='store_true', dest='continue_dl', help='resume partially downloaded files', default=True) + action='store_true', dest='continue_dl', help='force resume of partially downloaded files. By default, youtube-dl will resume downloads if possible.', default=True) filesystem.add_option('--no-continue', action='store_false', dest='continue_dl', help='do not resume partially downloaded files (restart from beginning)') diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 78f84cea3..0594a3666 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -38,6 +38,7 @@ from .defense import DefenseGouvFrIE from .ebaumsworld import EbaumsWorldIE from .ehow import EHowIE from .eighttracks import EightTracksIE +from .eitb import EitbIE from .escapist import EscapistIE from .exfm import ExfmIE from .extremetube import ExtremeTubeIE @@ -56,6 +57,7 @@ from .francetv import ( ) from .freesound import FreesoundIE from .funnyordie import FunnyOrDieIE +from .gamekings import GamekingsIE from .gamespot import GameSpotIE from .gametrailers import GametrailersIE from .generic import GenericIE @@ -115,6 +117,7 @@ from .slideshare import SlideshareIE from .sohu import SohuIE from .soundcloud import SoundcloudIE, SoundcloudSetIE, SoundcloudUserIE from .southparkstudios import SouthParkStudiosIE +from .space import SpaceIE from .spankwire import SpankwireIE from .spiegel import SpiegelIE from .stanfordoc import StanfordOpenClassroomIE diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index e10c74c11..b35a679e3 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -10,6 +10,7 @@ from ..utils import ( unified_strdate, determine_ext, get_element_by_id, + compat_str, ) # There are different sources of video in arte.tv, the extraction process @@ -181,20 +182,30 @@ class ArteTVPlus7IE(InfoExtractor): formats = all_formats else: raise ExtractorError(u'The formats list is empty') - # We order the formats by quality + if re.match(r'[A-Z]Q', formats[0]['quality']) is not None: - sort_key = lambda f: ['HQ', 'MQ', 'EQ', 'SQ'].index(f['quality']) + def sort_key(f): + return ['HQ', 'MQ', 'EQ', 'SQ'].index(f['quality']) else: - sort_key = lambda f: int(f.get('height',-1)) + def sort_key(f): + return ( + # Sort first by quality + int(f.get('height',-1)), + int(f.get('bitrate',-1)), + # The original version with subtitles has lower relevance + re.match(r'VO-ST(F|A)', f.get('versionCode', '')) is None, + # The version with sourds/mal subtitles has also lower relevance + re.match(r'VO?(F|A)-STM\1', f.get('versionCode', '')) is None, + ) formats = sorted(formats, key=sort_key) - # Prefer videos without subtitles in the same language - formats = sorted(formats, key=lambda f: re.match(r'VO(F|A)-STM\1', f.get('versionCode', '')) is None) - # Pick the best quality def _format(format_info): - quality = format_info['quality'] - m_quality = re.match(r'\w*? - (\d*)p', quality) - if m_quality is not None: - quality = m_quality.group(1) + quality = '' + height = format_info.get('height') + if height is not None: + quality = compat_str(height) + bitrate = format_info.get('bitrate') + if bitrate is not None: + quality += '-%d' % bitrate if format_info.get('versionCode') is not None: format_id = u'%s-%s' % (quality, format_info['versionCode']) else: @@ -203,7 +214,7 @@ class ArteTVPlus7IE(InfoExtractor): 'format_id': format_id, 'format_note': format_info.get('versionLibelle'), 'width': format_info.get('width'), - 'height': format_info.get('height'), + 'height': height, } if format_info['mediaType'] == u'rtmp': info['url'] = format_info['streamer'] diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 0d9b87a34..d8c35465a 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -9,10 +9,13 @@ from ..utils import ( compat_urllib_parse, find_xpath_attr, compat_urlparse, + compat_str, + compat_urllib_request, ExtractorError, ) + class BrightcoveIE(InfoExtractor): _VALID_URL = r'https?://.*brightcove\.com/(services|viewer).*\?(?P<query>.*)' _FEDERATED_URL_TEMPLATE = 'http://c.brightcove.com/services/viewer/htmlFederated?%s' @@ -41,6 +44,17 @@ class BrightcoveIE(InfoExtractor): u'uploader': u'Oracle', }, }, + { + # From http://mashable.com/2013/10/26/thermoelectric-bracelet-lets-you-control-your-body-temperature/ + u'url': u'http://c.brightcove.com/services/viewer/federated_f9?&playerID=1265504713001&publisherID=AQ%7E%7E%2CAAABBzUwv1E%7E%2CxP-xFHVUstiMFlNYfvF4G9yFnNaqCw_9&videoID=2750934548001', + u'info_dict': { + u'id': u'2750934548001', + u'ext': u'mp4', + u'title': u'This Bracelet Acts as a Personal Thermostat', + u'description': u'md5:547b78c64f4112766ccf4e151c20b6a0', + u'uploader': u'Mashable', + }, + }, ] @classmethod @@ -68,24 +82,48 @@ class BrightcoveIE(InfoExtractor): videoPlayer = find_xpath_attr(object_doc, './param', 'name', '@videoPlayer') if videoPlayer is not None: params['@videoPlayer'] = videoPlayer.attrib['value'] + linkBase = find_xpath_attr(object_doc, './param', 'name', 'linkBaseURL') + if linkBase is not None: + params['linkBaseURL'] = linkBase.attrib['value'] data = compat_urllib_parse.urlencode(params) return cls._FEDERATED_URL_TEMPLATE % data + @classmethod + def _extract_brightcove_url(cls, webpage): + """Try to extract the brightcove url from the wepbage, returns None + if it can't be found + """ + m_brightcove = re.search( + r'<object[^>]+?class=([\'"])[^>]*?BrightcoveExperience.*?\1.+?</object>', + webpage, re.DOTALL) + if m_brightcove is not None: + return cls._build_brighcove_url(m_brightcove.group()) + else: + return None + def _real_extract(self, url): + # Change the 'videoId' and others field to '@videoPlayer' + url = re.sub(r'(?<=[?&])(videoI(d|D)|bctid)', '%40videoPlayer', url) + # Change bckey (used by bcove.me urls) to playerKey + url = re.sub(r'(?<=[?&])bckey', 'playerKey', url) mobj = re.match(self._VALID_URL, url) query_str = mobj.group('query') query = compat_urlparse.parse_qs(query_str) videoPlayer = query.get('@videoPlayer') if videoPlayer: - return self._get_video_info(videoPlayer[0], query_str) + return self._get_video_info(videoPlayer[0], query_str, query) else: player_key = query['playerKey'] return self._get_playlist_info(player_key[0]) - def _get_video_info(self, video_id, query): - request_url = self._FEDERATED_URL_TEMPLATE % query - webpage = self._download_webpage(request_url, video_id) + def _get_video_info(self, video_id, query_str, query): + request_url = self._FEDERATED_URL_TEMPLATE % query_str + req = compat_urllib_request.Request(request_url) + linkBase = query.get('linkBaseURL') + if linkBase is not None: + req.add_header('Referer', linkBase[0]) + webpage = self._download_webpage(req, video_id) self.report_extraction(video_id) info = self._search_regex(r'var experienceJSON = ({.*?});', webpage, 'json') @@ -109,7 +147,7 @@ class BrightcoveIE(InfoExtractor): def _extract_video_info(self, video_info): info = { - 'id': video_info['id'], + 'id': compat_str(video_info['id']), 'title': video_info['displayName'], 'description': video_info.get('shortDescription'), 'thumbnail': video_info.get('videoStillURL') or video_info.get('thumbnailURL'), @@ -119,10 +157,11 @@ class BrightcoveIE(InfoExtractor): renditions = video_info.get('renditions') if renditions: renditions = sorted(renditions, key=lambda r: r['size']) - best_format = renditions[-1] - info.update({ - 'url': best_format['defaultURL'], - }) + info['formats'] = [{ + 'url': rend['defaultURL'], + 'height': rend.get('frameHeight'), + 'width': rend.get('frameWidth'), + } for rend in renditions] elif video_info.get('FLVFullLengthURL') is not None: info.update({ 'url': video_info['FLVFullLengthURL'], diff --git a/youtube_dl/extractor/cinemassacre.py b/youtube_dl/extractor/cinemassacre.py index 8f9396d6b..f0d08cebf 100644 --- a/youtube_dl/extractor/cinemassacre.py +++ b/youtube_dl/extractor/cinemassacre.py @@ -65,6 +65,7 @@ class CinemassacreIE(InfoExtractor): { 'url': url, 'play_path': 'mp4:' + sd_file, + 'rtmp_live': True, # workaround 'ext': 'flv', 'format': 'sd', 'format_id': 'sd', @@ -72,6 +73,7 @@ class CinemassacreIE(InfoExtractor): { 'url': url, 'play_path': 'mp4:' + hd_file, + 'rtmp_live': True, # workaround 'ext': 'flv', 'format': 'hd', 'format_id': 'hd', diff --git a/youtube_dl/extractor/cnn.py b/youtube_dl/extractor/cnn.py index a79f881cd..34adf6dda 100644 --- a/youtube_dl/extractor/cnn.py +++ b/youtube_dl/extractor/cnn.py @@ -6,7 +6,7 @@ from ..utils import determine_ext class CNNIE(InfoExtractor): - _VALID_URL = r'''(?x)https?://(edition\.)?cnn\.com/video/(data/.+?|\?)/ + _VALID_URL = r'''(?x)https?://((edition|www)\.)?cnn\.com/video/(data/.+?|\?)/ (?P<path>.+?/(?P<title>[^/]+?)(?:\.cnn|(?=&)))''' _TESTS = [{ diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index e0ccba533..9c20d30b4 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -322,6 +322,8 @@ class InfoExtractor(object): if name is None: name = 'OpenGraph %s' % prop escaped = self._search_regex(self._og_regex(prop), html, name, flags=re.DOTALL, **kargs) + if escaped is None: + return None return unescapeHTML(escaped) def _og_search_thumbnail(self, html, **kargs): diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 355b4ed0a..e87690f9d 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -141,9 +141,9 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor): raise ExtractorError(u'Unable to extract video URL') # subtitles - video_subtitles = self.extract_subtitles(video_id) + video_subtitles = self.extract_subtitles(video_id, webpage) if self._downloader.params.get('listsubtitles', False): - self._list_available_subtitles(video_id) + self._list_available_subtitles(video_id, webpage) return return { @@ -157,7 +157,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor): 'age_limit': age_limit, } - def _get_available_subtitles(self, video_id): + def _get_available_subtitles(self, video_id, webpage): try: sub_list = self._download_webpage( 'https://api.dailymotion.com/video/%s/subtitles?fields=id,language,url' % video_id, diff --git a/youtube_dl/extractor/eitb.py b/youtube_dl/extractor/eitb.py new file mode 100644 index 000000000..4ba323148 --- /dev/null +++ b/youtube_dl/extractor/eitb.py @@ -0,0 +1,37 @@ +# encoding: utf-8 +import re + +from .common import InfoExtractor +from .brightcove import BrightcoveIE +from ..utils import ExtractorError + + +class EitbIE(InfoExtractor): + IE_NAME = u'eitb.tv' + _VALID_URL = r'https?://www\.eitb\.tv/(eu/bideoa|es/video)/[^/]+/(?P<playlist_id>\d+)/(?P<chapter_id>\d+)' + + _TEST = { + u'add_ie': ['Brightcove'], + u'url': u'http://www.eitb.tv/es/video/60-minutos-60-minutos-2013-2014/2677100210001/2743577154001/lasa-y-zabala-30-anos/', + u'md5': u'edf4436247185adee3ea18ce64c47998', + u'info_dict': { + u'id': u'2743577154001', + u'ext': u'mp4', + u'title': u'60 minutos (Lasa y Zabala, 30 años)', + # All videos from eitb has this description in the brightcove info + u'description': u'.', + u'uploader': u'Euskal Telebista', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + chapter_id = mobj.group('chapter_id') + webpage = self._download_webpage(url, chapter_id) + bc_url = BrightcoveIE._extract_brightcove_url(webpage) + if bc_url is None: + raise ExtractorError(u'Could not extract the Brightcove url') + # The BrightcoveExperience object doesn't contain the video id, we set + # it manually + bc_url += '&%40videoPlayer={0}'.format(chapter_id) + return self.url_result(bc_url, BrightcoveIE.ie_key()) diff --git a/youtube_dl/extractor/gamekings.py b/youtube_dl/extractor/gamekings.py new file mode 100644 index 000000000..4b4259447 --- /dev/null +++ b/youtube_dl/extractor/gamekings.py @@ -0,0 +1,40 @@ +import re + +from .common import InfoExtractor +from ..utils import ( + determine_ext, +) + + +class GamekingsIE(InfoExtractor): + _VALID_URL = r'http?://www\.gamekings\.tv/videos/(?P<name>[0-9a-z\-]+)' + _TEST = { + u"url": u"http://www.gamekings.tv/videos/phoenix-wright-ace-attorney-dual-destinies-review/", + u'file': u'20130811.mp4', + u'md5': u'17f6088f7d0149ff2b46f2714bdb1954', + u'info_dict': { + u"title": u"Phoenix Wright: Ace Attorney \u2013 Dual Destinies Review", + u"description": u"Melle en Steven hebben voor de review een week in de rechtbank doorbracht met Phoenix Wright: Ace Attorney - Dual Destinies.", + } + } + + def _real_extract(self, url): + + mobj = re.match(self._VALID_URL, url) + name = mobj.group('name') + webpage = self._download_webpage(url, name) + video_url = self._og_search_video_url(webpage) + + video = re.search(r'[0-9]+', video_url) + video_id = video.group(0) + + # Todo: add medium format + video_url = video_url.replace(video_id, 'large/' + video_id) + + return { + 'id': video_id, + 'ext': 'mp4', + 'url': video_url, + 'title': self._og_search_title(webpage), + 'description': self._og_search_description(webpage), + } diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index b3fec8e86..c7552fddb 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -33,6 +33,7 @@ class GenericIE(InfoExtractor): }, # embedded vimeo video { + u'add_ie': ['Vimeo'], u'url': u'http://skillsmatter.com/podcast/home/move-semanticsperfect-forwarding-and-rvalue-references', u'file': u'22444065.mp4', u'md5': u'2903896e23df39722c33f015af0666e2', @@ -44,6 +45,7 @@ class GenericIE(InfoExtractor): }, # bandcamp page with custom domain { + u'add_ie': ['Bandcamp'], u'url': u'http://bronyrock.com/track/the-pony-mash', u'file': u'3235767654.mp3', u'info_dict': { @@ -52,6 +54,23 @@ class GenericIE(InfoExtractor): }, u'skip': u'There is a limit of 200 free downloads / month for the test song', }, + # embedded brightcove video + # it also tests brightcove videos that need to set the 'Referer' in the + # http requests + { + u'add_ie': ['Brightcove'], + u'url': u'http://www.bfmtv.com/video/bfmbusiness/cours-bourse/cours-bourse-l-analyse-technique-154522/', + u'info_dict': { + u'id': u'2765128793001', + u'ext': u'mp4', + u'title': u'Le cours de bourse : l’analyse technique', + u'description': u'md5:7e9ad046e968cb2d1114004aba466fd9', + u'uploader': u'BFM BUSINESS', + }, + u'params': { + u'skip_download': True, + }, + }, ] def report_download_webpage(self, video_id): @@ -144,10 +163,9 @@ class GenericIE(InfoExtractor): self.report_extraction(video_id) # Look for BrightCove: - m_brightcove = re.search(r'<object[^>]+?class=([\'"])[^>]*?BrightcoveExperience.*?\1.+?</object>', webpage, re.DOTALL) - if m_brightcove is not None: + bc_url = BrightcoveIE._extract_brightcove_url(webpage) + if bc_url is not None: self.to_screen(u'Brightcove video detected.') - bc_url = BrightcoveIE._build_brighcove_url(m_brightcove.group()) return self.url_result(bc_url, 'Brightcove') # Look for embedded Vimeo player diff --git a/youtube_dl/extractor/kankan.py b/youtube_dl/extractor/kankan.py index 445d46501..50916f4a6 100644 --- a/youtube_dl/extractor/kankan.py +++ b/youtube_dl/extractor/kankan.py @@ -1,8 +1,10 @@ import re +import hashlib from .common import InfoExtractor from ..utils import determine_ext +_md5 = lambda s: hashlib.md5(s.encode('utf-8')).hexdigest() class KankanIE(InfoExtractor): _VALID_URL = r'https?://(?:.*?\.)?kankan\.com/.+?/(?P<id>\d+)\.shtml' @@ -30,7 +32,10 @@ class KankanIE(InfoExtractor): video_id, u'Downloading video url info') ip = self._search_regex(r'ip:"(.+?)"', video_info_page, u'video url ip') path = self._search_regex(r'path:"(.+?)"', video_info_page, u'video url path') - video_url = 'http://%s%s' % (ip, path) + param1 = self._search_regex(r'param1:(\d+)', video_info_page, u'param1') + param2 = self._search_regex(r'param2:(\d+)', video_info_page, u'param2') + key = _md5('xl_mp43651' + param1 + param2) + video_url = 'http://%s%s?key=%s&key1=%s' % (ip, path, key, param2) return {'id': video_id, 'title': title, diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index e96d3952c..24a79ae13 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -26,6 +26,7 @@ class MTVIE(InfoExtractor): }, }, { + u'add_ie': ['Vevo'], u'url': u'http://www.mtv.com/videos/taylor-swift/916187/everything-has-changed-ft-ed-sheeran.jhtml', u'file': u'USCJY1331283.mp4', u'md5': u'73b4e7fcadd88929292fe52c3ced8caf', diff --git a/youtube_dl/extractor/slashdot.py b/youtube_dl/extractor/slashdot.py index 2cba53076..f5003c7f9 100644 --- a/youtube_dl/extractor/slashdot.py +++ b/youtube_dl/extractor/slashdot.py @@ -7,6 +7,7 @@ class SlashdotIE(InfoExtractor): _VALID_URL = r'https?://tv.slashdot.org/video/\?embed=(?P<id>.*?)(&|$)' _TEST = { + u'add_ie': ['Ooyala'], u'url': u'http://tv.slashdot.org/video/?embed=JscHMzZDplD0p-yNLOzTfzC3Q3xzJaUz', u'file': u'JscHMzZDplD0p-yNLOzTfzC3Q3xzJaUz.mp4', u'md5': u'd2222e7a4a4c1541b3e0cf732fb26735', diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 29cd5617c..4717fbb77 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -29,17 +29,34 @@ class SoundcloudIE(InfoExtractor): ) ''' IE_NAME = u'soundcloud' - _TEST = { - u'url': u'http://soundcloud.com/ethmusic/lostin-powers-she-so-heavy', - u'file': u'62986583.mp3', - u'md5': u'ebef0a451b909710ed1d7787dddbf0d7', - u'info_dict': { - u"upload_date": u"20121011", - u"description": u"No Downloads untill we record the finished version this weekend, i was too pumped n i had to post it , earl is prolly gonna b hella p.o'd", - u"uploader": u"E.T. ExTerrestrial Music", - u"title": u"Lostin Powers - She so Heavy (SneakPreview) Adrian Ackers Blueprint 1" - } - } + _TESTS = [ + { + u'url': u'http://soundcloud.com/ethmusic/lostin-powers-she-so-heavy', + u'file': u'62986583.mp3', + u'md5': u'ebef0a451b909710ed1d7787dddbf0d7', + u'info_dict': { + u"upload_date": u"20121011", + u"description": u"No Downloads untill we record the finished version this weekend, i was too pumped n i had to post it , earl is prolly gonna b hella p.o'd", + u"uploader": u"E.T. ExTerrestrial Music", + u"title": u"Lostin Powers - She so Heavy (SneakPreview) Adrian Ackers Blueprint 1" + } + }, + # not streamable song + { + u'url': u'https://soundcloud.com/the-concept-band/goldrushed-mastered?in=the-concept-band/sets/the-royal-concept-ep', + u'info_dict': { + u'id': u'47127627', + u'ext': u'mp3', + u'title': u'Goldrushed', + u'uploader': u'The Royal Concept', + u'upload_date': u'20120521', + }, + u'params': { + # rtmp + u'skip_download': True, + }, + }, + ] _CLIENT_ID = 'b45b1aa10f1ac2941910a7f0d10f8e28' @@ -56,16 +73,16 @@ class SoundcloudIE(InfoExtractor): return 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=' + cls._CLIENT_ID def _extract_info_dict(self, info, full_title=None, quiet=False): - video_id = info['id'] - name = full_title or video_id + track_id = compat_str(info['id']) + name = full_title or track_id if quiet == False: self.report_extraction(name) thumbnail = info['artwork_url'] if thumbnail is not None: thumbnail = thumbnail.replace('-large', '-t500x500') - return { - 'id': info['id'], + result = { + 'id': track_id, 'url': info['stream_url'] + '?client_id=' + self._CLIENT_ID, 'uploader': info['user']['username'], 'upload_date': unified_strdate(info['created_at']), @@ -74,6 +91,21 @@ class SoundcloudIE(InfoExtractor): 'description': info['description'], 'thumbnail': thumbnail, } + if info.get('downloadable', False): + result['url'] = 'https://api.soundcloud.com/tracks/{0}/download?client_id={1}'.format(track_id, self._CLIENT_ID) + if not info.get('streamable', False): + # We have to get the rtmp url + stream_json = self._download_webpage( + 'http://api.soundcloud.com/i1/tracks/{0}/streams?client_id={1}'.format(track_id, self._CLIENT_ID), + track_id, u'Downloading track url') + rtmp_url = json.loads(stream_json)['rtmp_mp3_128_url'] + # The url doesn't have an rtmp app, we have to extract the playpath + url, path = rtmp_url.split('mp3:', 1) + result.update({ + 'url': url, + 'play_path': 'mp3:' + path, + }) + return result def _real_extract(self, url): mobj = re.match(self._VALID_URL, url, flags=re.VERBOSE) @@ -106,70 +138,8 @@ class SoundcloudIE(InfoExtractor): class SoundcloudSetIE(SoundcloudIE): _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)(?:[?].*)?$' IE_NAME = u'soundcloud:set' - _TEST = { - u"url":"https://soundcloud.com/the-concept-band/sets/the-royal-concept-ep", - u"playlist": [ - { - u"file":"30510138.mp3", - u"md5":"f9136bf103901728f29e419d2c70f55d", - u"info_dict": { - u"upload_date": u"20111213", - u"description": u"The Royal Concept from Stockholm\r\nFilip / Povel / David / Magnus\r\nwww.royalconceptband.com", - u"uploader": u"The Royal Concept", - u"title": u"D-D-Dance" - } - }, - { - u"file":"47127625.mp3", - u"md5":"09b6758a018470570f8fd423c9453dd8", - u"info_dict": { - u"upload_date": u"20120521", - u"description": u"The Royal Concept from Stockholm\r\nFilip / Povel / David / Magnus\r\nwww.royalconceptband.com", - u"uploader": u"The Royal Concept", - u"title": u"The Royal Concept - Gimme Twice" - } - }, - { - u"file":"47127627.mp3", - u"md5":"154abd4e418cea19c3b901f1e1306d9c", - u"info_dict": { - u"upload_date": u"20120521", - u"uploader": u"The Royal Concept", - u"title": u"Goldrushed" - } - }, - { - u"file":"47127629.mp3", - u"md5":"2f5471edc79ad3f33a683153e96a79c1", - u"info_dict": { - u"upload_date": u"20120521", - u"description": u"The Royal Concept from Stockholm\r\nFilip / Povel / David / Magnus\r\nwww.royalconceptband.com", - u"uploader": u"The Royal Concept", - u"title": u"In the End" - } - }, - { - u"file":"47127631.mp3", - u"md5":"f9ba87aa940af7213f98949254f1c6e2", - u"info_dict": { - u"upload_date": u"20120521", - u"description": u"The Royal Concept from Stockholm\r\nFilip / David / Povel / Magnus\r\nwww.theroyalconceptband.com", - u"uploader": u"The Royal Concept", - u"title": u"Knocked Up" - } - }, - { - u"file":"75206121.mp3", - u"md5":"f9d1fe9406717e302980c30de4af9353", - u"info_dict": { - u"upload_date": u"20130116", - u"description": u"The unreleased track World on Fire premiered on the CW's hit show Arrow (8pm/7pm central). \r\nAs a gift to our fans we would like to offer you a free download of the track! ", - u"uploader": u"The Royal Concept", - u"title": u"World On Fire" - } - } - ] - } + # it's in tests/test_playlists.py + _TESTS = [] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -208,7 +178,7 @@ class SoundcloudUserIE(SoundcloudIE): IE_NAME = u'soundcloud:user' # it's in tests/test_playlists.py - _TEST = None + _TESTS = [] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) diff --git a/youtube_dl/extractor/space.py b/youtube_dl/extractor/space.py new file mode 100644 index 000000000..0d32a0688 --- /dev/null +++ b/youtube_dl/extractor/space.py @@ -0,0 +1,35 @@ +import re + +from .common import InfoExtractor +from .brightcove import BrightcoveIE +from ..utils import RegexNotFoundError, ExtractorError + + +class SpaceIE(InfoExtractor): + _VALID_URL = r'https?://www\.space\.com/\d+-(?P<title>[^/\.\?]*?)-video.html' + _TEST = { + u'add_ie': ['Brightcove'], + u'url': u'http://www.space.com/23373-huge-martian-landforms-detail-revealed-by-european-probe-video.html', + u'info_dict': { + u'id': u'2780937028001', + u'ext': u'mp4', + u'title': u'Huge Martian Landforms\' Detail Revealed By European Probe | Video', + u'description': u'md5:db81cf7f3122f95ed234b631a6ea1e61', + u'uploader': u'TechMedia Networks', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + title = mobj.group('title') + webpage = self._download_webpage(url, title) + try: + # Some videos require the playerKey field, which isn't define in + # the BrightcoveExperience object + brightcove_url = self._og_search_video_url(webpage) + except RegexNotFoundError: + # Other videos works fine with the info from the object + brightcove_url = BrightcoveIE._extract_brightcove_url(webpage) + if brightcove_url is None: + raise ExtractorError(u'The webpage does not contain a video', expected=True) + return self.url_result(brightcove_url, BrightcoveIE.ie_key()) diff --git a/youtube_dl/extractor/subtitles.py b/youtube_dl/extractor/subtitles.py index 90de7de3a..4b4c5235d 100644 --- a/youtube_dl/extractor/subtitles.py +++ b/youtube_dl/extractor/subtitles.py @@ -12,9 +12,9 @@ class SubtitlesInfoExtractor(InfoExtractor): return any([self._downloader.params.get('writesubtitles', False), self._downloader.params.get('writeautomaticsub')]) - def _list_available_subtitles(self, video_id, webpage=None): + def _list_available_subtitles(self, video_id, webpage): """ outputs the available subtitles for the video """ - sub_lang_list = self._get_available_subtitles(video_id) + sub_lang_list = self._get_available_subtitles(video_id, webpage) auto_captions_list = self._get_available_automatic_caption(video_id, webpage) sub_lang = ",".join(list(sub_lang_list.keys())) self.to_screen(u'%s: Available subtitles for video: %s' % @@ -23,7 +23,7 @@ class SubtitlesInfoExtractor(InfoExtractor): self.to_screen(u'%s: Available automatic captions for video: %s' % (video_id, auto_lang)) - def extract_subtitles(self, video_id, video_webpage=None): + def extract_subtitles(self, video_id, webpage): """ returns {sub_lang: sub} ,{} if subtitles not found or None if the subtitles aren't requested. @@ -32,9 +32,9 @@ class SubtitlesInfoExtractor(InfoExtractor): return None available_subs_list = {} if self._downloader.params.get('writeautomaticsub', False): - available_subs_list.update(self._get_available_automatic_caption(video_id, video_webpage)) + available_subs_list.update(self._get_available_automatic_caption(video_id, webpage)) if self._downloader.params.get('writesubtitles', False): - available_subs_list.update(self._get_available_subtitles(video_id)) + available_subs_list.update(self._get_available_subtitles(video_id, webpage)) if not available_subs_list: # error, it didn't get the available subtitles return {} @@ -74,7 +74,7 @@ class SubtitlesInfoExtractor(InfoExtractor): return return sub - def _get_available_subtitles(self, video_id): + def _get_available_subtitles(self, video_id, webpage): """ returns {sub_lang: url} or {} if not available Must be redefined by the subclasses diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py index dfa1176a3..76cfdfb90 100644 --- a/youtube_dl/extractor/ted.py +++ b/youtube_dl/extractor/ted.py @@ -1,10 +1,14 @@ import json import re -from .common import InfoExtractor +from .subtitles import SubtitlesInfoExtractor +from ..utils import ( + compat_str, + RegexNotFoundError, +) -class TEDIE(InfoExtractor): +class TEDIE(SubtitlesInfoExtractor): _VALID_URL=r'''http://www\.ted\.com/ ( ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist @@ -32,7 +36,7 @@ class TEDIE(InfoExtractor): def _real_extract(self, url): m=re.match(self._VALID_URL, url, re.VERBOSE) if m.group('type_talk'): - return [self._talk_info(url)] + return self._talk_info(url) else : playlist_id=m.group('playlist_id') name=m.group('name') @@ -82,11 +86,21 @@ class TEDIE(InfoExtractor): 'url': stream['file'], 'format': stream['id'] } for stream in info['htmlStreams']] + + video_id = info['id'] + + # subtitles + video_subtitles = self.extract_subtitles(video_id, webpage) + if self._downloader.params.get('listsubtitles', False): + self._list_available_subtitles(video_id, webpage) + return + info = { - 'id': info['id'], + 'id': video_id, 'title': title, 'thumbnail': thumbnail, 'description': desc, + 'subtitles': video_subtitles, 'formats': formats, } @@ -94,3 +108,17 @@ class TEDIE(InfoExtractor): info.update(info['formats'][-1]) return info + + def _get_available_subtitles(self, video_id, webpage): + try: + options = self._search_regex(r'(?:<select name="subtitles_language_select" id="subtitles_language_select">)(.*?)(?:</select>)', webpage, 'subtitles_language_select', flags=re.DOTALL) + languages = re.findall(r'(?:<option value=")(\S+)"', options) + if languages: + sub_lang_list = {} + for l in languages: + url = 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/srt' % (video_id, l) + sub_lang_list[l] = url + return sub_lang_list + except RegexNotFoundError as err: + self._downloader.report_warning(u'video doesn\'t have subtitles') + return {} diff --git a/youtube_dl/extractor/vine.py b/youtube_dl/extractor/vine.py index c4ec1f06f..651ba317d 100644 --- a/youtube_dl/extractor/vine.py +++ b/youtube_dl/extractor/vine.py @@ -27,7 +27,7 @@ class VineIE(InfoExtractor): video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"', webpage, u'video URL') - uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>', + uploader = self._html_search_regex(r'<p class="username">(.*?)</p>', webpage, u'uploader', fatal=False, flags=re.DOTALL) return [{ diff --git a/youtube_dl/extractor/weibo.py b/youtube_dl/extractor/weibo.py index 0757495bd..fa784ab99 100644 --- a/youtube_dl/extractor/weibo.py +++ b/youtube_dl/extractor/weibo.py @@ -13,6 +13,7 @@ class WeiboIE(InfoExtractor): _VALID_URL = r'https?://video\.weibo\.com/v/weishipin/t_(?P<id>.+?)\.htm' _TEST = { + u'add_ie': ['Sina'], u'url': u'http://video.weibo.com/v/weishipin/t_zjUw2kZ.htm', u'file': u'98322879.flv', u'info_dict': { diff --git a/youtube_dl/extractor/xnxx.py b/youtube_dl/extractor/xnxx.py index 8a0eb1afd..1177a4b14 100644 --- a/youtube_dl/extractor/xnxx.py +++ b/youtube_dl/extractor/xnxx.py @@ -9,7 +9,7 @@ from ..utils import ( class XNXXIE(InfoExtractor): - _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)' + _VALID_URL = r'^(?:https?://)?(?:video|www)\.xnxx\.com/video([0-9]+)/(.*)' VIDEO_URL_RE = r'flv_url=(.*?)&' VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM' VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&' diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 74a381fe2..c992cba97 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1082,7 +1082,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): else: raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s))) - def _get_available_subtitles(self, video_id): + def _get_available_subtitles(self, video_id, webpage): try: sub_list = self._download_webpage( 'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id, @@ -1572,7 +1572,6 @@ class YoutubePlaylistIE(InfoExtractor): class YoutubeChannelIE(InfoExtractor): IE_DESC = u'YouTube.com channels' _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)" - _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en' _MORE_PAGES_INDICATOR = 'yt-uix-load-more' _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s' IE_NAME = u'youtube:channel' @@ -1593,30 +1592,20 @@ class YoutubeChannelIE(InfoExtractor): # Download channel page channel_id = mobj.group(1) video_ids = [] - pagenum = 1 - url = self._TEMPLATE_URL % (channel_id, pagenum) - page = self._download_webpage(url, channel_id, - u'Downloading page #%s' % pagenum) + # Download all channel pages using the json-based channel_ajax query + for pagenum in itertools.count(1): + url = self._MORE_PAGES_URL % (pagenum, channel_id) + page = self._download_webpage(url, channel_id, + u'Downloading page #%s' % pagenum) - # Extract video identifiers - ids_in_page = self.extract_videos_from_page(page) - video_ids.extend(ids_in_page) + page = json.loads(page) - # Download any subsequent channel pages using the json-based channel_ajax query - if self._MORE_PAGES_INDICATOR in page: - for pagenum in itertools.count(1): - url = self._MORE_PAGES_URL % (pagenum, channel_id) - page = self._download_webpage(url, channel_id, - u'Downloading page #%s' % pagenum) - - page = json.loads(page) - - ids_in_page = self.extract_videos_from_page(page['content_html']) - video_ids.extend(ids_in_page) + ids_in_page = self.extract_videos_from_page(page['content_html']) + video_ids.extend(ids_in_page) - if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']: - break + if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']: + break self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids))) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index cc0f9cb4e..338e7ba1f 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.11.03' +__version__ = '2013.11.11' |