diff options
Diffstat (limited to 'youtube_dl')
25 files changed, 424 insertions, 163 deletions
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 806e7b239..24d6c2de7 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -63,6 +63,7 @@ from .utils import ( YoutubeDLHandler, prepend_extension, args_to_str, + age_restricted, ) from .cache import Cache from .extractor import get_info_extractor, gen_extractors @@ -550,13 +551,8 @@ class YoutubeDL(object): max_views = self.params.get('max_views') if max_views is not None and view_count > max_views: return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views) - age_limit = self.params.get('age_limit') - if age_limit is not None: - actual_age_limit = info_dict.get('age_limit') - if actual_age_limit is None: - actual_age_limit = 0 - if age_limit < actual_age_limit: - return 'Skipping "' + title + '" because it is age restricted' + if age_restricted(info_dict.get('age_limit'), self.params.get('age_limit')): + return 'Skipping "%s" because it is age restricted' % title if self.in_download_archive(info_dict): return '%s has already been recorded in archive' % video_title return None @@ -790,7 +786,7 @@ class YoutubeDL(object): if video_formats: return video_formats[0] else: - extensions = ['mp4', 'flv', 'webm', '3gp', 'm4a'] + extensions = ['mp4', 'flv', 'webm', '3gp', 'm4a', 'mp3', 'ogg', 'aac', 'wav'] if format_spec in extensions: filter_f = lambda f: f['ext'] == format_spec else: diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 56f560d26..4aa7fba6a 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -38,7 +38,7 @@ from .update import update_self from .downloader import ( FileDownloader, ) -from .extractor import gen_extractors +from .extractor import gen_extractors, list_extractors from .YoutubeDL import YoutubeDL @@ -95,17 +95,15 @@ def _real_main(argv=None): _enc = preferredencoding() all_urls = [url.decode(_enc, 'ignore') if isinstance(url, bytes) else url for url in all_urls] - extractors = gen_extractors() - if opts.list_extractors: - for ie in sorted(extractors, key=lambda ie: ie.IE_NAME.lower()): + for ie in list_extractors(opts.age_limit): compat_print(ie.IE_NAME + (' (CURRENTLY BROKEN)' if not ie._WORKING else '')) matchedUrls = [url for url in all_urls if ie.suitable(url)] for mu in matchedUrls: compat_print(' ' + mu) sys.exit(0) if opts.list_extractor_descriptions: - for ie in sorted(extractors, key=lambda ie: ie.IE_NAME.lower()): + for ie in list_extractors(opts.age_limit): if not ie._WORKING: continue desc = getattr(ie, 'IE_DESC', ie.IE_NAME) @@ -365,3 +363,5 @@ def main(argv=None): sys.exit('ERROR: fixed output name but more than one file to download') except KeyboardInterrupt: sys.exit('\nERROR: Interrupted by user') + +__all__ = ['main', 'YoutubeDL', 'gen_extractors', 'list_extractors'] diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 8e47bd60d..8dacc2c54 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -159,6 +159,7 @@ from .gametrailers import GametrailersIE from .gdcvault import GDCVaultIE from .generic import GenericIE from .giantbomb import GiantBombIE +from .giga import GigaIE from .glide import GlideIE from .globo import GloboIE from .godtube import GodTubeIE @@ -574,6 +575,17 @@ def gen_extractors(): return [klass() for klass in _ALL_CLASSES] +def list_extractors(age_limit): + """ + Return a list of extractors that are suitable for the given age, + sorted by extractor ID. + """ + + return sorted( + filter(lambda ie: ie.is_suitable(age_limit), gen_extractors()), + key=lambda ie: ie.IE_NAME.lower()) + + def get_info_extractor(ie_name): """Returns the info extractor class with the given ie_name""" return globals()[ie_name + 'IE'] diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index 241b904a9..75d744852 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -4,9 +4,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_parse_qs from ..utils import ( - ExtractorError, int_or_none, unified_strdate, ) @@ -54,45 +52,38 @@ class BiliBiliIE(InfoExtractor): thumbnail = self._html_search_meta( 'thumbnailUrl', video_code, 'thumbnail', fatal=False) - player_params = compat_parse_qs(self._html_search_regex( - r'<iframe .*?class="player" src="https://secure\.bilibili\.(?:tv|com)/secure,([^"]+)"', - webpage, 'player params')) + cid = self._search_regex(r'cid=(\d+)', webpage, 'cid') - if 'cid' in player_params: - cid = player_params['cid'][0] + lq_doc = self._download_xml( + 'http://interface.bilibili.com/v_cdn_play?appkey=1&cid=%s' % cid, + video_id, + note='Downloading LQ video info' + ) + lq_durl = lq_doc.find('./durl') + formats = [{ + 'format_id': 'lq', + 'quality': 1, + 'url': lq_durl.find('./url').text, + 'filesize': int_or_none( + lq_durl.find('./size'), get_attr='text'), + }] - lq_doc = self._download_xml( - 'http://interface.bilibili.cn/v_cdn_play?cid=%s' % cid, - video_id, - note='Downloading LQ video info' - ) - lq_durl = lq_doc.find('.//durl') - formats = [{ - 'format_id': 'lq', - 'quality': 1, - 'url': lq_durl.find('./url').text, + hq_doc = self._download_xml( + 'http://interface.bilibili.com/playurl?appkey=1&cid=%s' % cid, + video_id, + note='Downloading HQ video info', + fatal=False, + ) + if hq_doc is not False: + hq_durl = hq_doc.find('./durl') + formats.append({ + 'format_id': 'hq', + 'quality': 2, + 'ext': 'flv', + 'url': hq_durl.find('./url').text, 'filesize': int_or_none( - lq_durl.find('./size'), get_attr='text'), - }] - - hq_doc = self._download_xml( - 'http://interface.bilibili.cn/playurl?cid=%s' % cid, - video_id, - note='Downloading HQ video info', - fatal=False, - ) - if hq_doc is not False: - hq_durl = hq_doc.find('.//durl') - formats.append({ - 'format_id': 'hq', - 'quality': 2, - 'ext': 'flv', - 'url': hq_durl.find('./url').text, - 'filesize': int_or_none( - hq_durl.find('./size'), get_attr='text'), - }) - else: - raise ExtractorError('Unsupported player parameters: %r' % (player_params,)) + hq_durl.find('./size'), get_attr='text'), + }) self._sort_formats(formats) return { diff --git a/youtube_dl/extractor/ceskatelevize.py b/youtube_dl/extractor/ceskatelevize.py index ba8376338..f70e090bb 100644 --- a/youtube_dl/extractor/ceskatelevize.py +++ b/youtube_dl/extractor/ceskatelevize.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals import re -from .common import InfoExtractor +from .subtitles import SubtitlesInfoExtractor from ..compat import ( compat_urllib_request, compat_urllib_parse, @@ -15,7 +15,7 @@ from ..utils import ( ) -class CeskaTelevizeIE(InfoExtractor): +class CeskaTelevizeIE(SubtitlesInfoExtractor): _VALID_URL = r'https?://www\.ceskatelevize\.cz/(porady|ivysilani)/(.+/)?(?P<id>[^?#]+)' _TESTS = [ @@ -104,6 +104,17 @@ class CeskaTelevizeIE(InfoExtractor): duration = float_or_none(item.get('duration')) thumbnail = item.get('previewImageUrl') + subtitles = {} + subs = item.get('subtitles') + if subs: + subtitles['cs'] = subs[0]['url'] + + if self._downloader.params.get('listsubtitles', False): + self._list_available_subtitles(video_id, subtitles) + return + + subtitles = self._fix_subtitles(self.extract_subtitles(video_id, subtitles)) + return { 'id': episode_id, 'title': title, @@ -111,4 +122,34 @@ class CeskaTelevizeIE(InfoExtractor): 'thumbnail': thumbnail, 'duration': duration, 'formats': formats, + 'subtitles': subtitles, } + + @staticmethod + def _fix_subtitles(subtitles): + """ Convert millisecond-based subtitles to SRT """ + if subtitles is None: + return subtitles # subtitles not requested + + def _msectotimecode(msec): + """ Helper utility to convert milliseconds to timecode """ + components = [] + for divider in [1000, 60, 60, 100]: + components.append(msec % divider) + msec //= divider + return "{3:02}:{2:02}:{1:02},{0:03}".format(*components) + + def _fix_subtitle(subtitle): + for line in subtitle.splitlines(): + m = re.match(r"^\s*([0-9]+);\s*([0-9]+)\s+([0-9]+)\s*$", line) + if m: + yield m.group(1) + start, stop = (_msectotimecode(int(t)) for t in m.groups()[1:]) + yield "{0} --> {1}".format(start, stop) + else: + yield line + + fixed_subtitles = {} + for k, v in subtitles.items(): + fixed_subtitles[k] = "\r\n".join(_fix_subtitle(v)) + return fixed_subtitles diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 562e656e0..d703893dc 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -21,6 +21,7 @@ from ..compat import ( compat_str, ) from ..utils import ( + age_restricted, clean_html, compiled_regex_type, ExtractorError, @@ -593,7 +594,7 @@ class InfoExtractor(object): return self._html_search_regex( r'''(?isx)<meta (?=[^>]+(?:itemprop|name|property)=(["\']?)%s\1) - [^>]+content=(["\'])(?P<content>.*?)\1''' % re.escape(name), + [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(name), html, display_name, fatal=fatal, group='content', **kwargs) def _dc_search_uploader(self, html): @@ -877,6 +878,35 @@ class InfoExtractor(object): None, '/', True, False, expire_time, '', None, None, None) self._downloader.cookiejar.set_cookie(cookie) + def get_testcases(self, include_onlymatching=False): + t = getattr(self, '_TEST', None) + if t: + assert not hasattr(self, '_TESTS'), \ + '%s has _TEST and _TESTS' % type(self).__name__ + tests = [t] + else: + tests = getattr(self, '_TESTS', []) + for t in tests: + if not include_onlymatching and t.get('only_matching', False): + continue + t['name'] = type(self).__name__[:-len('IE')] + yield t + + def is_suitable(self, age_limit): + """ Test whether the extractor is generally suitable for the given + age limit (i.e. pornographic sites are not, all others usually are) """ + + any_restricted = False + for tc in self.get_testcases(include_onlymatching=False): + if 'playlist' in tc: + tc = tc['playlist'][0] + is_restricted = age_restricted( + tc.get('info_dict', {}).get('age_limit'), age_limit) + if not is_restricted: + return True + any_restricted = any_restricted or is_restricted + return not any_restricted + class SearchInfoExtractor(InfoExtractor): """ diff --git a/youtube_dl/extractor/fktv.py b/youtube_dl/extractor/fktv.py index d09d1c13a..190d9f9ad 100644 --- a/youtube_dl/extractor/fktv.py +++ b/youtube_dl/extractor/fktv.py @@ -13,7 +13,7 @@ from ..utils import ( class FKTVIE(InfoExtractor): IE_NAME = 'fernsehkritik.tv' - _VALID_URL = r'http://(?:www\.)?fernsehkritik\.tv/folge-(?P<ep>[0-9]+)(?:/.*)?' + _VALID_URL = r'http://(?:www\.)?fernsehkritik\.tv/folge-(?P<id>[0-9]+)(?:/.*)?' _TEST = { 'url': 'http://fernsehkritik.tv/folge-1', @@ -26,29 +26,32 @@ class FKTVIE(InfoExtractor): } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - episode = int(mobj.group('ep')) + episode = int(self._match_id(url)) - server = random.randint(2, 4) - video_thumbnail = 'http://fernsehkritik.tv/images/magazin/folge%d.jpg' % episode - start_webpage = self._download_webpage('http://fernsehkritik.tv/folge-%d/Start' % episode, + video_thumbnail = 'http://fernsehkritik.tv/images/magazin/folge%s.jpg' % episode + start_webpage = self._download_webpage('http://fernsehkritik.tv/folge-%s/Start' % episode, episode) playlist = self._search_regex(r'playlist = (\[.*?\]);', start_webpage, 'playlist', flags=re.DOTALL) files = json.loads(re.sub('{[^{}]*?}', '{}', playlist)) - # TODO: return a single multipart video + videos = [] for i, _ in enumerate(files, 1): video_id = '%04d%d' % (episode, i) - video_url = 'http://dl%d.fernsehkritik.tv/fernsehkritik%d%s.flv' % (server, episode, '' if i == 1 else '-%d' % i) + video_url = 'http://fernsehkritik.tv/js/directme.php?file=%s%s.flv' % (episode, '' if i == 1 else '-%d' % i) videos.append({ + 'ext': 'flv', 'id': video_id, 'url': video_url, 'title': clean_html(get_element_by_id('eptitle', start_webpage)), 'description': clean_html(get_element_by_id('contentlist', start_webpage)), 'thumbnail': video_thumbnail }) - return videos + return { + '_type': 'multi_video', + 'entries': videos, + 'id': 'folge-%s' % episode, + } class FKTVPosteckeIE(InfoExtractor): diff --git a/youtube_dl/extractor/gameone.py b/youtube_dl/extractor/gameone.py index 75f180928..a07d69841 100644 --- a/youtube_dl/extractor/gameone.py +++ b/youtube_dl/extractor/gameone.py @@ -57,8 +57,7 @@ class GameOneIE(InfoExtractor): ] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) og_video = self._og_search_video_url(webpage, secure=False) diff --git a/youtube_dl/extractor/giga.py b/youtube_dl/extractor/giga.py new file mode 100644 index 000000000..775890112 --- /dev/null +++ b/youtube_dl/extractor/giga.py @@ -0,0 +1,101 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import itertools + +from .common import InfoExtractor +from ..utils import ( + qualities, + compat_str, + parse_duration, + parse_iso8601, + str_to_int, +) + + +class GigaIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?giga\.de/(?:[^/]+/)*(?P<id>[^/]+)' + _TESTS = [{ + 'url': 'http://www.giga.de/filme/anime-awesome/trailer/anime-awesome-chihiros-reise-ins-zauberland-das-beste-kommt-zum-schluss/', + 'md5': '6bc5535e945e724640664632055a584f', + 'info_dict': { + 'id': '2622086', + 'display_id': 'anime-awesome-chihiros-reise-ins-zauberland-das-beste-kommt-zum-schluss', + 'ext': 'mp4', + 'title': 'Anime Awesome: Chihiros Reise ins Zauberland – Das Beste kommt zum Schluss', + 'description': 'md5:afdf5862241aded4718a30dff6a57baf', + 'thumbnail': 're:^https?://.*\.jpg$', + 'duration': 578, + 'timestamp': 1414749706, + 'upload_date': '20141031', + 'uploader': 'Robin Schweiger', + 'view_count': int, + }, + }, { + 'url': 'http://www.giga.de/games/channel/giga-top-montag/giga-topmontag-die-besten-serien-2014/', + 'only_matching': True, + }, { + 'url': 'http://www.giga.de/extra/netzkultur/videos/giga-games-tom-mats-robin-werden-eigene-wege-gehen-eine-ankuendigung/', + 'only_matching': True, + }, { + 'url': 'http://www.giga.de/tv/jonas-liest-spieletitel-eingedeutscht-episode-2/', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + video_id = self._search_regex( + [r'data-video-id="(\d+)"', r'/api/video/jwplayer/#v=(\d+)'], + webpage, 'video id') + + playlist = self._download_json( + 'http://www.giga.de/api/syndication/video/video_id/%s/playlist.json?content=syndication/key/368b5f151da4ae05ced7fa296bdff65a/' + % video_id, video_id)[0] + + quality = qualities(['normal', 'hd720']) + + formats = [] + for format_id in itertools.count(0): + fmt = playlist.get(compat_str(format_id)) + if not fmt: + break + formats.append({ + 'url': fmt['src'], + 'format_id': '%s-%s' % (fmt['quality'], fmt['type'].split('/')[-1]), + 'quality': quality(fmt['quality']), + }) + self._sort_formats(formats) + + title = self._html_search_meta( + 'title', webpage, 'title', fatal=True) + description = self._html_search_meta( + 'description', webpage, 'description') + thumbnail = self._og_search_thumbnail(webpage) + + duration = parse_duration(self._search_regex( + r'(?s)(?:data-video-id="{0}"|data-video="[^"]*/api/video/jwplayer/#v={0}[^"]*")[^>]*>.+?<span class="duration">([^<]+)</span>'.format(video_id), + webpage, 'duration', fatal=False)) + + timestamp = parse_iso8601(self._search_regex( + r'datetime="([^"]+)"', webpage, 'upload date', fatal=False)) + uploader = self._search_regex( + r'class="author">([^<]+)</a>', webpage, 'uploader', fatal=False) + + view_count = str_to_int(self._search_regex( + r'<span class="views"><strong>([\d.]+)</strong>', webpage, 'view count', fatal=False)) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'timestamp': timestamp, + 'uploader': uploader, + 'view_count': view_count, + 'formats': formats, + } diff --git a/youtube_dl/extractor/huffpost.py b/youtube_dl/extractor/huffpost.py index 4ccf6b9b8..a38eae421 100644 --- a/youtube_dl/extractor/huffpost.py +++ b/youtube_dl/extractor/huffpost.py @@ -39,8 +39,9 @@ class HuffPostIE(InfoExtractor): data = self._download_json(api_url, video_id)['data'] video_title = data['title'] - duration = parse_duration(data['running_time']) - upload_date = unified_strdate(data['schedule']['starts_at']) + duration = parse_duration(data.get('running_time')) + upload_date = unified_strdate( + data.get('schedule', {}).get('starts_at') or data.get('segment_start_date_time')) description = data.get('description') thumbnails = [] @@ -59,16 +60,11 @@ class HuffPostIE(InfoExtractor): 'ext': 'mp4', 'url': url, 'vcodec': 'none' if key.startswith('audio/') else None, - } for key, url in data['sources']['live'].items()] - if data.get('fivemin_id'): - fid = data['fivemin_id'] - fcat = str(int(fid) // 100 + 1) - furl = 'http://avideos.5min.com/2/' + fcat[-3:] + '/' + fcat + '/' + fid + '.mp4' - formats.append({ - 'format': 'fivemin', - 'url': furl, - 'preference': 1, - }) + } for key, url in data.get('sources', {}).get('live', {}).items()] + + if not formats and data.get('fivemin_id'): + return self.url_result('5min:%s' % data['fivemin_id']) + self._sort_formats(formats) return { diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py index 13a53a0cb..f29df36b5 100644 --- a/youtube_dl/extractor/imdb.py +++ b/youtube_dl/extractor/imdb.py @@ -16,7 +16,6 @@ class ImdbIE(InfoExtractor): _TEST = { 'url': 'http://www.imdb.com/video/imdb/vi2524815897', - 'md5': '9f34fa777ade3a6e57a054fdbcb3a068', 'info_dict': { 'id': '2524815897', 'ext': 'mp4', diff --git a/youtube_dl/extractor/mit.py b/youtube_dl/extractor/mit.py index 78787e8f1..3c61a850f 100644 --- a/youtube_dl/extractor/mit.py +++ b/youtube_dl/extractor/mit.py @@ -105,6 +105,9 @@ class OCWMITIE(InfoExtractor): 'ext': 'mp4', 'title': 'Lecture 7: Multiple Discrete Random Variables: Expectations, Conditioning, Independence', 'description': 'In this lecture, the professor discussed multiple random variables, expectations, and binomial distribution.', + 'upload_date': '20121109', + 'uploader_id': 'MIT', + 'uploader': 'MIT OpenCourseWare', # 'subtitles': 'http://ocw.mit.edu/courses/electrical-engineering-and-computer-science/6-041-probabilistic-systems-analysis-and-applied-probability-fall-2010/video-lectures/lecture-7-multiple-variables-expectations-independence/MIT6_041F11_lec07_300k.mp4.srt' } }, @@ -114,6 +117,9 @@ class OCWMITIE(InfoExtractor): 'id': '7K1sB05pE0A', 'ext': 'mp4', 'title': 'Session 1: Introduction to Derivatives', + 'upload_date': '20090818', + 'uploader_id': 'MIT', + 'uploader': 'MIT OpenCourseWare', 'description': 'This section contains lecture video excerpts, lecture notes, an interactive mathlet with supporting documents, and problem solving videos.', # 'subtitles': 'http://ocw.mit.edu//courses/mathematics/18-01sc-single-variable-calculus-fall-2010/ocw-18.01-f07-lec01_300k.SRT' } diff --git a/youtube_dl/extractor/motorsport.py b/youtube_dl/extractor/motorsport.py index f5ca74e97..c1a482dba 100644 --- a/youtube_dl/extractor/motorsport.py +++ b/youtube_dl/extractor/motorsport.py @@ -1,63 +1,49 @@ # coding: utf-8 from __future__ import unicode_literals -import hashlib -import json -import time - from .common import InfoExtractor from ..compat import ( - compat_parse_qs, - compat_str, -) -from ..utils import ( - int_or_none, + compat_urlparse, ) class MotorsportIE(InfoExtractor): IE_DESC = 'motorsport.com' - _VALID_URL = r'http://www\.motorsport\.com/[^/?#]+/video/(?:[^/?#]+/)(?P<id>[^/]+)/(?:$|[?#])' + _VALID_URL = r'http://www\.motorsport\.com/[^/?#]+/video/(?:[^/?#]+/)(?P<id>[^/]+)/?(?:$|[?#])' _TEST = { 'url': 'http://www.motorsport.com/f1/video/main-gallery/red-bull-racing-2014-rules-explained/', - 'md5': '5592cb7c5005d9b2c163df5ac3dc04e4', 'info_dict': { - 'id': '7063', + 'id': '2-T3WuR-KMM', 'ext': 'mp4', 'title': 'Red Bull Racing: 2014 Rules Explained', - 'duration': 207, + 'duration': 208, 'description': 'A new clip from Red Bull sees Daniel Ricciardo and Sebastian Vettel explain the 2014 Formula One regulations – which are arguably the most complex the sport has ever seen.', - 'uploader': 'rainiere', - 'thumbnail': r're:^http://.*motorsport\.com/.+\.jpg$' - } + 'uploader': 'mcomstaff', + 'uploader_id': 'UC334JIYKkVnyFoNCclfZtHQ', + 'upload_date': '20140903', + 'thumbnail': r're:^https?://.+\.jpg$' + }, + 'add_ie': ['Youtube'], + 'params': { + 'skip_download': True, + }, } def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - flashvars_code = self._html_search_regex( - r'<embed id="player".*?flashvars="([^"]+)"', webpage, 'flashvars') - flashvars = compat_parse_qs(flashvars_code) - params = json.loads(flashvars['parameters'][0]) - - e = compat_str(int(time.time()) + 24 * 60 * 60) - base_video_url = params['location'] + '?e=' + e - s = 'h3hg713fh32' - h = hashlib.md5((s + base_video_url).encode('utf-8')).hexdigest() - video_url = base_video_url + '&h=' + h - - uploader = self._html_search_regex( - r'(?s)<span class="label">Video by: </span>(.*?)</a>', webpage, - 'uploader', fatal=False) + iframe_path = self._html_search_regex( + r'<iframe id="player_iframe"[^>]+src="([^"]+)"', webpage, + 'iframe path') + iframe = self._download_webpage( + compat_urlparse.urljoin(url, iframe_path), display_id, + 'Downloading iframe') + youtube_id = self._search_regex( + r'www.youtube.com/embed/(.{11})', iframe, 'youtube id') return { - 'id': params['video_id'], + '_type': 'url_transparent', 'display_id': display_id, - 'title': params['title'], - 'url': video_url, - 'description': params.get('description'), - 'thumbnail': params.get('main_thumb'), - 'duration': int_or_none(params.get('duration')), - 'uploader': uploader, + 'url': 'https://youtube.com/watch?v=%s' % youtube_id, } diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 43e8e619f..321ce5ce7 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -72,7 +72,7 @@ class NRKIE(InfoExtractor): class NRKTVIE(InfoExtractor): - _VALID_URL = r'http://tv\.nrk(?:super)?\.no/(?:serie/[^/]+|program)/(?P<id>[a-zA-Z]{4}\d{8})' + _VALID_URL = r'http://tv\.nrk(?:super)?\.no/(?:serie/[^/]+|program)/(?P<id>[a-zA-Z]{4}\d{8})(?:/\d{2}-\d{2}-\d{4})?(?:#del=(?P<part_id>\d+))?' _TESTS = [ { @@ -85,7 +85,7 @@ class NRKTVIE(InfoExtractor): 'description': 'md5:bdea103bc35494c143c6a9acdd84887a', 'upload_date': '20140523', 'duration': 1741.52, - } + }, }, { 'url': 'http://tv.nrk.no/program/mdfp15000514', @@ -97,39 +97,119 @@ class NRKTVIE(InfoExtractor): 'description': 'md5:654c12511f035aed1e42bdf5db3b206a', 'upload_date': '20140524', 'duration': 4605.0, - } + }, }, + { + # single playlist video + 'url': 'http://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015#del=2', + 'md5': 'adbd1dbd813edaf532b0a253780719c2', + 'info_dict': { + 'id': 'MSPO40010515-part2', + 'ext': 'flv', + 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 2:2)', + 'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26', + 'upload_date': '20150106', + }, + 'skip': 'Only works from Norway', + }, + { + 'url': 'http://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015', + 'playlist': [ + { + 'md5': '9480285eff92d64f06e02a5367970a7a', + 'info_dict': { + 'id': 'MSPO40010515-part1', + 'ext': 'flv', + 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 1:2)', + 'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26', + 'upload_date': '20150106', + }, + }, + { + 'md5': 'adbd1dbd813edaf532b0a253780719c2', + 'info_dict': { + 'id': 'MSPO40010515-part2', + 'ext': 'flv', + 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 2:2)', + 'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26', + 'upload_date': '20150106', + }, + }, + ], + 'info_dict': { + 'id': 'MSPO40010515', + 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn', + 'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26', + 'upload_date': '20150106', + 'duration': 6947.5199999999995, + }, + 'skip': 'Only works from Norway', + } ] + def _extract_f4m(self, manifest_url, video_id): + return self._extract_f4m_formats(manifest_url + '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124', video_id) + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') - - page = self._download_webpage(url, video_id) - - title = self._html_search_meta('title', page, 'title') - description = self._html_search_meta('description', page, 'description') - thumbnail = self._html_search_regex(r'data-posterimage="([^"]+)"', page, 'thumbnail', fatal=False) - upload_date = unified_strdate(self._html_search_meta('rightsfrom', page, 'upload date', fatal=False)) - duration = float_or_none( - self._html_search_regex(r'data-duration="([^"]+)"', page, 'duration', fatal=False)) + part_id = mobj.group('part_id') + + webpage = self._download_webpage(url, video_id) + + title = self._html_search_meta( + 'title', webpage, 'title') + description = self._html_search_meta( + 'description', webpage, 'description') + + thumbnail = self._html_search_regex( + r'data-posterimage="([^"]+)"', + webpage, 'thumbnail', fatal=False) + upload_date = unified_strdate(self._html_search_meta( + 'rightsfrom', webpage, 'upload date', fatal=False)) + duration = float_or_none(self._html_search_regex( + r'data-duration="([^"]+)"', + webpage, 'duration', fatal=False)) + + # playlist + parts = re.findall( + r'<a href="#del=(\d+)"[^>]+data-argument="([^"]+)">([^<]+)</a>', webpage) + if parts: + entries = [] + for current_part_id, stream_url, part_title in parts: + if part_id and current_part_id != part_id: + continue + video_part_id = '%s-part%s' % (video_id, current_part_id) + formats = self._extract_f4m(stream_url, video_part_id) + entries.append({ + 'id': video_part_id, + 'title': part_title, + 'description': description, + 'thumbnail': thumbnail, + 'upload_date': upload_date, + 'formats': formats, + }) + if part_id: + if entries: + return entries[0] + else: + playlist = self.playlist_result(entries, video_id, title, description) + playlist.update({ + 'thumbnail': thumbnail, + 'upload_date': upload_date, + 'duration': duration, + }) + return playlist formats = [] - f4m_url = re.search(r'data-media="([^"]+)"', page) + f4m_url = re.search(r'data-media="([^"]+)"', webpage) if f4m_url: - formats.append({ - 'url': f4m_url.group(1) + '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124', - 'format_id': 'f4m', - 'ext': 'flv', - }) + formats.extend(self._extract_f4m(f4m_url.group(1), video_id)) - m3u8_url = re.search(r'data-hls-media="([^"]+)"', page) + m3u8_url = re.search(r'data-hls-media="([^"]+)"', webpage) if m3u8_url: - formats.append({ - 'url': m3u8_url.group(1), - 'format_id': 'm3u8', - }) + formats.extend(self._extract_m3u8_formats(m3u8_url.group(1), video_id, 'mp4')) self._sort_formats(formats) diff --git a/youtube_dl/extractor/sexykarma.py b/youtube_dl/extractor/sexykarma.py index c833fc8ee..6446d26dc 100644 --- a/youtube_dl/extractor/sexykarma.py +++ b/youtube_dl/extractor/sexykarma.py @@ -24,7 +24,7 @@ class SexyKarmaIE(InfoExtractor): 'title': 'Taking a quick pee.', 'thumbnail': 're:^https?://.*\.jpg$', 'uploader': 'wildginger7', - 'upload_date': '20141007', + 'upload_date': '20141008', 'duration': 22, 'view_count': int, 'comment_count': int, @@ -45,6 +45,7 @@ class SexyKarmaIE(InfoExtractor): 'view_count': int, 'comment_count': int, 'categories': list, + 'age_limit': 18, } }, { 'url': 'http://www.watchindianporn.net/video/desi-dancer-namrata-stripping-completely-nude-and-dancing-on-a-hot-number-dW2mtctxJfs.html', @@ -61,6 +62,7 @@ class SexyKarmaIE(InfoExtractor): 'view_count': int, 'comment_count': int, 'categories': list, + 'age_limit': 18, } }] @@ -114,4 +116,5 @@ class SexyKarmaIE(InfoExtractor): 'view_count': view_count, 'comment_count': comment_count, 'categories': categories, + 'age_limit': 18, } diff --git a/youtube_dl/extractor/teachertube.py b/youtube_dl/extractor/teachertube.py index 6c3445d79..82675431f 100644 --- a/youtube_dl/extractor/teachertube.py +++ b/youtube_dl/extractor/teachertube.py @@ -57,9 +57,7 @@ class TeacherTubeIE(InfoExtractor): }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) title = self._html_search_meta('title', webpage, 'title', fatal=True) diff --git a/youtube_dl/extractor/tunein.py b/youtube_dl/extractor/tunein.py index 4ce5aeeba..b6b1f2568 100644 --- a/youtube_dl/extractor/tunein.py +++ b/youtube_dl/extractor/tunein.py @@ -24,7 +24,7 @@ class TuneInIE(InfoExtractor): _INFO_DICT = { 'id': '34682', 'title': 'Jazz 24 on 88.5 Jazz24 - KPLU-HD2', - 'ext': 'AAC', + 'ext': 'aac', 'thumbnail': 're:^https?://.*\.png$', 'location': 'Tacoma, WA', } @@ -78,14 +78,21 @@ class TuneInIE(InfoExtractor): for stream in streams: if stream.get('Type') == 'Live': is_live = True + reliability = stream.get('Reliability') + format_note = ( + 'Reliability: %d%%' % reliability + if reliability is not None else None) formats.append({ + 'preference': ( + 0 if reliability is None or reliability > 90 + else 1), 'abr': stream.get('Bandwidth'), - 'ext': stream.get('MediaType'), + 'ext': stream.get('MediaType').lower(), 'acodec': stream.get('MediaType'), 'vcodec': 'none', 'url': stream.get('Url'), - # Sometimes streams with the highest quality do not exist - 'preference': stream.get('Reliability'), + 'source_preference': reliability, + 'format_note': format_note, }) self._sort_formats(formats) diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py index 15f315298..944901e14 100644 --- a/youtube_dl/extractor/viki.py +++ b/youtube_dl/extractor/viki.py @@ -17,7 +17,6 @@ class VikiIE(SubtitlesInfoExtractor): _VALID_URL = r'^https?://(?:www\.)?viki\.com/videos/(?P<id>[0-9]+v)' _TEST = { 'url': 'http://www.viki.com/videos/1023585v-heirs-episode-14', - 'md5': 'a21454021c2646f5433514177e2caa5f', 'info_dict': { 'id': '1023585v', 'ext': 'mp4', @@ -31,8 +30,7 @@ class VikiIE(SubtitlesInfoExtractor): } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group(1) + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) title = self._og_search_title(webpage) diff --git a/youtube_dl/extractor/washingtonpost.py b/youtube_dl/extractor/washingtonpost.py index 88bbbb219..c17bebd6e 100644 --- a/youtube_dl/extractor/washingtonpost.py +++ b/youtube_dl/extractor/washingtonpost.py @@ -10,14 +10,14 @@ from ..utils import ( class WashingtonPostIE(InfoExtractor): - _VALID_URL = r'^https?://(?:www\.)?washingtonpost\.com/.*?/(?P<id>[^/]+)/(?:$|[?#])' + _VALID_URL = r'https?://(?:www\.)?washingtonpost\.com/.*?/(?P<id>[^/]+)/(?:$|[?#])' _TEST = { 'url': 'http://www.washingtonpost.com/sf/national/2014/03/22/sinkhole-of-bureaucracy/', 'info_dict': { 'title': 'Sinkhole of bureaucracy', }, 'playlist': [{ - 'md5': 'c3f4b4922ffa259243f68e928db2db8c', + 'md5': '79132cc09ec5309fa590ae46e4cc31bc', 'info_dict': { 'id': 'fc433c38-b146-11e3-b8b3-44b1d1cd4c1f', 'ext': 'mp4', @@ -29,7 +29,7 @@ class WashingtonPostIE(InfoExtractor): 'upload_date': '20140322', }, }, { - 'md5': 'f645a07652c2950cd9134bb852c5f5eb', + 'md5': 'e1d5734c06865cc504ad99dc2de0d443', 'info_dict': { 'id': '41255e28-b14a-11e3-b8b3-44b1d1cd4c1f', 'ext': 'mp4', @@ -44,10 +44,9 @@ class WashingtonPostIE(InfoExtractor): } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - page_id = mobj.group('id') - + page_id = self._match_id(url) webpage = self._download_webpage(url, page_id) + title = self._og_search_title(webpage) uuids = re.findall(r'data-video-uuid="([^"]+)"', webpage) entries = [] diff --git a/youtube_dl/extractor/xtube.py b/youtube_dl/extractor/xtube.py index 95f1c8f3c..e8490b028 100644 --- a/youtube_dl/extractor/xtube.py +++ b/youtube_dl/extractor/xtube.py @@ -95,6 +95,7 @@ class XTubeUserIE(InfoExtractor): 'url': 'http://www.xtube.com/community/profile.php?user=greenshowers', 'info_dict': { 'id': 'greenshowers', + 'age_limit': 18, }, 'playlist_mincount': 155, } @@ -124,6 +125,7 @@ class XTubeUserIE(InfoExtractor): return { '_type': 'playlist', 'id': username, + 'age_limit': 18, 'entries': [{ '_type': 'url', 'url': eurl, diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index d1bbf0b01..e71956071 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -287,7 +287,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'}, '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'}, + '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'}, '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'VP9'}, + '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'}, # Dash webm audio '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50}, @@ -412,7 +414,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): 'id': 'HtVdAasjOgU', 'ext': 'mp4', 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer', - 'description': 'md5:eca57043abae25130f58f655ad9a7771', + 'description': 're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}', 'uploader': 'The Witcher', 'uploader_id': 'WitcherGame', 'upload_date': '20140605', @@ -1046,7 +1048,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): for f in formats: if f['format_id'] in dash_keys: f['format_id'] = 'nondash-%s' % f['format_id'] - f['preference'] -= 10000 + f['preference'] = f.get('preference', 0) - 10000 formats.extend(dash_formats) self._sort_formats(formats) diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py index 74c76a9a0..98f15177b 100644 --- a/youtube_dl/extractor/zdf.py +++ b/youtube_dl/extractor/zdf.py @@ -119,7 +119,7 @@ class ZDFChannelIE(InfoExtractor): 'info_dict': { 'id': '1586442', }, - 'playlist_count': 4, + 'playlist_count': 3, } _PAGE_SIZE = 50 diff --git a/youtube_dl/options.py b/youtube_dl/options.py index a018c1d71..058342dec 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -267,10 +267,12 @@ def parseOpts(overrideArguments=None): action='store', dest='format', metavar='FORMAT', default=None, help=( 'video format code, specify the order of preference using' - ' slashes: -f 22/17/18 . -f mp4 , -f m4a and -f flv are also' - ' supported. You can also use the special names "best",' - ' "bestvideo", "bestaudio", "worst", "worstvideo" and' - ' "worstaudio". By default, youtube-dl will pick the best quality.' + ' slashes, as in -f 22/17/18 . ' + ' Instead of format codes, you can select by extension for the ' + 'extensions aac, m4a, mp3, mp4, ogg, wav, webm. ' + 'You can also use the special names "best",' + ' "bestvideo", "bestaudio", "worst". ' + ' By default, youtube-dl will pick the best quality.' ' Use commas to download multiple audio formats, such as' ' -f 136/137/mp4/bestvideo,140/m4a/bestaudio.' ' You can merge the video and audio of two formats into a single' diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index d4951c406..29739a483 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1560,3 +1560,13 @@ def urlhandle_detect_ext(url_handle): getheader = url_handle.info().getheader return getheader('Content-Type').split("/")[1] + + +def age_restricted(content_limit, age_limit): + """ Returns True iff the content should be blocked """ + + if age_limit is None: # No limit set + return False + if content_limit is None: + return False # Content available for everyone + return age_limit < content_limit diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 086f0ebf0..32019e362 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2015.01.05' +__version__ = '2015.01.08' |