diff options
Diffstat (limited to 'youtube_dl/extractor')
-rw-r--r-- | youtube_dl/extractor/__init__.py | 3 | ||||
-rw-r--r-- | youtube_dl/extractor/behindkink.py | 29 | ||||
-rw-r--r-- | youtube_dl/extractor/bet.py | 108 | ||||
-rw-r--r-- | youtube_dl/extractor/bliptv.py | 38 | ||||
-rw-r--r-- | youtube_dl/extractor/common.py | 50 | ||||
-rw-r--r-- | youtube_dl/extractor/facebook.py | 12 | ||||
-rw-r--r-- | youtube_dl/extractor/ntv.py | 2 | ||||
-rw-r--r-- | youtube_dl/extractor/smotri.py | 35 | ||||
-rw-r--r-- | youtube_dl/extractor/tvigle.py | 31 | ||||
-rw-r--r-- | youtube_dl/extractor/tvplay.py | 4 | ||||
-rw-r--r-- | youtube_dl/extractor/youtube.py | 143 | ||||
-rw-r--r-- | youtube_dl/extractor/zdf.py | 52 |
12 files changed, 385 insertions, 122 deletions
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index c0dcdaf02..6b7660ab1 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -30,6 +30,7 @@ from .bandcamp import BandcampIE, BandcampAlbumIE from .bbccouk import BBCCoUkIE from .beeg import BeegIE from .behindkink import BehindKinkIE +from .bet import BetIE from .bild import BildIE from .bilibili import BiliBiliIE from .blinkx import BlinkxIE @@ -525,7 +526,7 @@ from .youtube import ( YoutubeUserIE, YoutubeWatchLaterIE, ) -from .zdf import ZDFIE +from .zdf import ZDFIE, ZDFChannelIE from .zingmp3 import ( ZingMp3SongIE, ZingMp3AlbumIE, diff --git a/youtube_dl/extractor/behindkink.py b/youtube_dl/extractor/behindkink.py index 31fdc0dcc..1bdc25812 100644 --- a/youtube_dl/extractor/behindkink.py +++ b/youtube_dl/extractor/behindkink.py @@ -10,15 +10,15 @@ from ..utils import url_basename class BehindKinkIE(InfoExtractor): _VALID_URL = r'http://(?:www\.)?behindkink\.com/(?P<year>[0-9]{4})/(?P<month>[0-9]{2})/(?P<day>[0-9]{2})/(?P<id>[^/#?_]+)' _TEST = { - 'url': 'http://www.behindkink.com/2014/08/14/ab1576-performers-voice-finally-heard-the-bill-is-killed/', - 'md5': '41ad01222b8442089a55528fec43ec01', + 'url': 'http://www.behindkink.com/2014/12/05/what-are-you-passionate-about-marley-blaze/', + 'md5': '507b57d8fdcd75a41a9a7bdb7989c762', 'info_dict': { - 'id': '36370', + 'id': '37127', 'ext': 'mp4', - 'title': 'AB1576 - PERFORMERS VOICE FINALLY HEARD - THE BILL IS KILLED!', - 'description': 'The adult industry voice was finally heard as Assembly Bill 1576 remained\xa0 in suspense today at the Senate Appropriations Hearing. AB1576 was, among other industry damaging issues, a condom mandate...', - 'upload_date': '20140814', - 'thumbnail': 'http://www.behindkink.com/wp-content/uploads/2014/08/36370_AB1576_Win.jpg', + 'title': 'What are you passionate about – Marley Blaze', + 'description': 'md5:aee8e9611b4ff70186f752975d9b94b4', + 'upload_date': '20141205', + 'thumbnail': 'http://www.behindkink.com/wp-content/uploads/2014/12/blaze-1.jpg', 'age_limit': 18, } } @@ -26,26 +26,19 @@ class BehindKinkIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) display_id = mobj.group('id') - year = mobj.group('year') - month = mobj.group('month') - day = mobj.group('day') - upload_date = year + month + day webpage = self._download_webpage(url, display_id) video_url = self._search_regex( - r"'file':\s*'([^']+)'", - webpage, 'URL base') - - video_id = url_basename(video_url) - video_id = video_id.split('_')[0] + r'<source src="([^"]+)"', webpage, 'video URL') + video_id = url_basename(video_url).split('_')[0] + upload_date = mobj.group('year') + mobj.group('month') + mobj.group('day') return { 'id': video_id, + 'display_id': display_id, 'url': video_url, - 'ext': 'mp4', 'title': self._og_search_title(webpage), - 'display_id': display_id, 'thumbnail': self._og_search_thumbnail(webpage), 'description': self._og_search_description(webpage), 'upload_date': upload_date, diff --git a/youtube_dl/extractor/bet.py b/youtube_dl/extractor/bet.py new file mode 100644 index 000000000..c1fc433f7 --- /dev/null +++ b/youtube_dl/extractor/bet.py @@ -0,0 +1,108 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + compat_urllib_parse, + xpath_text, + xpath_with_ns, + int_or_none, + parse_iso8601, +) + + +class BetIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?bet\.com/(?:[^/]+/)+(?P<id>.+?)\.html' + _TESTS = [ + { + 'url': 'http://www.bet.com/news/politics/2014/12/08/in-bet-exclusive-obama-talks-race-and-racism.html', + 'info_dict': { + 'id': '417cd61c-c793-4e8e-b006-e445ecc45add', + 'display_id': 'in-bet-exclusive-obama-talks-race-and-racism', + 'ext': 'flv', + 'title': 'BET News Presents: A Conversation With President Obama', + 'description': 'md5:5a88d8ae912c1b33e090290af7ec33c6', + 'duration': 1534, + 'timestamp': 1418075340, + 'upload_date': '20141208', + 'uploader': 'admin', + 'thumbnail': 're:(?i)^https?://.*\.jpg$', + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + }, + { + 'url': 'http://www.bet.com/video/news/national/2014/justice-for-ferguson-a-community-reacts.html', + 'info_dict': { + 'id': '4160e53b-ad41-43b1-980f-8d85f63121f4', + 'display_id': 'justice-for-ferguson-a-community-reacts', + 'ext': 'flv', + 'title': 'Justice for Ferguson: A Community Reacts', + 'description': 'A BET News special.', + 'duration': 1696, + 'timestamp': 1416942360, + 'upload_date': '20141125', + 'uploader': 'admin', + 'thumbnail': 're:(?i)^https?://.*\.jpg$', + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + } + ] + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + media_url = compat_urllib_parse.unquote(self._search_regex( + [r'mediaURL\s*:\s*"([^"]+)"', r"var\s+mrssMediaUrl\s*=\s*'([^']+)'"], + webpage, 'media URL')) + + mrss = self._download_xml(media_url, display_id) + + item = mrss.find('./channel/item') + + NS_MAP = { + 'dc': 'http://purl.org/dc/elements/1.1/', + 'media': 'http://search.yahoo.com/mrss/', + 'ka': 'http://kickapps.com/karss', + } + + title = xpath_text(item, './title', 'title') + description = xpath_text( + item, './description', 'description', fatal=False) + + video_id = xpath_text(item, './guid', 'video id', fatal=False) + + timestamp = parse_iso8601(xpath_text( + item, xpath_with_ns('./dc:date', NS_MAP), + 'upload date', fatal=False)) + uploader = xpath_text( + item, xpath_with_ns('./dc:creator', NS_MAP), + 'uploader', fatal=False) + + media_content = item.find( + xpath_with_ns('./media:content', NS_MAP)) + duration = int_or_none(media_content.get('duration')) + smil_url = media_content.get('url') + + thumbnail = media_content.find( + xpath_with_ns('./media:thumbnail', NS_MAP)).get('url') + + formats = self._extract_smil_formats(smil_url, display_id) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'uploader': uploader, + 'duration': duration, + 'formats': formats, + } diff --git a/youtube_dl/extractor/bliptv.py b/youtube_dl/extractor/bliptv.py index da47f27bd..14b814120 100644 --- a/youtube_dl/extractor/bliptv.py +++ b/youtube_dl/extractor/bliptv.py @@ -4,13 +4,17 @@ import re from .common import InfoExtractor from .subtitles import SubtitlesInfoExtractor -from ..utils import ( + +from ..compat import ( + compat_str, compat_urllib_request, - unescapeHTML, - parse_iso8601, compat_urlparse, +) +from ..utils import ( clean_html, - compat_str, + int_or_none, + parse_iso8601, + unescapeHTML, ) @@ -78,7 +82,25 @@ class BlipTVIE(SubtitlesInfoExtractor): 'uploader': 'NostalgiaCritic', 'uploader_id': '246467', } - } + }, + { + # https://github.com/rg3/youtube-dl/pull/4404 + 'note': 'Audio only', + 'url': 'http://blip.tv/hilarios-productions/weekly-manga-recap-kingdom-7119982', + 'md5': '76c0a56f24e769ceaab21fbb6416a351', + 'info_dict': { + 'id': '7103299', + 'ext': 'flv', + 'title': 'Weekly Manga Recap: Kingdom', + 'description': 'And then Shin breaks the enemy line, and he's all like HWAH! And then he slices a guy and it's all like FWASHING! And... it's really hard to describe the best parts of this series without breaking down into sound effects, okay?', + 'timestamp': 1417660321, + 'upload_date': '20141204', + 'uploader': 'The Rollo T', + 'uploader_id': '407429', + 'duration': 7251, + 'vcodec': 'none', + } + }, ] def _real_extract(self, url): @@ -145,11 +167,11 @@ class BlipTVIE(SubtitlesInfoExtractor): 'url': real_url, 'format_id': role, 'format_note': media_type, - 'vcodec': media_content.get(blip('vcodec')), + 'vcodec': media_content.get(blip('vcodec')) or 'none', 'acodec': media_content.get(blip('acodec')), 'filesize': media_content.get('filesize'), - 'width': int(media_content.get('width')), - 'height': int(media_content.get('height')), + 'width': int_or_none(media_content.get('width')), + 'height': int_or_none(media_content.get('height')), }) self._sort_formats(formats) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 2faaf6226..2277ec6ab 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -174,9 +174,10 @@ class InfoExtractor(object): _type "url" indicates that the video must be extracted from another location, possibly by a different extractor. Its only required key is: "url" - the next URL to extract. - - Additionally, it may have properties believed to be identical to the - resolved entity, for example "title" if the title of the referred video is + The key "ie_key" can be set to the class name (minus the trailing "IE", + e.g. "Youtube") if the extractor class is known in advance. + Additionally, the dictionary may have any properties of the resolved entity + known in advance, for example "title" if the title of the referred video is known ahead of time. @@ -792,6 +793,49 @@ class InfoExtractor(object): self._sort_formats(formats) return formats + # TODO: improve extraction + def _extract_smil_formats(self, smil_url, video_id): + smil = self._download_xml( + smil_url, video_id, 'Downloading SMIL file', + 'Unable to download SMIL file') + + base = smil.find('./head/meta').get('base') + + formats = [] + rtmp_count = 0 + for video in smil.findall('./body/switch/video'): + src = video.get('src') + if not src: + continue + bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000) + width = int_or_none(video.get('width')) + height = int_or_none(video.get('height')) + proto = video.get('proto') + if not proto: + if base: + if base.startswith('rtmp'): + proto = 'rtmp' + elif base.startswith('http'): + proto = 'http' + ext = video.get('ext') + if proto == 'm3u8': + formats.extend(self._extract_m3u8_formats(src, video_id, ext)) + elif proto == 'rtmp': + rtmp_count += 1 + streamer = video.get('streamer') or base + formats.append({ + 'url': streamer, + 'play_path': src, + 'ext': 'flv', + 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate), + 'tbr': bitrate, + 'width': width, + 'height': height, + }) + self._sort_formats(formats) + + return formats + def _live_title(self, name): """ Generate the title for a live video """ now = datetime.datetime.now() diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 2139f68aa..1ad4e77a8 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -13,9 +13,10 @@ from ..compat import ( compat_urllib_request, ) from ..utils import ( - urlencode_postdata, ExtractorError, + int_or_none, limit_length, + urlencode_postdata, ) @@ -36,7 +37,6 @@ class FacebookIE(InfoExtractor): 'info_dict': { 'id': '637842556329505', 'ext': 'mp4', - 'duration': 38, 'title': 're:Did you know Kei Nishikori is the first Asian man to ever reach a Grand Slam', } }, { @@ -107,9 +107,7 @@ class FacebookIE(InfoExtractor): self._login() def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - + video_id = self._match_id(url) url = 'https://www.facebook.com/video/video.php?v=%s' % video_id webpage = self._download_webpage(url, video_id) @@ -149,6 +147,6 @@ class FacebookIE(InfoExtractor): 'id': video_id, 'title': video_title, 'url': video_url, - 'duration': int(video_data['video_duration']), - 'thumbnail': video_data['thumbnail_src'], + 'duration': int_or_none(video_data.get('video_duration')), + 'thumbnail': video_data.get('thumbnail_src'), } diff --git a/youtube_dl/extractor/ntv.py b/youtube_dl/extractor/ntv.py index 13c8d79cd..ee740cd9c 100644 --- a/youtube_dl/extractor/ntv.py +++ b/youtube_dl/extractor/ntv.py @@ -130,7 +130,7 @@ class NTVIE(InfoExtractor): 'rtmp_conn': 'B:1', 'player_url': 'http://www.ntv.ru/swf/vps1.swf?update=20131128', 'page_url': 'http://www.ntv.ru', - 'flash_ver': 'LNX 11,2,202,341', + 'flash_version': 'LNX 11,2,202,341', 'rtmp_live': True, 'ext': 'flv', 'filesize': int(size.text), diff --git a/youtube_dl/extractor/smotri.py b/youtube_dl/extractor/smotri.py index 0751efc61..646af3cc9 100644 --- a/youtube_dl/extractor/smotri.py +++ b/youtube_dl/extractor/smotri.py @@ -274,15 +274,18 @@ class SmotriBroadcastIE(InfoExtractor): broadcast_page = self._download_webpage(broadcast_url, broadcast_id, 'Downloading broadcast page') if re.search('>Режиссер с логином <br/>"%s"<br/> <span>не существует<' % broadcast_id, broadcast_page) is not None: - raise ExtractorError('Broadcast %s does not exist' % broadcast_id, expected=True) + raise ExtractorError( + 'Broadcast %s does not exist' % broadcast_id, expected=True) # Adult content if re.search('EroConfirmText">', broadcast_page) is not None: (username, password) = self._get_login_info() if username is None: - raise ExtractorError('Erotic broadcasts allowed only for registered users, ' - 'use --username and --password options to provide account credentials.', expected=True) + raise ExtractorError( + 'Erotic broadcasts allowed only for registered users, ' + 'use --username and --password options to provide account credentials.', + expected=True) login_form = { 'login-hint53': '1', @@ -291,9 +294,11 @@ class SmotriBroadcastIE(InfoExtractor): 'password': password, } - request = compat_urllib_request.Request(broadcast_url + '/?no_redirect=1', compat_urllib_parse.urlencode(login_form)) + request = compat_urllib_request.Request( + broadcast_url + '/?no_redirect=1', compat_urllib_parse.urlencode(login_form)) request.add_header('Content-Type', 'application/x-www-form-urlencoded') - broadcast_page = self._download_webpage(request, broadcast_id, 'Logging in and confirming age') + broadcast_page = self._download_webpage( + request, broadcast_id, 'Logging in and confirming age') if re.search('>Неверный логин или пароль<', broadcast_page) is not None: raise ExtractorError('Unable to log in: bad username or password', expected=True) @@ -303,7 +308,7 @@ class SmotriBroadcastIE(InfoExtractor): adult_content = False ticket = self._html_search_regex( - 'window\.broadcast_control\.addFlashVar\\(\'file\', \'([^\']+)\'\\);', + r"window\.broadcast_control\.addFlashVar\('file'\s*,\s*'([^']+)'\)", broadcast_page, 'broadcast ticket') url = 'http://smotri.com/broadcast/view/url/?ticket=%s' % ticket @@ -312,26 +317,31 @@ class SmotriBroadcastIE(InfoExtractor): if broadcast_password: url += '&pass=%s' % hashlib.md5(broadcast_password.encode('utf-8')).hexdigest() - broadcast_json_page = self._download_webpage(url, broadcast_id, 'Downloading broadcast JSON') + broadcast_json_page = self._download_webpage( + url, broadcast_id, 'Downloading broadcast JSON') try: broadcast_json = json.loads(broadcast_json_page) protected_broadcast = broadcast_json['_pass_protected'] == 1 if protected_broadcast and not broadcast_password: - raise ExtractorError('This broadcast is protected by a password, use the --video-password option', expected=True) + raise ExtractorError( + 'This broadcast is protected by a password, use the --video-password option', + expected=True) broadcast_offline = broadcast_json['is_play'] == 0 if broadcast_offline: raise ExtractorError('Broadcast %s is offline' % broadcast_id, expected=True) rtmp_url = broadcast_json['_server'] - if not rtmp_url.startswith('rtmp://'): + mobj = re.search(r'^rtmp://[^/]+/(?P<app>.+)/?$', rtmp_url) + if not mobj: raise ExtractorError('Unexpected broadcast rtmp URL') broadcast_playpath = broadcast_json['_streamName'] + broadcast_app = '%s/%s' % (mobj.group('app'), broadcast_json['_vidURL']) broadcast_thumbnail = broadcast_json['_imgURL'] - broadcast_title = broadcast_json['title'] + broadcast_title = self._live_title(broadcast_json['title']) broadcast_description = broadcast_json['description'] broadcaster_nick = broadcast_json['nick'] broadcaster_login = broadcast_json['login'] @@ -352,6 +362,9 @@ class SmotriBroadcastIE(InfoExtractor): 'age_limit': 18 if adult_content else 0, 'ext': 'flv', 'play_path': broadcast_playpath, + 'player_url': 'http://pics.smotri.com/broadcast_play.swf', + 'app': broadcast_app, 'rtmp_live': True, - 'rtmp_conn': rtmp_conn + 'rtmp_conn': rtmp_conn, + 'is_live': True, } diff --git a/youtube_dl/extractor/tvigle.py b/youtube_dl/extractor/tvigle.py index d81d1d1a6..ba65996dc 100644 --- a/youtube_dl/extractor/tvigle.py +++ b/youtube_dl/extractor/tvigle.py @@ -1,32 +1,30 @@ # encoding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..utils import ( float_or_none, - str_to_int, + parse_age_limit, ) class TvigleIE(InfoExtractor): IE_NAME = 'tvigle' IE_DESC = 'Интернет-телевидение Tvigle.ru' - _VALID_URL = r'http://(?:www\.)?tvigle\.ru/(?:[^/]+/)+(?P<display_id>[^/]+)/$' + _VALID_URL = r'http://(?:www\.)?tvigle\.ru/(?:[^/]+/)+(?P<id>[^/]+)/$' _TESTS = [ { - 'url': 'http://www.tvigle.ru/video/brat/', - 'md5': 'ff4344a4894b0524441fb6f8218dc716', + 'url': 'http://www.tvigle.ru/video/sokrat/', + 'md5': '36514aed3657d4f70b4b2cef8eb520cd', 'info_dict': { - 'id': '5118490', - 'display_id': 'brat', - 'ext': 'mp4', - 'title': 'Брат', - 'description': 'md5:d16ac7c0b47052ea51fddb92c4e413eb', - 'duration': 5722.6, - 'age_limit': 16, + 'id': '1848932', + 'display_id': 'sokrat', + 'ext': 'flv', + 'title': 'Сократ', + 'description': 'md5:a05bd01be310074d5833efc6743be95e', + 'duration': 6586, + 'age_limit': 0, }, }, { @@ -44,8 +42,7 @@ class TvigleIE(InfoExtractor): ] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - display_id = mobj.group('display_id') + display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) @@ -60,8 +57,8 @@ class TvigleIE(InfoExtractor): title = item['title'] description = item['description'] thumbnail = item['thumbnail'] - duration = float_or_none(item['durationMilliseconds'], 1000) - age_limit = str_to_int(item['ageRestrictions']) + duration = float_or_none(item.get('durationMilliseconds'), 1000) + age_limit = parse_age_limit(item.get('ageRestrictions')) formats = [] for vcodec, fmts in item['videos'].items(): diff --git a/youtube_dl/extractor/tvplay.py b/youtube_dl/extractor/tvplay.py index eb9473754..0157392cc 100644 --- a/youtube_dl/extractor/tvplay.py +++ b/youtube_dl/extractor/tvplay.py @@ -182,8 +182,8 @@ class TVPlayIE(InfoExtractor): 'http://playapi.mtgx.tv/v1/videos/%s' % video_id, video_id, 'Downloading video JSON') if video['is_geo_blocked']: - raise ExtractorError( - 'This content is not available in your country due to copyright reasons', expected=True) + self.report_warning( + 'This content might not be available in your country due to copyright reasons') streams = self._download_json( 'http://playapi.mtgx.tv/v1/videos/stream/%s' % video_id, video_id, 'Downloading streams JSON') diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 8b6e591a4..7b6179a2a 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -14,23 +14,24 @@ from .common import InfoExtractor, SearchInfoExtractor from .subtitles import SubtitlesInfoExtractor from ..jsinterp import JSInterpreter from ..swfinterp import SWFInterpreter -from ..utils import ( +from ..compat import ( compat_chr, compat_parse_qs, compat_urllib_parse, compat_urllib_request, compat_urlparse, compat_str, - +) +from ..utils import ( clean_html, - get_element_by_id, - get_element_by_attribute, ExtractorError, + get_element_by_attribute, + get_element_by_id, int_or_none, OnDemandPagedList, + orderedSet, unescapeHTML, unified_strdate, - orderedSet, uppercase_escape, ) @@ -417,6 +418,38 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): 'upload_date': '20140605', }, }, + # video_info is None (https://github.com/rg3/youtube-dl/issues/4421) + { + 'url': '__2ABJjxzNo', + 'info_dict': { + 'id': '__2ABJjxzNo', + 'ext': 'mp4', + 'upload_date': '20100430', + 'uploader_id': 'deadmau5', + 'description': 'md5:12c56784b8032162bb936a5f76d55360', + 'uploader': 'deadmau5', + 'title': 'Deadmau5 - Some Chords (HD)', + }, + 'expected_warnings': [ + 'DASH manifest missing', + ] + }, + # Olympics (https://github.com/rg3/youtube-dl/issues/4431) + { + 'url': 'lqQg6PlCWgI', + 'info_dict': { + 'id': 'lqQg6PlCWgI', + 'ext': 'mp4', + 'upload_date': '20120731', + 'uploader_id': 'olympic', + 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games', + 'uploader': 'Olympics', + 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games', + }, + 'params': { + 'skip_download': 'requires avconv', + } + }, ] def __init__(self, *args, **kwargs): @@ -666,6 +699,46 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id return self._download_webpage(url, video_id, note='Searching for annotations.', errnote='Unable to download video annotations.') + def _parse_dash_manifest( + self, video_id, dash_manifest_url, player_url, age_gate): + def decrypt_sig(mobj): + s = mobj.group(1) + dec_s = self._decrypt_signature(s, video_id, player_url, age_gate) + return '/signature/%s' % dec_s + dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url) + dash_doc = self._download_xml( + dash_manifest_url, video_id, + note='Downloading DASH manifest', + errnote='Could not download DASH manifest') + + formats = [] + for r in dash_doc.findall('.//{urn:mpeg:DASH:schema:MPD:2011}Representation'): + url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL') + if url_el is None: + continue + format_id = r.attrib['id'] + video_url = url_el.text + filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength')) + f = { + 'format_id': format_id, + 'url': video_url, + 'width': int_or_none(r.attrib.get('width')), + 'tbr': int_or_none(r.attrib.get('bandwidth'), 1000), + 'asr': int_or_none(r.attrib.get('audioSamplingRate')), + 'filesize': filesize, + 'fps': int_or_none(r.attrib.get('frameRate')), + } + try: + existing_format = next( + fo for fo in formats + if fo['format_id'] == format_id) + except StopIteration: + f.update(self._formats.get(format_id, {})) + formats.append(f) + else: + existing_format.update(f) + return formats + def _real_extract(self, url): proto = ( 'http' if self._downloader.params.get('prefer_insecure', False) @@ -800,7 +873,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): m_cat_container = self._search_regex( r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>', - video_webpage, 'categories', fatal=False) + video_webpage, 'categories', default=None) if m_cat_container: category = self._html_search_regex( r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category', @@ -878,7 +951,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): 'url': video_info['conn'][0], 'player_url': player_url, }] - elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1: + elif len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1: encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0] if 'rtmpe%3Dyes' in encoded_url_map: raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True) @@ -943,51 +1016,17 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): # Look for the DASH manifest if self._downloader.params.get('youtube_include_dash_manifest', True): - try: - # The DASH manifest used needs to be the one from the original video_webpage. - # The one found in get_video_info seems to be using different signatures. - # However, in the case of an age restriction there won't be any embedded dashmpd in the video_webpage. - # Luckily, it seems, this case uses some kind of default signature (len == 86), so the - # combination of get_video_info and the _static_decrypt_signature() decryption fallback will work here. - dash_manifest_url = video_info.get('dashmpd')[0] - - def decrypt_sig(mobj): - s = mobj.group(1) - dec_s = self._decrypt_signature(s, video_id, player_url, age_gate) - return '/signature/%s' % dec_s - dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url) - dash_doc = self._download_xml( - dash_manifest_url, video_id, - note='Downloading DASH manifest', - errnote='Could not download DASH manifest') - for r in dash_doc.findall('.//{urn:mpeg:DASH:schema:MPD:2011}Representation'): - url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL') - if url_el is None: - continue - format_id = r.attrib['id'] - video_url = url_el.text - filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength')) - f = { - 'format_id': format_id, - 'url': video_url, - 'width': int_or_none(r.attrib.get('width')), - 'tbr': int_or_none(r.attrib.get('bandwidth'), 1000), - 'asr': int_or_none(r.attrib.get('audioSamplingRate')), - 'filesize': filesize, - 'fps': int_or_none(r.attrib.get('frameRate')), - } - try: - existing_format = next( - fo for fo in formats - if fo['format_id'] == format_id) - except StopIteration: - f.update(self._formats.get(format_id, {})) - formats.append(f) - else: - existing_format.update(f) - - except (ExtractorError, KeyError) as e: - self.report_warning('Skipping DASH manifest: %r' % e, video_id) + dash_mpd = video_info.get('dashmpd') + if dash_mpd: + dash_manifest_url = dash_mpd[0] + try: + dash_formats = self._parse_dash_manifest( + video_id, dash_manifest_url, player_url, age_gate) + except (ExtractorError, KeyError) as e: + self.report_warning( + 'Skipping DASH manifest: %r' % e, video_id) + else: + formats.extend(dash_formats) self._sort_formats(formats) diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py index 9ff00e26c..74c76a9a0 100644 --- a/youtube_dl/extractor/zdf.py +++ b/youtube_dl/extractor/zdf.py @@ -1,12 +1,14 @@ # coding: utf-8 from __future__ import unicode_literals +import functools import re from .common import InfoExtractor from ..utils import ( int_or_none, unified_strdate, + OnDemandPagedList, ) @@ -87,7 +89,7 @@ def extract_from_xml_url(ie, video_id, xml_url): class ZDFIE(InfoExtractor): - _VALID_URL = r'^https?://www\.zdf\.de/ZDFmediathek(?P<hash>#)?/(.*beitrag/(?:video/)?)(?P<id>[0-9]+)(?:/[^/?]+)?(?:\?.*)?' + _VALID_URL = r'(?:zdf:|zdf:video:|https?://www\.zdf\.de/ZDFmediathek(?:#)?/(.*beitrag/(?:video/)?))(?P<id>[0-9]+)(?:/[^/?]+)?(?:\?.*)?' _TEST = { 'url': 'http://www.zdf.de/ZDFmediathek/beitrag/video/2037704/ZDFspezial---Ende-des-Machtpokers--?bc=sts;stt', @@ -106,6 +108,52 @@ class ZDFIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - xml_url = 'http://www.zdf.de/ZDFmediathek/xmlservice/web/beitragsDetails?ak=web&id=%s' % video_id return extract_from_xml_url(self, video_id, xml_url) + + +class ZDFChannelIE(InfoExtractor): + _VALID_URL = r'(?:zdf:topic:|https?://www\.zdf\.de/ZDFmediathek(?:#)?/.*kanaluebersicht/)(?P<id>[0-9]+)' + _TEST = { + 'url': 'http://www.zdf.de/ZDFmediathek#/kanaluebersicht/1586442/sendung/Titanic', + 'info_dict': { + 'id': '1586442', + }, + 'playlist_count': 4, + } + _PAGE_SIZE = 50 + + def _fetch_page(self, channel_id, page): + offset = page * self._PAGE_SIZE + xml_url = ( + 'http://www.zdf.de/ZDFmediathek/xmlservice/web/aktuellste?ak=web&offset=%d&maxLength=%d&id=%s' + % (offset, self._PAGE_SIZE, channel_id)) + doc = self._download_xml( + xml_url, channel_id, + note='Downloading channel info', + errnote='Failed to download channel info') + + title = doc.find('.//information/title').text + description = doc.find('.//information/detail').text + for asset in doc.findall('.//teasers/teaser'): + a_type = asset.find('./type').text + a_id = asset.find('./details/assetId').text + if a_type not in ('video', 'topic'): + continue + yield { + '_type': 'url', + 'playlist_title': title, + 'playlist_description': description, + 'url': 'zdf:%s:%s' % (a_type, a_id), + } + + def _real_extract(self, url): + channel_id = self._match_id(url) + entries = OnDemandPagedList( + functools.partial(self._fetch_page, channel_id), self._PAGE_SIZE) + + return { + '_type': 'playlist', + 'id': channel_id, + 'entries': entries, + } |