Diffstat (limited to 'youtube_dl/extractor')
51 files changed, 1234 insertions, 376 deletions
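The headline change in this batch is age-aware extractor filtering: __init__.py gains a list_extractors(age_limit) helper, and common.py adds the get_testcases() / is_suitable() methods it builds on (both hunks appear below). A minimal sketch of how the new pieces compose — the call site and the age value here are illustrative assumptions, not part of the diff:

    # Illustrative use of the helpers added in this changeset.
    # list_extractors() filters gen_extractors() through each IE's
    # is_suitable(), which checks the age_limit recorded in its test cases.
    from youtube_dl.extractor import list_extractors

    age_limit = 16  # assumed value, e.g. taken from an --age-limit option
    for ie in list_extractors(age_limit):
        print(ie.IE_NAME)  # results come back sorted by IE_NAME
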
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index e4c51f238..f544e87f1 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -71,6 +71,7 @@ from .cnn import ( from .collegehumor import CollegeHumorIE from .comedycentral import ComedyCentralIE, ComedyCentralShowsIE from .comcarcoff import ComCarCoffIE +from .commonmistakes import CommonMistakesIE from .condenast import CondeNastIE from .cracked import CrackedIE from .criterion import CriterionIE @@ -158,6 +159,7 @@ from .gametrailers import GametrailersIE from .gdcvault import GDCVaultIE from .generic import GenericIE from .giantbomb import GiantBombIE +from .giga import GigaIE from .glide import GlideIE from .globo import GloboIE from .godtube import GodTubeIE @@ -272,6 +274,7 @@ from .nbc import ( ) from .ndr import NDRIE from .ndtv import NDTVIE +from .netzkino import NetzkinoIE from .nerdcubed import NerdCubedFeedIE from .newgrounds import NewgroundsIE from .newstube import NewstubeIE @@ -324,6 +327,7 @@ from .prosiebensat1 import ProSiebenSat1IE from .pyvideo import PyvideoIE from .quickvid import QuickVidIE from .radiode import RadioDeIE +from .radiobremen import RadioBremenIE from .radiofrance import RadioFranceIE from .rai import RaiIE from .rbmaradio import RBMARadioIE @@ -344,6 +348,7 @@ from .ruhd import RUHDIE from .rutube import ( RutubeIE, RutubeChannelIE, + RutubeEmbedIE, RutubeMovieIE, RutubePersonIE, ) @@ -473,6 +478,7 @@ from .videott import VideoTtIE from .videoweed import VideoWeedIE from .vidme import VidmeIE from .vidzi import VidziIE +from .vier import VierIE, VierVideosIE from .vimeo import ( VimeoIE, VimeoAlbumIE, @@ -508,6 +514,7 @@ from .wdr import ( WDRMobileIE, WDRMausIE, ) +from .webofstories import WebOfStoriesIE from .weibo import WeiboIE from .wimp import WimpIE from .wistia import WistiaIE @@ -543,7 +550,7 @@ from .youtube import ( YoutubeSearchURLIE, YoutubeShowIE, YoutubeSubscriptionsIE, - YoutubeTopListIE, + YoutubeTruncatedIDIE, YoutubeTruncatedURLIE, YoutubeUserIE, YoutubeWatchLaterIE, @@ -569,6 +576,17 @@ def gen_extractors(): return [klass() for klass in _ALL_CLASSES] +def list_extractors(age_limit): + """ + Return a list of extractors that are suitable for the given age, + sorted by extractor ID. 
+ """ + + return sorted( + filter(lambda ie: ie.is_suitable(age_limit), gen_extractors()), + key=lambda ie: ie.IE_NAME.lower()) + + def get_info_extractor(ie_name): """Returns the info extractor class with the given ie_name""" return globals()[ie_name + 'IE'] diff --git a/youtube_dl/extractor/auengine.py b/youtube_dl/extractor/auengine.py index 014a21952..a1b666be0 100644 --- a/youtube_dl/extractor/auengine.py +++ b/youtube_dl/extractor/auengine.py @@ -7,6 +7,7 @@ from ..compat import compat_urllib_parse from ..utils import ( determine_ext, ExtractorError, + remove_end, ) @@ -27,23 +28,18 @@ class AUEngineIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - title = self._html_search_regex(r'<title>(?P<title>.+?)</title>', webpage, 'title') - title = title.strip() - links = re.findall(r'\s(?:file|url):\s*["\']([^\'"]+)["\']', webpage) - links = map(compat_urllib_parse.unquote, links) - - thumbnail = None - video_url = None - for link in links: - if link.endswith('.png'): - thumbnail = link - elif '/videos/' in link: - video_url = link + title = self._html_search_regex( + r'<title>\s*(?P<title>.+?)\s*</title>', webpage, 'title') + video_urls = re.findall(r'http://\w+.auengine.com/vod/.*[^\W]', webpage) + video_url = compat_urllib_parse.unquote(video_urls[0]) + thumbnails = re.findall(r'http://\w+.auengine.com/thumb/.*[^\W]', webpage) + thumbnail = compat_urllib_parse.unquote(thumbnails[0]) + if not video_url: raise ExtractorError('Could not find video URL') + ext = '.' + determine_ext(video_url) - if ext == title[-len(ext):]: - title = title[:-len(ext)] + title = remove_end(title, ext) return { 'id': video_id, diff --git a/youtube_dl/extractor/bbccouk.py b/youtube_dl/extractor/bbccouk.py index f690dc803..1cf48fe0d 100644 --- a/youtube_dl/extractor/bbccouk.py +++ b/youtube_dl/extractor/bbccouk.py @@ -10,7 +10,7 @@ from ..compat import compat_HTTPError class BBCCoUkIE(SubtitlesInfoExtractor): IE_NAME = 'bbc.co.uk' IE_DESC = 'BBC iPlayer' - _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:programmes|iplayer/episode)/(?P<id>[\da-z]{8})' + _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:(?:(?:programmes|iplayer/(?:episode|playlist))/)|music/clips[/#])(?P<id>[\da-z]{8})' _TESTS = [ { @@ -18,8 +18,8 @@ class BBCCoUkIE(SubtitlesInfoExtractor): 'info_dict': { 'id': 'b039d07m', 'ext': 'flv', - 'title': 'Kaleidoscope: Leonard Cohen', - 'description': 'md5:db4755d7a665ae72343779f7dacb402c', + 'title': 'Kaleidoscope, Leonard Cohen', + 'description': 'The Canadian poet and songwriter reflects on his musical career.', 'duration': 1740, }, 'params': { @@ -84,6 +84,40 @@ class BBCCoUkIE(SubtitlesInfoExtractor): # rtmp download 'skip_download': True, } + }, { + 'url': 'http://www.bbc.co.uk/music/clips/p02frcc3', + 'note': 'Audio', + 'info_dict': { + 'id': 'p02frcch', + 'ext': 'flv', + 'title': 'Pete Tong, Past, Present and Future Special, Madeon - After Hours mix', + 'description': 'French house superstar Madeon takes us out of the club and onto the after party.', + 'duration': 3507, + }, + 'params': { + # rtmp download + 'skip_download': True, + } + }, { + 'url': 'http://www.bbc.co.uk/music/clips/p025c0zz', + 'note': 'Video', + 'info_dict': { + 'id': 'p025c103', + 'ext': 'flv', + 'title': 'Reading and Leeds Festival, 2014, Rae Morris - Closer (Live on BBC Three)', + 'description': 'Rae Morris performs Closer for BBC Three at Reading 2014', + 'duration': 226, + }, + 'params': { + # rtmp download + 'skip_download': True, + } + }, { + 'url': 
'http://www.bbc.co.uk/iplayer/playlist/p01dvks4', + 'only_matching': True, + }, { + 'url': 'http://www.bbc.co.uk/music/clips#p02frcc3', + 'only_matching': True, } ] @@ -241,8 +275,8 @@ class BBCCoUkIE(SubtitlesInfoExtractor): # fallback to legacy playlist playlist = self._download_xml( - 'http://www.bbc.co.uk/iplayer/playlist/%s' % playlist_id, - playlist_id, 'Downloading legacy playlist XML') + 'http://www.bbc.co.uk/iplayer/playlist/%s' % playlist_id, + playlist_id, 'Downloading legacy playlist XML') no_items = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}noItems') if no_items is not None: diff --git a/youtube_dl/extractor/bet.py b/youtube_dl/extractor/bet.py index 003e50002..d2abd4d77 100644 --- a/youtube_dl/extractor/bet.py +++ b/youtube_dl/extractor/bet.py @@ -16,7 +16,7 @@ class BetIE(InfoExtractor): { 'url': 'http://www.bet.com/news/politics/2014/12/08/in-bet-exclusive-obama-talks-race-and-racism.html', 'info_dict': { - 'id': '417cd61c-c793-4e8e-b006-e445ecc45add', + 'id': '740ab250-bb94-4a8a-8787-fe0de7c74471', 'display_id': 'in-bet-exclusive-obama-talks-race-and-racism', 'ext': 'flv', 'title': 'BET News Presents: A Conversation With President Obama', @@ -35,7 +35,7 @@ class BetIE(InfoExtractor): { 'url': 'http://www.bet.com/video/news/national/2014/justice-for-ferguson-a-community-reacts.html', 'info_dict': { - 'id': '4160e53b-ad41-43b1-980f-8d85f63121f4', + 'id': 'bcd1b1df-673a-42cf-8d01-b282db608f2d', 'display_id': 'justice-for-ferguson-a-community-reacts', 'ext': 'flv', 'title': 'Justice for Ferguson: A Community Reacts', @@ -55,7 +55,6 @@ class BetIE(InfoExtractor): def _real_extract(self, url): display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) media_url = compat_urllib_parse.unquote(self._search_regex( diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index 241b904a9..75d744852 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -4,9 +4,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_parse_qs from ..utils import ( - ExtractorError, int_or_none, unified_strdate, ) @@ -54,45 +52,38 @@ class BiliBiliIE(InfoExtractor): thumbnail = self._html_search_meta( 'thumbnailUrl', video_code, 'thumbnail', fatal=False) - player_params = compat_parse_qs(self._html_search_regex( - r'<iframe .*?class="player" src="https://secure\.bilibili\.(?:tv|com)/secure,([^"]+)"', - webpage, 'player params')) + cid = self._search_regex(r'cid=(\d+)', webpage, 'cid') - if 'cid' in player_params: - cid = player_params['cid'][0] + lq_doc = self._download_xml( + 'http://interface.bilibili.com/v_cdn_play?appkey=1&cid=%s' % cid, + video_id, + note='Downloading LQ video info' + ) + lq_durl = lq_doc.find('./durl') + formats = [{ + 'format_id': 'lq', + 'quality': 1, + 'url': lq_durl.find('./url').text, + 'filesize': int_or_none( + lq_durl.find('./size'), get_attr='text'), + }] - lq_doc = self._download_xml( - 'http://interface.bilibili.cn/v_cdn_play?cid=%s' % cid, - video_id, - note='Downloading LQ video info' - ) - lq_durl = lq_doc.find('.//durl') - formats = [{ - 'format_id': 'lq', - 'quality': 1, - 'url': lq_durl.find('./url').text, + hq_doc = self._download_xml( + 'http://interface.bilibili.com/playurl?appkey=1&cid=%s' % cid, + video_id, + note='Downloading HQ video info', + fatal=False, + ) + if hq_doc is not False: + hq_durl = hq_doc.find('./durl') + formats.append({ + 'format_id': 'hq', + 'quality': 2, + 'ext': 'flv', + 'url': 
hq_durl.find('./url').text, 'filesize': int_or_none( - lq_durl.find('./size'), get_attr='text'), - }] - - hq_doc = self._download_xml( - 'http://interface.bilibili.cn/playurl?cid=%s' % cid, - video_id, - note='Downloading HQ video info', - fatal=False, - ) - if hq_doc is not False: - hq_durl = hq_doc.find('.//durl') - formats.append({ - 'format_id': 'hq', - 'quality': 2, - 'ext': 'flv', - 'url': hq_durl.find('./url').text, - 'filesize': int_or_none( - hq_durl.find('./size'), get_attr='text'), - }) - else: - raise ExtractorError('Unsupported player parameters: %r' % (player_params,)) + hq_durl.find('./size'), get_attr='text'), + }) self._sort_formats(formats) return { diff --git a/youtube_dl/extractor/buzzfeed.py b/youtube_dl/extractor/buzzfeed.py index a40a1bbc4..a5d2af174 100644 --- a/youtube_dl/extractor/buzzfeed.py +++ b/youtube_dl/extractor/buzzfeed.py @@ -33,7 +33,7 @@ class BuzzFeedIE(InfoExtractor): 'skip_download': True, # Got enough YouTube download tests }, 'info_dict': { - 'description': 'Munchkin the Teddy Bear is back !', + 'description': 're:Munchkin the Teddy Bear is back ?!', 'title': 'You Need To Stop What You\'re Doing And Watching This Dog Walk On A Treadmill', }, 'playlist': [{ @@ -42,9 +42,9 @@ class BuzzFeedIE(InfoExtractor): 'ext': 'mp4', 'upload_date': '20141124', 'uploader_id': 'CindysMunchkin', - 'description': '© 2014 Munchkin the Shih Tzu\nAll rights reserved\nFacebook: http://facebook.com/MunchkintheShihTzu', + 'description': 're:© 2014 Munchkin the Shih Tzu', 'uploader': 'Munchkin the Shih Tzu', - 'title': 'Munchkin the Teddy Bear gets her exercise', + 'title': 're:Munchkin the Teddy Bear gets her exercise', }, }] }] diff --git a/youtube_dl/extractor/canalplus.py b/youtube_dl/extractor/canalplus.py index 9873728df..11d18d74a 100644 --- a/youtube_dl/extractor/canalplus.py +++ b/youtube_dl/extractor/canalplus.py @@ -5,6 +5,8 @@ import re from .common import InfoExtractor from ..utils import ( + ExtractorError, + HEADRequest, unified_strdate, url_basename, qualities, @@ -76,6 +78,16 @@ class CanalplusIE(InfoExtractor): preference = qualities(['MOBILE', 'BAS_DEBIT', 'HAUT_DEBIT', 'HD', 'HLS', 'HDS']) + fmt_url = next(iter(media.find('VIDEOS'))).text + if '/geo' in fmt_url.lower(): + response = self._request_webpage( + HEADRequest(fmt_url), video_id, + 'Checking if the video is georestricted') + if '/blocage' in response.geturl(): + raise ExtractorError( + 'The video is not available in your country', + expected=True) + formats = [] for fmt in media.find('VIDEOS'): format_url = fmt.text diff --git a/youtube_dl/extractor/ceskatelevize.py b/youtube_dl/extractor/ceskatelevize.py index 2f866f3ef..f70e090bb 100644 --- a/youtube_dl/extractor/ceskatelevize.py +++ b/youtube_dl/extractor/ceskatelevize.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals import re -from .common import InfoExtractor +from .subtitles import SubtitlesInfoExtractor from ..compat import ( compat_urllib_request, compat_urllib_parse, @@ -11,49 +11,42 @@ from ..compat import ( ) from ..utils import ( ExtractorError, + float_or_none, ) -class CeskaTelevizeIE(InfoExtractor): +class CeskaTelevizeIE(SubtitlesInfoExtractor): _VALID_URL = r'https?://www\.ceskatelevize\.cz/(porady|ivysilani)/(.+/)?(?P<id>[^?#]+)' _TESTS = [ { - 'url': 'http://www.ceskatelevize.cz/ivysilani/10532695142-prvni-republika/213512120230004-spanelska-chripka', + 'url': 'http://www.ceskatelevize.cz/ivysilani/ivysilani/10441294653-hyde-park-civilizace/214411058091220', 'info_dict': { - 'id': '213512120230004', - 'ext': 'flv', 
- 'title': 'První republika: Španělská chřipka', - 'duration': 3107.4, + 'id': '214411058091220', + 'ext': 'mp4', + 'title': 'Hyde Park Civilizace', + 'description': 'Věda a současná civilizace. Interaktivní pořad - prostor pro vaše otázky a komentáře', + 'thumbnail': 're:^https?://.*\.jpg', + 'duration': 3350, }, 'params': { - 'skip_download': True, # requires rtmpdump + # m3u8 download + 'skip_download': True, }, - 'skip': 'Works only from Czech Republic.', - }, - { - 'url': 'http://www.ceskatelevize.cz/ivysilani/1030584952-tsatsiki-maminka-a-policajt', - 'info_dict': { - 'id': '20138143440', - 'ext': 'flv', - 'title': 'Tsatsiki, maminka a policajt', - 'duration': 6754.1, - }, - 'params': { - 'skip_download': True, # requires rtmpdump - }, - 'skip': 'Works only from Czech Republic.', }, { 'url': 'http://www.ceskatelevize.cz/ivysilani/10532695142-prvni-republika/bonus/14716-zpevacka-z-duparny-bobina', 'info_dict': { 'id': '14716', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'První republika: Zpěvačka z Dupárny Bobina', - 'duration': 90, + 'description': 'Sága mapující atmosféru první republiky od r. 1918 do r. 1945.', + 'thumbnail': 're:^https?://.*\.jpg', + 'duration': 88.4, }, 'params': { - 'skip_download': True, # requires rtmpdump + # m3u8 download + 'skip_download': True, }, }, ] @@ -80,8 +73,9 @@ class CeskaTelevizeIE(InfoExtractor): 'requestSource': 'iVysilani', } - req = compat_urllib_request.Request('http://www.ceskatelevize.cz/ivysilani/ajax/get-playlist-url', - data=compat_urllib_parse.urlencode(data)) + req = compat_urllib_request.Request( + 'http://www.ceskatelevize.cz/ivysilani/ajax/get-client-playlist', + data=compat_urllib_parse.urlencode(data)) req.add_header('Content-type', 'application/x-www-form-urlencoded') req.add_header('x-addr', '127.0.0.1') @@ -90,39 +84,72 @@ class CeskaTelevizeIE(InfoExtractor): playlistpage = self._download_json(req, video_id) - req = compat_urllib_request.Request(compat_urllib_parse.unquote(playlistpage['url'])) + playlist_url = playlistpage['url'] + if playlist_url == 'error_region': + raise ExtractorError(NOT_AVAILABLE_STRING, expected=True) + + req = compat_urllib_request.Request(compat_urllib_parse.unquote(playlist_url)) req.add_header('Referer', url) - playlist = self._download_xml(req, video_id) + playlist = self._download_json(req, video_id) + item = playlist['playlist'][0] formats = [] - for i in playlist.find('smilRoot/body'): - if 'AD' not in i.attrib['id']: - base_url = i.attrib['base'] - parsedurl = compat_urllib_parse_urlparse(base_url) - duration = i.attrib['duration'] - - for video in i.findall('video'): - if video.attrib['label'] != 'AD': - format_id = video.attrib['label'] - play_path = video.attrib['src'] - vbr = int(video.attrib['system-bitrate']) - - formats.append({ - 'format_id': format_id, - 'url': base_url, - 'vbr': vbr, - 'play_path': play_path, - 'app': parsedurl.path[1:] + '?' 
+ parsedurl.query, - 'rtmp_live': True, - 'ext': 'flv', - }) - + for format_id, stream_url in item['streamUrls'].items(): + formats.extend(self._extract_m3u8_formats(stream_url, video_id, 'mp4')) self._sort_formats(formats) + title = self._og_search_title(webpage) + description = self._og_search_description(webpage) + duration = float_or_none(item.get('duration')) + thumbnail = item.get('previewImageUrl') + + subtitles = {} + subs = item.get('subtitles') + if subs: + subtitles['cs'] = subs[0]['url'] + + if self._downloader.params.get('listsubtitles', False): + self._list_available_subtitles(video_id, subtitles) + return + + subtitles = self._fix_subtitles(self.extract_subtitles(video_id, subtitles)) + return { 'id': episode_id, - 'title': self._html_search_regex(r'<title>(.+?) — iVysílání — Česká televize</title>', webpage, 'title'), - 'duration': float(duration), + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, 'formats': formats, + 'subtitles': subtitles, } + + @staticmethod + def _fix_subtitles(subtitles): + """ Convert millisecond-based subtitles to SRT """ + if subtitles is None: + return subtitles # subtitles not requested + + def _msectotimecode(msec): + """ Helper utility to convert milliseconds to timecode """ + components = [] + for divider in [1000, 60, 60, 100]: + components.append(msec % divider) + msec //= divider + return "{3:02}:{2:02}:{1:02},{0:03}".format(*components) + + def _fix_subtitle(subtitle): + for line in subtitle.splitlines(): + m = re.match(r"^\s*([0-9]+);\s*([0-9]+)\s+([0-9]+)\s*$", line) + if m: + yield m.group(1) + start, stop = (_msectotimecode(int(t)) for t in m.groups()[1:]) + yield "{0} --> {1}".format(start, stop) + else: + yield line + + fixed_subtitles = {} + for k, v in subtitles.items(): + fixed_subtitles[k] = "\r\n".join(_fix_subtitle(v)) + return fixed_subtitles diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 6e264f687..b4cd59e43 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -21,6 +21,7 @@ from ..compat import ( compat_str, ) from ..utils import ( + age_restricted, clean_html, compiled_regex_type, ExtractorError, @@ -92,6 +93,8 @@ class InfoExtractor(object): by this field, regardless of all other values. -1 for default (order by other properties), -2 or smaller for less than default. + < -1000 to hide the format (if there is + another one which is strictly better) * language_preference Is this in the correct requested language? 10 if it's what the URL is about, @@ -144,6 +147,17 @@ class InfoExtractor(object): like_count: Number of positive ratings of the video dislike_count: Number of negative ratings of the video comment_count: Number of comments on the video + comments: A list of comments, each with one or more of the following + properties (all but one of text or html optional): + * "author" - human-readable name of the comment author + * "author_id" - user ID of the comment author + * "id" - Comment ID + * "html" - Comment as HTML + * "text" - Plain text of the comment + * "timestamp" - UNIX timestamp of comment + * "parent" - ID of the comment this one is replying to. + Set to "root" to indicate that this is a + comment to the original video. age_limit: Age restriction for the video, as an integer (years) webpage_url: The url to the video webpage, if given to youtube-dl it should allow to get the same result again. 
(It will be set @@ -591,7 +605,7 @@ class InfoExtractor(object): return self._html_search_regex( r'''(?isx)<meta (?=[^>]+(?:itemprop|name|property)=(["\']?)%s\1) - [^>]+content=(["\'])(?P<content>.*?)\1''' % re.escape(name), + [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(name), html, display_name, fatal=fatal, group='content', **kwargs) def _dc_search_uploader(self, html): @@ -875,6 +889,35 @@ class InfoExtractor(object): None, '/', True, False, expire_time, '', None, None, None) self._downloader.cookiejar.set_cookie(cookie) + def get_testcases(self, include_onlymatching=False): + t = getattr(self, '_TEST', None) + if t: + assert not hasattr(self, '_TESTS'), \ + '%s has _TEST and _TESTS' % type(self).__name__ + tests = [t] + else: + tests = getattr(self, '_TESTS', []) + for t in tests: + if not include_onlymatching and t.get('only_matching', False): + continue + t['name'] = type(self).__name__[:-len('IE')] + yield t + + def is_suitable(self, age_limit): + """ Test whether the extractor is generally suitable for the given + age limit (i.e. pornographic sites are not, all others usually are) """ + + any_restricted = False + for tc in self.get_testcases(include_onlymatching=False): + if 'playlist' in tc: + tc = tc['playlist'][0] + is_restricted = age_restricted( + tc.get('info_dict', {}).get('age_limit'), age_limit) + if not is_restricted: + return True + any_restricted = any_restricted or is_restricted + return not any_restricted + class SearchInfoExtractor(InfoExtractor): """ diff --git a/youtube_dl/extractor/commonmistakes.py b/youtube_dl/extractor/commonmistakes.py new file mode 100644 index 000000000..75c06903f --- /dev/null +++ b/youtube_dl/extractor/commonmistakes.py @@ -0,0 +1,29 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ExtractorError + + +class CommonMistakesIE(InfoExtractor): + IE_DESC = False # Do not list + _VALID_URL = r'''(?x) + (?:url|URL) + ''' + + _TESTS = [{ + 'url': 'url', + 'only_matching': True, + }, { + 'url': 'URL', + 'only_matching': True, + }] + + def _real_extract(self, url): + msg = ( + 'You\'ve asked youtube-dl to download the URL "%s". ' + 'That doesn\'t make any sense. ' + 'Simply remove the parameter in your command or configuration.' + ) % url + if self._downloader.params.get('verbose'): + msg += ' Add -v to the command line to see what arguments and configuration youtube-dl got.' 
+ raise ExtractorError(msg, expected=True) diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 354046a9e..1680f532f 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -228,7 +228,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text video_thumbnail = self._search_regex(r'<episode_image_url>([^<]+)', playerdata, 'thumbnail', fatal=False) formats = [] - for fmt in re.findall(r'\?p([0-9]{3,4})=1', webpage): + for fmt in re.findall(r'showmedia\.([0-9]{3,4})p', webpage): stream_quality, stream_format = self._FORMAT_IDS[fmt] video_format = fmt + 'p' streamdata_req = compat_urllib_request.Request('http://www.crunchyroll.com/xml/') diff --git a/youtube_dl/extractor/discovery.py b/youtube_dl/extractor/discovery.py index 52c2d7ddf..d3e667528 100644 --- a/youtube_dl/extractor/discovery.py +++ b/youtube_dl/extractor/discovery.py @@ -1,47 +1,45 @@ from __future__ import unicode_literals -import re -import json - from .common import InfoExtractor +from ..utils import ( + parse_iso8601, + int_or_none, +) class DiscoveryIE(InfoExtractor): - _VALID_URL = r'http://www\.discovery\.com\/[a-zA-Z0-9\-]*/[a-zA-Z0-9\-]*/videos/(?P<id>[a-zA-Z0-9\-]*)(.htm)?' + _VALID_URL = r'http://www\.discovery\.com\/[a-zA-Z0-9\-]*/[a-zA-Z0-9\-]*/videos/(?P<id>[a-zA-Z0-9_\-]*)(?:\.htm)?' _TEST = { 'url': 'http://www.discovery.com/tv-shows/mythbusters/videos/mission-impossible-outtakes.htm', - 'md5': 'e12614f9ee303a6ccef415cb0793eba2', + 'md5': '3c69d77d9b0d82bfd5e5932a60f26504', 'info_dict': { - 'id': '614784', - 'ext': 'mp4', - 'title': 'MythBusters: Mission Impossible Outtakes', + 'id': 'mission-impossible-outtakes', + 'ext': 'flv', + 'title': 'Mission Impossible Outtakes', 'description': ('Watch Jamie Hyneman and Adam Savage practice being' ' each other -- to the point of confusing Jamie\'s dog -- and ' 'don\'t miss Adam moon-walking as Jamie ... 
behind Jamie\'s' ' back.'), 'duration': 156, + 'timestamp': 1303099200, + 'upload_date': '20110418', }, } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - video_list_json = self._search_regex(r'var videoListJSON = ({.*?});', - webpage, 'video list', flags=re.DOTALL) - video_list = json.loads(video_list_json) - info = video_list['clips'][0] - formats = [] - for f in info['mp4']: - formats.append( - {'url': f['src'], 'ext': 'mp4', 'tbr': int(f['bitrate'][:-1])}) + info = self._parse_json(self._search_regex( + r'(?s)<script type="application/ld\+json">(.*?)</script>', + webpage, 'video info'), video_id) return { - 'id': info['contentId'], - 'title': video_list['name'], - 'formats': formats, - 'description': info['videoCaption'], - 'thumbnail': info.get('videoStillURL') or info.get('thumbnailURL'), - 'duration': info['duration'], + 'id': video_id, + 'title': info['name'], + 'url': info['contentURL'], + 'description': info.get('description'), + 'thumbnail': info.get('thumbnailUrl'), + 'timestamp': parse_iso8601(info.get('uploadDate')), + 'duration': int_or_none(info.get('duration')), } diff --git a/youtube_dl/extractor/ellentv.py b/youtube_dl/extractor/ellentv.py index 3e7923648..fc92ff825 100644 --- a/youtube_dl/extractor/ellentv.py +++ b/youtube_dl/extractor/ellentv.py @@ -1,7 +1,6 @@ # coding: utf-8 from __future__ import unicode_literals -import re import json from .common import InfoExtractor @@ -12,32 +11,49 @@ from ..utils import ( class EllenTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?ellentv\.com/videos/(?P<id>[a-z0-9_-]+)' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?(?:ellentv|ellentube)\.com/videos/(?P<id>[a-z0-9_-]+)' + _TESTS = [{ 'url': 'http://www.ellentv.com/videos/0-7jqrsr18/', 'md5': 'e4af06f3bf0d5f471921a18db5764642', 'info_dict': { 'id': '0-7jqrsr18', 'ext': 'mp4', 'title': 'What\'s Wrong with These Photos? 
A Whole Lot', + 'description': 'md5:35f152dc66b587cf13e6d2cf4fa467f6', 'timestamp': 1406876400, 'upload_date': '20140801', } - } + }, { + 'url': 'http://ellentube.com/videos/0-dvzmabd5/', + 'md5': '98238118eaa2bbdf6ad7f708e3e4f4eb', + 'info_dict': { + 'id': '0-dvzmabd5', + 'ext': 'mp4', + 'title': '1 year old twin sister makes her brother laugh', + 'description': '1 year old twin sister makes her brother laugh', + 'timestamp': 1419542075, + 'upload_date': '20141225', + } + }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) + video_url = self._html_search_meta('VideoURL', webpage, 'url') + title = self._og_search_title(webpage, default=None) or self._search_regex( + r'pageName\s*=\s*"([^"]+)"', webpage, 'title') + description = self._html_search_meta( + 'description', webpage, 'description') or self._og_search_description(webpage) timestamp = parse_iso8601(self._search_regex( r'<span class="publish-date"><time datetime="([^"]+)">', webpage, 'timestamp')) return { 'id': video_id, - 'title': self._og_search_title(webpage), - 'url': self._html_search_meta('VideoURL', webpage, 'url'), + 'url': video_url, + 'title': title, + 'description': description, 'timestamp': timestamp, } @@ -55,8 +71,7 @@ class EllenTVClipsIE(InfoExtractor): } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - playlist_id = mobj.group('id') + playlist_id = self._match_id(url) webpage = self._download_webpage(url, playlist_id) playlist = self._extract_playlist(webpage) diff --git a/youtube_dl/extractor/elpais.py b/youtube_dl/extractor/elpais.py index 4277202a2..00a69e631 100644 --- a/youtube_dl/extractor/elpais.py +++ b/youtube_dl/extractor/elpais.py @@ -1,8 +1,6 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..utils import unified_strdate @@ -24,9 +22,7 @@ class ElPaisIE(InfoExtractor): } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) prefix = self._html_search_regex( diff --git a/youtube_dl/extractor/fktv.py b/youtube_dl/extractor/fktv.py index d09d1c13a..190d9f9ad 100644 --- a/youtube_dl/extractor/fktv.py +++ b/youtube_dl/extractor/fktv.py @@ -13,7 +13,7 @@ from ..utils import ( class FKTVIE(InfoExtractor): IE_NAME = 'fernsehkritik.tv' - _VALID_URL = r'http://(?:www\.)?fernsehkritik\.tv/folge-(?P<ep>[0-9]+)(?:/.*)?' + _VALID_URL = r'http://(?:www\.)?fernsehkritik\.tv/folge-(?P<id>[0-9]+)(?:/.*)?' 
_TEST = { 'url': 'http://fernsehkritik.tv/folge-1', @@ -26,29 +26,32 @@ class FKTVIE(InfoExtractor): } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - episode = int(mobj.group('ep')) + episode = int(self._match_id(url)) - server = random.randint(2, 4) - video_thumbnail = 'http://fernsehkritik.tv/images/magazin/folge%d.jpg' % episode - start_webpage = self._download_webpage('http://fernsehkritik.tv/folge-%d/Start' % episode, + video_thumbnail = 'http://fernsehkritik.tv/images/magazin/folge%s.jpg' % episode + start_webpage = self._download_webpage('http://fernsehkritik.tv/folge-%s/Start' % episode, episode) playlist = self._search_regex(r'playlist = (\[.*?\]);', start_webpage, 'playlist', flags=re.DOTALL) files = json.loads(re.sub('{[^{}]*?}', '{}', playlist)) - # TODO: return a single multipart video + videos = [] for i, _ in enumerate(files, 1): video_id = '%04d%d' % (episode, i) - video_url = 'http://dl%d.fernsehkritik.tv/fernsehkritik%d%s.flv' % (server, episode, '' if i == 1 else '-%d' % i) + video_url = 'http://fernsehkritik.tv/js/directme.php?file=%s%s.flv' % (episode, '' if i == 1 else '-%d' % i) videos.append({ + 'ext': 'flv', 'id': video_id, 'url': video_url, 'title': clean_html(get_element_by_id('eptitle', start_webpage)), 'description': clean_html(get_element_by_id('contentlist', start_webpage)), 'thumbnail': video_thumbnail }) - return videos + return { + '_type': 'multi_video', + 'entries': videos, + 'id': 'folge-%s' % episode, + } class FKTVPosteckeIE(InfoExtractor): diff --git a/youtube_dl/extractor/gameone.py b/youtube_dl/extractor/gameone.py index 75f180928..a07d69841 100644 --- a/youtube_dl/extractor/gameone.py +++ b/youtube_dl/extractor/gameone.py @@ -57,8 +57,7 @@ class GameOneIE(InfoExtractor): ] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) og_video = self._og_search_video_url(webpage, secure=False) diff --git a/youtube_dl/extractor/gdcvault.py b/youtube_dl/extractor/gdcvault.py index d453ec010..fed968f51 100644 --- a/youtube_dl/extractor/gdcvault.py +++ b/youtube_dl/extractor/gdcvault.py @@ -39,7 +39,8 @@ class GDCVaultIE(InfoExtractor): 'id': '1015301', 'ext': 'flv', 'title': 'Thexder Meets Windows 95, or Writing Great Games in the Windows 95 Environment', - } + }, + 'skip': 'Requires login', } ] diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 40b2791c7..7a5bf9392 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -131,12 +131,13 @@ class GenericIE(InfoExtractor): # ooyala video { 'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219', - 'md5': '5644c6ca5d5782c1d0d350dad9bd840c', + 'md5': '166dd577b433b4d4ebfee10b0824d8ff', 'info_dict': { 'id': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ', 'ext': 'mp4', 'title': '2cc213299525360.mov', # that's what we get }, + 'add_ie': ['Ooyala'], }, # google redirect { @@ -146,7 +147,7 @@ class GenericIE(InfoExtractor): 'ext': 'mp4', 'upload_date': '20130224', 'uploader_id': 'TheVerge', - 'description': 'Chris Ziegler takes a look at the Alcatel OneTouch Fire and the ZTE Open; two of the first Firefox OS handsets to be officially announced.', + 'description': 're:^Chris Ziegler takes a look at the\.*', 'uploader': 'The Verge', 'title': 'First Firefox OS phones side-by-side', }, @@ -181,6 +182,14 @@ class GenericIE(InfoExtractor): 'description': 'Episode 
18: President Barack Obama sits down with Zach Galifianakis for his most memorable interview yet.', }, }, + # BBC iPlayer embeds + { + 'url': 'http://www.bbc.co.uk/blogs/adamcurtis/posts/BUGGER', + 'info_dict': { + 'title': 'BBC - Blogs - Adam Curtis - BUGGER', + }, + 'playlist_mincount': 18, + }, # RUTV embed { 'url': 'http://www.rg.ru/2014/03/15/reg-dfo/anklav-anons.html', @@ -699,9 +708,9 @@ class GenericIE(InfoExtractor): r'^(?:https?://)?([^/]*)/.*', url, 'video uploader') # Helper method - def _playlist_from_matches(matches, getter, ie=None): + def _playlist_from_matches(matches, getter=None, ie=None): urlrs = orderedSet( - self.url_result(self._proto_relative_url(getter(m)), ie) + self.url_result(self._proto_relative_url(getter(m) if getter else m), ie) for m in matches) return self.playlist_result( urlrs, playlist_id=video_id, playlist_title=video_title) @@ -905,6 +914,11 @@ class GenericIE(InfoExtractor): return _playlist_from_matches( matches, getter=unescapeHTML, ie='FunnyOrDie') + # Look for BBC iPlayer embed + matches = re.findall(r'setPlaylist\("(https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)', webpage) + if matches: + return _playlist_from_matches(matches, ie='BBCCoUk') + # Look for embedded RUTV player rutv_url = RUTVIE._extract_url(webpage) if rutv_url: @@ -912,7 +926,7 @@ class GenericIE(InfoExtractor): # Look for embedded TED player mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>http://embed\.ted\.com/.+?)\1', webpage) + r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed(?:-ssl)?\.ted\.com/.+?)\1', webpage) if mobj is not None: return self.url_result(mobj.group('url'), 'TED') diff --git a/youtube_dl/extractor/giga.py b/youtube_dl/extractor/giga.py new file mode 100644 index 000000000..775890112 --- /dev/null +++ b/youtube_dl/extractor/giga.py @@ -0,0 +1,101 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import itertools + +from .common import InfoExtractor +from ..utils import ( + qualities, + compat_str, + parse_duration, + parse_iso8601, + str_to_int, +) + + +class GigaIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?giga\.de/(?:[^/]+/)*(?P<id>[^/]+)' + _TESTS = [{ + 'url': 'http://www.giga.de/filme/anime-awesome/trailer/anime-awesome-chihiros-reise-ins-zauberland-das-beste-kommt-zum-schluss/', + 'md5': '6bc5535e945e724640664632055a584f', + 'info_dict': { + 'id': '2622086', + 'display_id': 'anime-awesome-chihiros-reise-ins-zauberland-das-beste-kommt-zum-schluss', + 'ext': 'mp4', + 'title': 'Anime Awesome: Chihiros Reise ins Zauberland – Das Beste kommt zum Schluss', + 'description': 'md5:afdf5862241aded4718a30dff6a57baf', + 'thumbnail': 're:^https?://.*\.jpg$', + 'duration': 578, + 'timestamp': 1414749706, + 'upload_date': '20141031', + 'uploader': 'Robin Schweiger', + 'view_count': int, + }, + }, { + 'url': 'http://www.giga.de/games/channel/giga-top-montag/giga-topmontag-die-besten-serien-2014/', + 'only_matching': True, + }, { + 'url': 'http://www.giga.de/extra/netzkultur/videos/giga-games-tom-mats-robin-werden-eigene-wege-gehen-eine-ankuendigung/', + 'only_matching': True, + }, { + 'url': 'http://www.giga.de/tv/jonas-liest-spieletitel-eingedeutscht-episode-2/', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + video_id = self._search_regex( + [r'data-video-id="(\d+)"', r'/api/video/jwplayer/#v=(\d+)'], + webpage, 'video id') + + playlist = self._download_json( + 
'http://www.giga.de/api/syndication/video/video_id/%s/playlist.json?content=syndication/key/368b5f151da4ae05ced7fa296bdff65a/' + % video_id, video_id)[0] + + quality = qualities(['normal', 'hd720']) + + formats = [] + for format_id in itertools.count(0): + fmt = playlist.get(compat_str(format_id)) + if not fmt: + break + formats.append({ + 'url': fmt['src'], + 'format_id': '%s-%s' % (fmt['quality'], fmt['type'].split('/')[-1]), + 'quality': quality(fmt['quality']), + }) + self._sort_formats(formats) + + title = self._html_search_meta( + 'title', webpage, 'title', fatal=True) + description = self._html_search_meta( + 'description', webpage, 'description') + thumbnail = self._og_search_thumbnail(webpage) + + duration = parse_duration(self._search_regex( + r'(?s)(?:data-video-id="{0}"|data-video="[^"]*/api/video/jwplayer/#v={0}[^"]*")[^>]*>.+?<span class="duration">([^<]+)</span>'.format(video_id), + webpage, 'duration', fatal=False)) + + timestamp = parse_iso8601(self._search_regex( + r'datetime="([^"]+)"', webpage, 'upload date', fatal=False)) + uploader = self._search_regex( + r'class="author">([^<]+)</a>', webpage, 'uploader', fatal=False) + + view_count = str_to_int(self._search_regex( + r'<span class="views"><strong>([\d.]+)</strong>', webpage, 'view count', fatal=False)) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'timestamp': timestamp, + 'uploader': uploader, + 'view_count': view_count, + 'formats': formats, + } diff --git a/youtube_dl/extractor/huffpost.py b/youtube_dl/extractor/huffpost.py index 4ccf6b9b8..a38eae421 100644 --- a/youtube_dl/extractor/huffpost.py +++ b/youtube_dl/extractor/huffpost.py @@ -39,8 +39,9 @@ class HuffPostIE(InfoExtractor): data = self._download_json(api_url, video_id)['data'] video_title = data['title'] - duration = parse_duration(data['running_time']) - upload_date = unified_strdate(data['schedule']['starts_at']) + duration = parse_duration(data.get('running_time')) + upload_date = unified_strdate( + data.get('schedule', {}).get('starts_at') or data.get('segment_start_date_time')) description = data.get('description') thumbnails = [] @@ -59,16 +60,11 @@ class HuffPostIE(InfoExtractor): 'ext': 'mp4', 'url': url, 'vcodec': 'none' if key.startswith('audio/') else None, - } for key, url in data['sources']['live'].items()] - if data.get('fivemin_id'): - fid = data['fivemin_id'] - fcat = str(int(fid) // 100 + 1) - furl = 'http://avideos.5min.com/2/' + fcat[-3:] + '/' + fcat + '/' + fid + '.mp4' - formats.append({ - 'format': 'fivemin', - 'url': furl, - 'preference': 1, - }) + } for key, url in data.get('sources', {}).get('live', {}).items()] + + if not formats and data.get('fivemin_id'): + return self.url_result('5min:%s' % data['fivemin_id']) + self._sort_formats(formats) return { diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py index 13a53a0cb..f29df36b5 100644 --- a/youtube_dl/extractor/imdb.py +++ b/youtube_dl/extractor/imdb.py @@ -16,7 +16,6 @@ class ImdbIE(InfoExtractor): _TEST = { 'url': 'http://www.imdb.com/video/imdb/vi2524815897', - 'md5': '9f34fa777ade3a6e57a054fdbcb3a068', 'info_dict': { 'id': '2524815897', 'ext': 'mp4', diff --git a/youtube_dl/extractor/khanacademy.py b/youtube_dl/extractor/khanacademy.py index 408d00944..08a671fa8 100644 --- a/youtube_dl/extractor/khanacademy.py +++ b/youtube_dl/extractor/khanacademy.py @@ -22,8 +22,10 @@ class KhanAcademyIE(InfoExtractor): 'description': 'The perfect cipher', 
'duration': 176, 'uploader': 'Brit Cruise', + 'uploader_id': 'khanacademy', 'upload_date': '20120411', - } + }, + 'add_ie': ['Youtube'], }, { 'url': 'https://www.khanacademy.org/math/applied-math/cryptography', 'info_dict': { diff --git a/youtube_dl/extractor/kontrtube.py b/youtube_dl/extractor/kontrtube.py index 41fd62009..720bc939b 100644 --- a/youtube_dl/extractor/kontrtube.py +++ b/youtube_dl/extractor/kontrtube.py @@ -10,13 +10,14 @@ from ..utils import int_or_none class KontrTubeIE(InfoExtractor): IE_NAME = 'kontrtube' IE_DESC = 'KontrTube.ru - Труба зовёт' - _VALID_URL = r'http://(?:www\.)?kontrtube\.ru/videos/(?P<id>\d+)/.+' + _VALID_URL = r'http://(?:www\.)?kontrtube\.ru/videos/(?P<id>\d+)/(?P<display_id>[^/]+)/' _TEST = { 'url': 'http://www.kontrtube.ru/videos/2678/nad-olimpiyskoy-derevney-v-sochi-podnyat-rossiyskiy-flag/', 'md5': '975a991a4926c9a85f383a736a2e6b80', 'info_dict': { 'id': '2678', + 'display_id': 'nad-olimpiyskoy-derevney-v-sochi-podnyat-rossiyskiy-flag', 'ext': 'mp4', 'title': 'Над олимпийской деревней в Сочи поднят российский флаг', 'description': 'md5:80edc4c613d5887ae8ccf1d59432be41', @@ -28,21 +29,28 @@ class KontrTubeIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') + display_id = mobj.group('display_id') - webpage = self._download_webpage(url, video_id, 'Downloading page') + webpage = self._download_webpage( + url, display_id, 'Downloading page') - video_url = self._html_search_regex(r"video_url: '(.+?)/?',", webpage, 'video URL') - thumbnail = self._html_search_regex(r"preview_url: '(.+?)/?',", webpage, 'video thumbnail', fatal=False) + video_url = self._html_search_regex( + r"video_url\s*:\s*'(.+?)/?',", webpage, 'video URL') + thumbnail = self._html_search_regex( + r"preview_url\s*:\s*'(.+?)/?',", webpage, 'video thumbnail', fatal=False) title = self._html_search_regex( r'<title>(.+?)</title>', webpage, 'video title') - description = self._html_search_meta('description', webpage, 'video description') + description = self._html_search_meta( + 'description', webpage, 'video description') mobj = re.search( - r'<div class="col_2">Длительность: <span>(?P<minutes>\d+)м:(?P<seconds>\d+)с</span></div>', webpage) + r'<div class="col_2">Длительность: <span>(?P<minutes>\d+)м:(?P<seconds>\d+)с</span></div>', + webpage) duration = int(mobj.group('minutes')) * 60 + int(mobj.group('seconds')) if mobj else None view_count = self._html_search_regex( - r'<div class="col_2">Просмотров: <span>(\d+)</span></div>', webpage, 'view count', fatal=False) + r'<div class="col_2">Просмотров: <span>(\d+)</span></div>', + webpage, 'view count', fatal=False) comment_count = None comment_str = self._html_search_regex( @@ -56,6 +64,7 @@ class KontrTubeIE(InfoExtractor): return { 'id': video_id, + 'display_id': display_id, 'url': video_url, 'thumbnail': thumbnail, 'title': title, diff --git a/youtube_dl/extractor/lrt.py b/youtube_dl/extractor/lrt.py index d72d470aa..9c2fbdd96 100644 --- a/youtube_dl/extractor/lrt.py +++ b/youtube_dl/extractor/lrt.py @@ -2,7 +2,6 @@ from __future__ import unicode_literals import re -import json from .common import InfoExtractor from ..utils import ( @@ -28,7 +27,6 @@ class LRTIE(InfoExtractor): 'params': { 'skip_download': True, # HLS download }, - } def _real_extract(self, url): @@ -44,7 +42,9 @@ class LRTIE(InfoExtractor): formats = [] for js in re.findall(r'(?s)config:\s*(\{.*?\})', webpage): - data = json.loads(js_to_json(js)) + data = self._parse_json(js, video_id, 
transform_source=js_to_json) + if 'provider' not in data: + continue if data['provider'] == 'rtmp': formats.append({ 'format_id': 'rtmp', diff --git a/youtube_dl/extractor/mit.py b/youtube_dl/extractor/mit.py index 78787e8f1..3c61a850f 100644 --- a/youtube_dl/extractor/mit.py +++ b/youtube_dl/extractor/mit.py @@ -105,6 +105,9 @@ class OCWMITIE(InfoExtractor): 'ext': 'mp4', 'title': 'Lecture 7: Multiple Discrete Random Variables: Expectations, Conditioning, Independence', 'description': 'In this lecture, the professor discussed multiple random variables, expectations, and binomial distribution.', + 'upload_date': '20121109', + 'uploader_id': 'MIT', + 'uploader': 'MIT OpenCourseWare', # 'subtitles': 'http://ocw.mit.edu/courses/electrical-engineering-and-computer-science/6-041-probabilistic-systems-analysis-and-applied-probability-fall-2010/video-lectures/lecture-7-multiple-variables-expectations-independence/MIT6_041F11_lec07_300k.mp4.srt' } }, @@ -114,6 +117,9 @@ class OCWMITIE(InfoExtractor): 'id': '7K1sB05pE0A', 'ext': 'mp4', 'title': 'Session 1: Introduction to Derivatives', + 'upload_date': '20090818', + 'uploader_id': 'MIT', + 'uploader': 'MIT OpenCourseWare', 'description': 'This section contains lecture video excerpts, lecture notes, an interactive mathlet with supporting documents, and problem solving videos.', # 'subtitles': 'http://ocw.mit.edu//courses/mathematics/18-01sc-single-variable-calculus-fall-2010/ocw-18.01-f07-lec01_300k.SRT' } diff --git a/youtube_dl/extractor/motorsport.py b/youtube_dl/extractor/motorsport.py index f5ca74e97..c1a482dba 100644 --- a/youtube_dl/extractor/motorsport.py +++ b/youtube_dl/extractor/motorsport.py @@ -1,63 +1,49 @@ # coding: utf-8 from __future__ import unicode_literals -import hashlib -import json -import time - from .common import InfoExtractor from ..compat import ( - compat_parse_qs, - compat_str, -) -from ..utils import ( - int_or_none, + compat_urlparse, ) class MotorsportIE(InfoExtractor): IE_DESC = 'motorsport.com' - _VALID_URL = r'http://www\.motorsport\.com/[^/?#]+/video/(?:[^/?#]+/)(?P<id>[^/]+)/(?:$|[?#])' + _VALID_URL = r'http://www\.motorsport\.com/[^/?#]+/video/(?:[^/?#]+/)(?P<id>[^/]+)/?(?:$|[?#])' _TEST = { 'url': 'http://www.motorsport.com/f1/video/main-gallery/red-bull-racing-2014-rules-explained/', - 'md5': '5592cb7c5005d9b2c163df5ac3dc04e4', 'info_dict': { - 'id': '7063', + 'id': '2-T3WuR-KMM', 'ext': 'mp4', 'title': 'Red Bull Racing: 2014 Rules Explained', - 'duration': 207, + 'duration': 208, 'description': 'A new clip from Red Bull sees Daniel Ricciardo and Sebastian Vettel explain the 2014 Formula One regulations – which are arguably the most complex the sport has ever seen.', - 'uploader': 'rainiere', - 'thumbnail': r're:^http://.*motorsport\.com/.+\.jpg$' - } + 'uploader': 'mcomstaff', + 'uploader_id': 'UC334JIYKkVnyFoNCclfZtHQ', + 'upload_date': '20140903', + 'thumbnail': r're:^https?://.+\.jpg$' + }, + 'add_ie': ['Youtube'], + 'params': { + 'skip_download': True, + }, } def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - flashvars_code = self._html_search_regex( - r'<embed id="player".*?flashvars="([^"]+)"', webpage, 'flashvars') - flashvars = compat_parse_qs(flashvars_code) - params = json.loads(flashvars['parameters'][0]) - - e = compat_str(int(time.time()) + 24 * 60 * 60) - base_video_url = params['location'] + '?e=' + e - s = 'h3hg713fh32' - h = hashlib.md5((s + base_video_url).encode('utf-8')).hexdigest() - video_url = base_video_url + '&h=' + h - 
- uploader = self._html_search_regex( - r'(?s)<span class="label">Video by: </span>(.*?)</a>', webpage, - 'uploader', fatal=False) + iframe_path = self._html_search_regex( + r'<iframe id="player_iframe"[^>]+src="([^"]+)"', webpage, + 'iframe path') + iframe = self._download_webpage( + compat_urlparse.urljoin(url, iframe_path), display_id, + 'Downloading iframe') + youtube_id = self._search_regex( + r'www.youtube.com/embed/(.{11})', iframe, 'youtube id') return { - 'id': params['video_id'], + '_type': 'url_transparent', 'display_id': display_id, - 'title': params['title'], - 'url': video_url, - 'description': params.get('description'), - 'thumbnail': params.get('main_thumb'), - 'duration': int_or_none(params.get('duration')), - 'uploader': uploader, + 'url': 'https://youtube.com/watch?v=%s' % youtube_id, } diff --git a/youtube_dl/extractor/netzkino.py b/youtube_dl/extractor/netzkino.py new file mode 100644 index 000000000..93567d1e3 --- /dev/null +++ b/youtube_dl/extractor/netzkino.py @@ -0,0 +1,86 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + clean_html, + int_or_none, + js_to_json, + parse_iso8601, +) + + +class NetzkinoIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?netzkino\.de/\#!/(?P<category>[^/]+)/(?P<id>[^/]+)' + + _TEST = { + 'url': 'http://www.netzkino.de/#!/scifikino/rakete-zum-mond', + 'md5': '92a3f8b76f8d7220acce5377ea5d4873', + 'info_dict': { + 'id': 'rakete-zum-mond', + 'ext': 'mp4', + 'title': 'Rakete zum Mond (Endstation Mond, Destination Moon)', + 'comments': 'mincount:3', + 'description': 'md5:1eddeacc7e62d5a25a2d1a7290c64a28', + 'upload_date': '20120813', + 'thumbnail': 're:https?://.*\.jpg$', + 'timestamp': 1344858571, + 'age_limit': 12, + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + category_id = mobj.group('category') + video_id = mobj.group('id') + + api_url = 'http://api.netzkino.de.simplecache.net/capi-2.0a/categories/%s.json?d=www' % category_id + api_info = self._download_json(api_url, video_id) + info = next( + p for p in api_info['posts'] if p['slug'] == video_id) + custom_fields = info['custom_fields'] + + production_js = self._download_webpage( + 'http://www.netzkino.de/beta/dist/production.min.js', video_id, + note='Downloading player code') + avo_js = self._search_regex( + r'window\.avoCore\s*=.*?urlTemplate:\s*(\{.*?"\})', + production_js, 'URL templates') + templates = self._parse_json( + avo_js, video_id, transform_source=js_to_json) + + suffix = { + 'hds': '.mp4/manifest.f4m', + 'hls': '.mp4/master.m3u8', + 'pmd': '.mp4', + } + film_fn = custom_fields['Streaming'][0] + formats = [{ + 'format_id': key, + 'ext': 'mp4', + 'url': tpl.replace('{}', film_fn) + suffix[key], + } for key, tpl in templates.items()] + self._sort_formats(formats) + + comments = [{ + 'timestamp': parse_iso8601(c.get('date'), delimiter=' '), + 'id': c['id'], + 'author': c['name'], + 'html': c['content'], + 'parent': 'root' if c.get('parent', 0) == 0 else c['parent'], + } for c in info.get('comments', [])] + + return { + 'id': video_id, + 'formats': formats, + 'comments': comments, + 'title': info['title'], + 'age_limit': int_or_none(custom_fields.get('FSK')[0]), + 'timestamp': parse_iso8601(info.get('date'), delimiter=' '), + 'description': clean_html(info.get('content')), + 'thumbnail': info.get('thumbnail'), + 'playlist_title': api_info.get('title'), + 'playlist_id': category_id, + } diff --git a/youtube_dl/extractor/normalboots.py 
b/youtube_dl/extractor/normalboots.py index 3d35b11ac..c13ff0d65 100644 --- a/youtube_dl/extractor/normalboots.py +++ b/youtube_dl/extractor/normalboots.py @@ -22,7 +22,11 @@ class NormalbootsIE(InfoExtractor): 'description': 'Jon is late for Christmas. Typical. Thanks to: Paul Ritchey for Co-Writing/Filming: http://www.youtube.com/user/ContinueShow Michael Azzi for Christmas Intro Animation: http://michafrar.tumblr.com/ Jerrod Waters for Christmas Intro Music: http://www.youtube.com/user/xXJerryTerryXx Casey Ormond for ‘Tense Battle Theme’:\xa0http://www.youtube.com/Kiamet/', 'uploader': 'JonTron', 'upload_date': '20140125', - } + }, + 'params': { + # rtmp download + 'skip_download': True, + }, } def _real_extract(self, url): diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 43e8e619f..321ce5ce7 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -72,7 +72,7 @@ class NRKIE(InfoExtractor): class NRKTVIE(InfoExtractor): - _VALID_URL = r'http://tv\.nrk(?:super)?\.no/(?:serie/[^/]+|program)/(?P<id>[a-zA-Z]{4}\d{8})' + _VALID_URL = r'http://tv\.nrk(?:super)?\.no/(?:serie/[^/]+|program)/(?P<id>[a-zA-Z]{4}\d{8})(?:/\d{2}-\d{2}-\d{4})?(?:#del=(?P<part_id>\d+))?' _TESTS = [ { @@ -85,7 +85,7 @@ class NRKTVIE(InfoExtractor): 'description': 'md5:bdea103bc35494c143c6a9acdd84887a', 'upload_date': '20140523', 'duration': 1741.52, - } + }, }, { 'url': 'http://tv.nrk.no/program/mdfp15000514', @@ -97,39 +97,119 @@ class NRKTVIE(InfoExtractor): 'description': 'md5:654c12511f035aed1e42bdf5db3b206a', 'upload_date': '20140524', 'duration': 4605.0, - } + }, }, + { + # single playlist video + 'url': 'http://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015#del=2', + 'md5': 'adbd1dbd813edaf532b0a253780719c2', + 'info_dict': { + 'id': 'MSPO40010515-part2', + 'ext': 'flv', + 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 2:2)', + 'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26', + 'upload_date': '20150106', + }, + 'skip': 'Only works from Norway', + }, + { + 'url': 'http://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015', + 'playlist': [ + { + 'md5': '9480285eff92d64f06e02a5367970a7a', + 'info_dict': { + 'id': 'MSPO40010515-part1', + 'ext': 'flv', + 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 1:2)', + 'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26', + 'upload_date': '20150106', + }, + }, + { + 'md5': 'adbd1dbd813edaf532b0a253780719c2', + 'info_dict': { + 'id': 'MSPO40010515-part2', + 'ext': 'flv', + 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 2:2)', + 'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26', + 'upload_date': '20150106', + }, + }, + ], + 'info_dict': { + 'id': 'MSPO40010515', + 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn', + 'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26', + 'upload_date': '20150106', + 'duration': 6947.5199999999995, + }, + 'skip': 'Only works from Norway', + } ] + def _extract_f4m(self, manifest_url, video_id): + return self._extract_f4m_formats(manifest_url + '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124', video_id) + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') - - page = self._download_webpage(url, video_id) - - title = self._html_search_meta('title', page, 'title') - description = self._html_search_meta('description', page, 'description') - thumbnail = self._html_search_regex(r'data-posterimage="([^"]+)"', page, 'thumbnail', fatal=False) - upload_date = 
unified_strdate(self._html_search_meta('rightsfrom', page, 'upload date', fatal=False)) - duration = float_or_none( - self._html_search_regex(r'data-duration="([^"]+)"', page, 'duration', fatal=False)) + part_id = mobj.group('part_id') + + webpage = self._download_webpage(url, video_id) + + title = self._html_search_meta( + 'title', webpage, 'title') + description = self._html_search_meta( + 'description', webpage, 'description') + + thumbnail = self._html_search_regex( + r'data-posterimage="([^"]+)"', + webpage, 'thumbnail', fatal=False) + upload_date = unified_strdate(self._html_search_meta( + 'rightsfrom', webpage, 'upload date', fatal=False)) + duration = float_or_none(self._html_search_regex( + r'data-duration="([^"]+)"', + webpage, 'duration', fatal=False)) + + # playlist + parts = re.findall( + r'<a href="#del=(\d+)"[^>]+data-argument="([^"]+)">([^<]+)</a>', webpage) + if parts: + entries = [] + for current_part_id, stream_url, part_title in parts: + if part_id and current_part_id != part_id: + continue + video_part_id = '%s-part%s' % (video_id, current_part_id) + formats = self._extract_f4m(stream_url, video_part_id) + entries.append({ + 'id': video_part_id, + 'title': part_title, + 'description': description, + 'thumbnail': thumbnail, + 'upload_date': upload_date, + 'formats': formats, + }) + if part_id: + if entries: + return entries[0] + else: + playlist = self.playlist_result(entries, video_id, title, description) + playlist.update({ + 'thumbnail': thumbnail, + 'upload_date': upload_date, + 'duration': duration, + }) + return playlist formats = [] - f4m_url = re.search(r'data-media="([^"]+)"', page) + f4m_url = re.search(r'data-media="([^"]+)"', webpage) if f4m_url: - formats.append({ - 'url': f4m_url.group(1) + '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124', - 'format_id': 'f4m', - 'ext': 'flv', - }) + formats.extend(self._extract_f4m(f4m_url.group(1), video_id)) - m3u8_url = re.search(r'data-hls-media="([^"]+)"', page) + m3u8_url = re.search(r'data-hls-media="([^"]+)"', webpage) if m3u8_url: - formats.append({ - 'url': m3u8_url.group(1), - 'format_id': 'm3u8', - }) + formats.extend(self._extract_m3u8_formats(m3u8_url.group(1), video_id, 'mp4')) self._sort_formats(formats) diff --git a/youtube_dl/extractor/played.py b/youtube_dl/extractor/played.py index 449d4836c..45716c75d 100644 --- a/youtube_dl/extractor/played.py +++ b/youtube_dl/extractor/played.py @@ -26,6 +26,7 @@ class PlayedIE(InfoExtractor): 'ext': 'flv', 'title': 'youtube-dl_test_video.mp4', }, + 'skip': 'Removed for copyright infringement.', # oh wow } def _real_extract(self, url): diff --git a/youtube_dl/extractor/radiobremen.py b/youtube_dl/extractor/radiobremen.py new file mode 100644 index 000000000..0d706312e --- /dev/null +++ b/youtube_dl/extractor/radiobremen.py @@ -0,0 +1,63 @@ +# -*- coding: utf-8 -*- + +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import parse_duration + + +class RadioBremenIE(InfoExtractor): + _VALID_URL = r'http?://(?:www\.)?radiobremen\.de/mediathek/(?:index\.html)?\?id=(?P<id>[0-9]+)' + IE_NAME = 'radiobremen' + + _TEST = { + 'url': 'http://www.radiobremen.de/mediathek/index.html?id=114720', + 'info_dict': { + 'id': '114720', + 'ext': 'mp4', + 'duration': 1685, + 'width': 512, + 'title': 'buten un binnen vom 22. 
Dezember', + 'thumbnail': 're:https?://.*\.jpg$', + 'description': 'Unter anderem mit diesen Themen: 45 Flüchtlinge sind in Worpswede angekommen +++ Freies Internet für alle: Bremer arbeiten an einem flächendeckenden W-Lan-Netzwerk +++ Aktivisten kämpfen für das Unibad +++ So war das Wetter 2014 +++', + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + meta_url = "http://www.radiobremen.de/apps/php/mediathek/metadaten.php?id=%s" % video_id + meta_doc = self._download_webpage( + meta_url, video_id, 'Downloading metadata') + title = self._html_search_regex( + r"<h1.*>(?P<title>.+)</h1>", meta_doc, "title") + description = self._html_search_regex( + r"<p>(?P<description>.*)</p>", meta_doc, "description", fatal=False) + duration = parse_duration(self._html_search_regex( + r"Länge:</td>\s+<td>(?P<duration>[0-9]+:[0-9]+)</td>", + meta_doc, "duration", fatal=False)) + + page_doc = self._download_webpage( + url, video_id, 'Downloading video information') + mobj = re.search( + r"ardformatplayerclassic\(\'playerbereich\',\'(?P<width>[0-9]+)\',\'.*\',\'(?P<video_id>[0-9]+)\',\'(?P<secret>[0-9]+)\',\'(?P<thumbnail>.+)\',\'\'\)", + page_doc) + video_url = ( + "http://dl-ondemand.radiobremen.de/mediabase/%s/%s_%s_%s.mp4" % + (video_id, video_id, mobj.group("secret"), mobj.group('width'))) + + formats = [{ + 'url': video_url, + 'ext': 'mp4', + 'width': int(mobj.group("width")), + }] + return { + 'id': video_id, + 'title': title, + 'description': description, + 'duration': duration, + 'formats': formats, + 'thumbnail': mobj.group('thumbnail'), + } diff --git a/youtube_dl/extractor/rtlnl.py b/youtube_dl/extractor/rtlnl.py index d029b0ec5..a3ca79f2c 100644 --- a/youtube_dl/extractor/rtlnl.py +++ b/youtube_dl/extractor/rtlnl.py @@ -8,7 +8,7 @@ from ..utils import parse_duration class RtlXlIE(InfoExtractor): IE_NAME = 'rtlxl.nl' - _VALID_URL = r'https?://www\.rtlxl\.nl/#!/[^/]+/(?P<uuid>[^/?]+)' + _VALID_URL = r'https?://(www\.)?rtlxl\.nl/#!/[^/]+/(?P<uuid>[^/?]+)' _TEST = { 'url': 'http://www.rtlxl.nl/#!/rtl-nieuws-132237/6e4203a6-0a5e-3596-8424-c599a59e0677', diff --git a/youtube_dl/extractor/rutube.py b/youtube_dl/extractor/rutube.py index b72b5a586..5b1c3577a 100644 --- a/youtube_dl/extractor/rutube.py +++ b/youtube_dl/extractor/rutube.py @@ -70,6 +70,37 @@ class RutubeIE(InfoExtractor): } +class RutubeEmbedIE(InfoExtractor): + IE_NAME = 'rutube:embed' + IE_DESC = 'Rutube embedded videos' + _VALID_URL = 'https?://rutube\.ru/video/embed/(?P<id>[0-9]+)' + + _TEST = { + 'url': 'http://rutube.ru/video/embed/6722881?vk_puid37=&vk_puid38=', + 'info_dict': { + 'id': 'a10e53b86e8f349080f718582ce4c661', + 'ext': 'mp4', + 'upload_date': '20131223', + 'uploader_id': '297833', + 'description': 'Видео группы ★http://vk.com/foxkidsreset★ музей Fox Kids и Jetix<br/><br/> восстановлено и сделано в шикоформате subziro89 http://vk.com/subziro89', + 'uploader': 'subziro89 ILya', + 'title': 'Мистический городок Эйри в Индиан 5 серия озвучка subziro89', + }, + 'params': { + 'skip_download': 'Requires ffmpeg', + }, + } + + def _real_extract(self, url): + embed_id = self._match_id(url) + webpage = self._download_webpage(url, embed_id) + + canonical_url = self._html_search_regex( + r'<link\s+rel="canonical"\s+href="([^"]+?)"', webpage, + 'Canonical URL') + return self.url_result(canonical_url, 'Rutube') + + class RutubeChannelIE(InfoExtractor): IE_NAME = 'rutube:channel' IE_DESC = 'Rutube channels' diff --git a/youtube_dl/extractor/sexykarma.py b/youtube_dl/extractor/sexykarma.py index 
c833fc8ee..6446d26dc 100644 --- a/youtube_dl/extractor/sexykarma.py +++ b/youtube_dl/extractor/sexykarma.py @@ -24,7 +24,7 @@ class SexyKarmaIE(InfoExtractor): 'title': 'Taking a quick pee.', 'thumbnail': 're:^https?://.*\.jpg$', 'uploader': 'wildginger7', - 'upload_date': '20141007', + 'upload_date': '20141008', 'duration': 22, 'view_count': int, 'comment_count': int, @@ -45,6 +45,7 @@ class SexyKarmaIE(InfoExtractor): 'view_count': int, 'comment_count': int, 'categories': list, + 'age_limit': 18, } }, { 'url': 'http://www.watchindianporn.net/video/desi-dancer-namrata-stripping-completely-nude-and-dancing-on-a-hot-number-dW2mtctxJfs.html', @@ -61,6 +62,7 @@ class SexyKarmaIE(InfoExtractor): 'view_count': int, 'comment_count': int, 'categories': list, + 'age_limit': 18, } }] @@ -114,4 +116,5 @@ class SexyKarmaIE(InfoExtractor): 'view_count': view_count, 'comment_count': comment_count, 'categories': categories, + 'age_limit': 18, } diff --git a/youtube_dl/extractor/soulanime.py b/youtube_dl/extractor/soulanime.py new file mode 100644 index 000000000..feef33e27 --- /dev/null +++ b/youtube_dl/extractor/soulanime.py @@ -0,0 +1,80 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + HEADRequest, + urlhandle_detect_ext, +) + + +class SoulAnimeWatchingIE(InfoExtractor): + IE_NAME = "soulanime:watching" + IE_DESC = "SoulAnime video" + _TEST = { + 'url': 'http://www.soul-anime.net/watching/seirei-tsukai-no-blade-dance-episode-9/', + 'md5': '05fae04abf72298098b528e98abf4298', + 'info_dict': { + 'id': 'seirei-tsukai-no-blade-dance-episode-9', + 'ext': 'mp4', + 'title': 'seirei-tsukai-no-blade-dance-episode-9', + 'description': 'seirei-tsukai-no-blade-dance-episode-9' + } + } + _VALID_URL = r'http://[w.]*soul-anime\.(?P<domain>[^/]+)/watch[^/]*/(?P<id>[^/]+)' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + domain = mobj.group('domain') + + page = self._download_webpage(url, video_id) + + video_url_encoded = self._html_search_regex( + r'<div id="download">[^<]*<a href="(?P<url>[^"]+)"', page, 'url') + video_url = "http://www.soul-anime." + domain + video_url_encoded + + ext_req = HEADRequest(video_url) + ext_handle = self._request_webpage( + ext_req, video_id, note='Determining extension') + ext = urlhandle_detect_ext(ext_handle) + + return { + 'id': video_id, + 'url': video_url, + 'ext': ext, + 'title': video_id, + 'description': video_id + } + + +class SoulAnimeSeriesIE(InfoExtractor): + IE_NAME = "soulanime:series" + IE_DESC = "SoulAnime Series" + + _VALID_URL = r'http://[w.]*soul-anime\.(?P<domain>[^/]+)/anime./(?P<id>[^/]+)' + + _EPISODE_REGEX = r'<option value="(/watch[^/]*/[^"]+)">[^<]*</option>' + + _TEST = { + 'url': 'http://www.soul-anime.net/anime1/black-rock-shooter-tv/', + 'info_dict': { + 'id': 'black-rock-shooter-tv' + }, + 'playlist_count': 8 + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + series_id = mobj.group('id') + domain = mobj.group('domain') + + pattern = re.compile(self._EPISODE_REGEX) + + page = self._download_webpage(url, series_id, "Downloading series page") + mobj = pattern.findall(page) + + entries = [self.url_result("http://www.soul-anime." 
+ domain + obj) for obj in mobj] + + return self.playlist_result(entries, series_id) diff --git a/youtube_dl/extractor/teachertube.py b/youtube_dl/extractor/teachertube.py index 6c3445d79..82675431f 100644 --- a/youtube_dl/extractor/teachertube.py +++ b/youtube_dl/extractor/teachertube.py @@ -57,9 +57,7 @@ class TeacherTubeIE(InfoExtractor): }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) title = self._html_search_meta('title', webpage, 'title', fatal=True) diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py index 944177426..10b3b706a 100644 --- a/youtube_dl/extractor/ted.py +++ b/youtube_dl/extractor/ted.py @@ -13,7 +13,7 @@ from ..compat import ( class TEDIE(SubtitlesInfoExtractor): _VALID_URL = r'''(?x) (?P<proto>https?://) - (?P<type>www|embed)(?P<urlmain>\.ted\.com/ + (?P<type>www|embed(?:-ssl)?)(?P<urlmain>\.ted\.com/ ( (?P<type_playlist>playlists(?:/\d+)?) # We have a playlist | @@ -98,7 +98,7 @@ class TEDIE(SubtitlesInfoExtractor): def _real_extract(self, url): m = re.match(self._VALID_URL, url, re.VERBOSE) - if m.group('type') == 'embed': + if m.group('type').startswith('embed'): desktop_url = m.group('proto') + 'www' + m.group('urlmain') return self.url_result(desktop_url, 'TED') name = m.group('name') diff --git a/youtube_dl/extractor/tf1.py b/youtube_dl/extractor/tf1.py index 6e61cc9e2..025d0877c 100644 --- a/youtube_dl/extractor/tf1.py +++ b/youtube_dl/extractor/tf1.py @@ -1,15 +1,13 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor class TF1IE(InfoExtractor): """TF1 uses the wat.tv player.""" - _VALID_URL = r'http://videos\.tf1\.fr/.*-(?P<id>.*?)\.html' - _TEST = { + _VALID_URL = r'http://(?:videos\.tf1|www\.tfou)\.fr/.*?-(?P<id>\d+)(?:-\d+)?\.html' + _TESTS = { 'url': 'http://videos.tf1.fr/auto-moto/citroen-grand-c4-picasso-2013-presentation-officielle-8062060.html', 'info_dict': { 'id': '10635995', @@ -21,14 +19,26 @@ class TF1IE(InfoExtractor): # Sometimes wat serves the whole file with the --test option 'skip_download': True, }, + }, { + 'url': 'http://www.tfou.fr/chuggington/videos/le-grand-mysterioso-chuggington-7085291-739.html', + 'info_dict': { + 'id': '12043945', + 'ext': 'mp4', + 'title': 'Le grand Mystérioso - Chuggington', + 'description': 'Le grand Mystérioso - Emery rêve qu\'un article lui soit consacré dans le journal.', + 'upload_date': '20150103', + }, + 'params': { + # Sometimes wat serves the whole file with the --test option + 'skip_download': True, + }, } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) embed_url = self._html_search_regex( - r'"(https://www.wat.tv/embedframe/.*?)"', webpage, 'embed url') + r'["\'](https?://www.wat.tv/embedframe/.*?)["\']', webpage, 'embed url') embed_page = self._download_webpage(embed_url, video_id, 'Downloading embed player page') wat_id = self._search_regex(r'UVID=(.*?)&', embed_page, 'wat id') diff --git a/youtube_dl/extractor/tudou.py b/youtube_dl/extractor/tudou.py index 161e47624..c89de5ba4 100644 --- a/youtube_dl/extractor/tudou.py +++ b/youtube_dl/extractor/tudou.py @@ -9,7 +9,7 @@ from .common import InfoExtractor class TudouIE(InfoExtractor): - _VALID_URL = r'(?:http://)?(?:www\.)?tudou\.com/(?:listplay|programs|albumplay)/(?:view|(.+?))/(?:([^/]+)|([^/]+))(?:\.html)?' 
+ _VALID_URL = r'https?://(?:www\.)?tudou\.com/(?:listplay|programs(?:/view)?|albumplay)/.*?/(?P<id>[^/?#]+?)(?:\.html)?/?(?:$|[?#])' _TESTS = [{ 'url': 'http://www.tudou.com/listplay/zzdE77v6Mmo/2xN2duXMxmw.html', 'md5': '140a49ed444bd22f93330985d8475fcb', @@ -27,13 +27,6 @@ class TudouIE(InfoExtractor): 'title': 'La Sylphide-Bolshoi-Ekaterina Krysanova & Vyacheslav Lopatin 2012', 'thumbnail': 're:^https?://.*\.jpg$', } - }, { - 'url': 'http://www.tudou.com/albumplay/TenTw_JgiPM/PzsAs5usU9A.html', - 'info_dict': { - 'title': 'todo.mp4', - }, - 'add_ie': ['Youku'], - 'skip': 'Only works from China' }] def _url_for_id(self, id, quality=None): @@ -45,8 +38,7 @@ class TudouIE(InfoExtractor): return final_url def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group(2) + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) m = re.search(r'vcode:\s*[\'"](.+?)[\'"]', webpage) @@ -87,4 +79,9 @@ class TudouIE(InfoExtractor): } result.append(part_info) - return result + return { + '_type': 'multi_video', + 'entries': result, + 'id': video_id, + 'title': title, + } diff --git a/youtube_dl/extractor/tunein.py b/youtube_dl/extractor/tunein.py index 4ce5aeeba..b6b1f2568 100644 --- a/youtube_dl/extractor/tunein.py +++ b/youtube_dl/extractor/tunein.py @@ -24,7 +24,7 @@ class TuneInIE(InfoExtractor): _INFO_DICT = { 'id': '34682', 'title': 'Jazz 24 on 88.5 Jazz24 - KPLU-HD2', - 'ext': 'AAC', + 'ext': 'aac', 'thumbnail': 're:^https?://.*\.png$', 'location': 'Tacoma, WA', } @@ -78,14 +78,21 @@ class TuneInIE(InfoExtractor): for stream in streams: if stream.get('Type') == 'Live': is_live = True + reliability = stream.get('Reliability') + format_note = ( + 'Reliability: %d%%' % reliability + if reliability is not None else None) formats.append({ + 'preference': ( + 0 if reliability is None or reliability > 90 + else 1), 'abr': stream.get('Bandwidth'), - 'ext': stream.get('MediaType'), + 'ext': stream.get('MediaType').lower(), 'acodec': stream.get('MediaType'), 'vcodec': 'none', 'url': stream.get('Url'), - # Sometimes streams with the highest quality do not exist - 'preference': stream.get('Reliability'), + 'source_preference': reliability, + 'format_note': format_note, }) self._sort_formats(formats) diff --git a/youtube_dl/extractor/vier.py b/youtube_dl/extractor/vier.py new file mode 100644 index 000000000..619039e51 --- /dev/null +++ b/youtube_dl/extractor/vier.py @@ -0,0 +1,118 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class VierIE(InfoExtractor): + IE_NAME = 'vier' + _VALID_URL = r'https?://(?:www\.)?vier\.be/(?:[^/]+/videos/(?P<display_id>[^/]+)(?:/(?P<id>\d+))?|video/v3/embed/(?P<embed_id>\d+))' + _TESTS = [{ + 'url': 'http://www.vier.be/planb/videos/het-wordt-warm-de-moestuin/16129', + 'info_dict': { + 'id': '16129', + 'display_id': 'het-wordt-warm-de-moestuin', + 'ext': 'mp4', + 'title': 'Het wordt warm in De Moestuin', + 'description': 'De vele uren werk eisen hun tol. 
Wim droomt van assistentie...', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'http://www.vier.be/planb/videos/mieren-herders-van-de-bladluizen', + 'only_matching': True, + }, { + 'url': 'http://www.vier.be/video/v3/embed/16129', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + embed_id = mobj.group('embed_id') + display_id = mobj.group('display_id') or embed_id + + webpage = self._download_webpage(url, display_id) + + video_id = self._search_regex( + r'"nid"\s*:\s*"(\d+)"', webpage, 'video id') + application = self._search_regex( + r'"application"\s*:\s*"([^"]+)"', webpage, 'application', default='vier_vod') + filename = self._search_regex( + r'"filename"\s*:\s*"([^"]+)"', webpage, 'filename') + + playlist_url = 'http://vod.streamcloud.be/%s/mp4:_definst_/%s.mp4/playlist.m3u8' % (application, filename) + formats = self._extract_m3u8_formats(playlist_url, display_id, 'mp4') + + title = self._og_search_title(webpage, default=display_id) + description = self._og_search_description(webpage, default=None) + thumbnail = self._og_search_thumbnail(webpage, default=None) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'formats': formats, + } + + +class VierVideosIE(InfoExtractor): + IE_NAME = 'vier:videos' + _VALID_URL = r'https?://(?:www\.)?vier\.be/(?P<program>[^/]+)/videos(?:\?.*\bpage=(?P<page>\d+)|$)' + _TESTS = [{ + 'url': 'http://www.vier.be/demoestuin/videos', + 'info_dict': { + 'id': 'demoestuin', + }, + 'playlist_mincount': 153, + }, { + 'url': 'http://www.vier.be/demoestuin/videos?page=6', + 'info_dict': { + 'id': 'demoestuin-page6', + }, + 'playlist_mincount': 20, + }, { + 'url': 'http://www.vier.be/demoestuin/videos?page=7', + 'info_dict': { + 'id': 'demoestuin-page7', + }, + 'playlist_mincount': 13, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + program = mobj.group('program') + + webpage = self._download_webpage(url, program) + + page_id = mobj.group('page') + if page_id: + page_id = int(page_id) + start_page = page_id + last_page = start_page + 1 + playlist_id = '%s-page%d' % (program, page_id) + else: + start_page = 0 + last_page = int(self._search_regex( + r'videos\?page=(\d+)">laatste</a>', + webpage, 'last page', default=0)) + 1 + playlist_id = program + + entries = [] + for current_page_id in range(start_page, last_page): + current_page = self._download_webpage( + 'http://www.vier.be/%s/videos?page=%d' % (program, current_page_id), + program, + 'Downloading page %d' % (current_page_id + 1)) if current_page_id != page_id else webpage + page_entries = [ + self.url_result('http://www.vier.be' + video_url, 'Vier') + for video_url in re.findall( + r'<h3><a href="(/[^/]+/videos/[^/]+(?:/\d+)?)">', current_page)] + entries.extend(page_entries) + + return self.playlist_result(entries, playlist_id) diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py index 15f315298..944901e14 100644 --- a/youtube_dl/extractor/viki.py +++ b/youtube_dl/extractor/viki.py @@ -17,7 +17,6 @@ class VikiIE(SubtitlesInfoExtractor): _VALID_URL = r'^https?://(?:www\.)?viki\.com/videos/(?P<id>[0-9]+v)' _TEST = { 'url': 'http://www.viki.com/videos/1023585v-heirs-episode-14', - 'md5': 'a21454021c2646f5433514177e2caa5f', 'info_dict': { 'id': '1023585v', 'ext': 'mp4', @@ -31,8 +30,7 @@ class VikiIE(SubtitlesInfoExtractor): } def _real_extract(self, url): - mobj = 
re.match(self._VALID_URL, url) - video_id = mobj.group(1) + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) title = self._og_search_title(webpage) diff --git a/youtube_dl/extractor/vimple.py b/youtube_dl/extractor/vimple.py index 33d370e1c..ee3d86117 100644 --- a/youtube_dl/extractor/vimple.py +++ b/youtube_dl/extractor/vimple.py @@ -14,28 +14,17 @@ class VimpleIE(InfoExtractor): IE_DESC = 'Vimple.ru' _VALID_URL = r'https?://(player.vimple.ru/iframe|vimple.ru)/(?P<id>[a-f0-9]{10,})' _TESTS = [ - # Quality: Large, from iframe { - 'url': 'http://player.vimple.ru/iframe/b132bdfd71b546d3972f9ab9a25f201c', + 'url': 'http://vimple.ru/c0f6b1687dcd4000a97ebe70068039cf', + 'md5': '2e750a330ed211d3fd41821c6ad9a279', 'info_dict': { - 'id': 'b132bdfd71b546d3972f9ab9a25f201c', - 'title': 'great-escape-minecraft.flv', + 'id': 'c0f6b1687dcd4000a97ebe70068039cf', 'ext': 'mp4', - 'duration': 352, - 'webpage_url': 'http://vimple.ru/b132bdfd71b546d3972f9ab9a25f201c', + 'title': 'Sunset', + 'duration': 20, + 'thumbnail': 're:https?://.*?\.jpg', }, }, - # Quality: Medium, from mainpage - { - 'url': 'http://vimple.ru/a15950562888453b8e6f9572dc8600cd', - 'info_dict': { - 'id': 'a15950562888453b8e6f9572dc8600cd', - 'title': 'DB 01', - 'ext': 'flv', - 'duration': 1484, - 'webpage_url': 'http://vimple.ru/a15950562888453b8e6f9572dc8600cd', - } - }, ] def _real_extract(self, url): diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index 542e9198a..81e02a624 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -164,6 +164,14 @@ class VKIE(InfoExtractor): self.to_screen('Youtube video detected') return self.url_result(m_yt.group(1), 'Youtube') + m_rutube = re.search( + r'\ssrc="((?:https?:)?//rutube\.ru\\?/video\\?/embed(?:.*?))\\?"', info_page) + if m_rutube is not None: + self.to_screen('rutube video detected') + rutube_url = self._proto_relative_url( + m_rutube.group(1).replace('\\', '')) + return self.url_result(rutube_url) + m_opts = re.search(r'(?s)var\s+opts\s*=\s*({.*?});', info_page) if m_opts: m_opts_url = re.search(r"url\s*:\s*'([^']+)", m_opts.group(1)) diff --git a/youtube_dl/extractor/washingtonpost.py b/youtube_dl/extractor/washingtonpost.py index 88bbbb219..c17bebd6e 100644 --- a/youtube_dl/extractor/washingtonpost.py +++ b/youtube_dl/extractor/washingtonpost.py @@ -10,14 +10,14 @@ from ..utils import ( class WashingtonPostIE(InfoExtractor): - _VALID_URL = r'^https?://(?:www\.)?washingtonpost\.com/.*?/(?P<id>[^/]+)/(?:$|[?#])' + _VALID_URL = r'https?://(?:www\.)?washingtonpost\.com/.*?/(?P<id>[^/]+)/(?:$|[?#])' _TEST = { 'url': 'http://www.washingtonpost.com/sf/national/2014/03/22/sinkhole-of-bureaucracy/', 'info_dict': { 'title': 'Sinkhole of bureaucracy', }, 'playlist': [{ - 'md5': 'c3f4b4922ffa259243f68e928db2db8c', + 'md5': '79132cc09ec5309fa590ae46e4cc31bc', 'info_dict': { 'id': 'fc433c38-b146-11e3-b8b3-44b1d1cd4c1f', 'ext': 'mp4', @@ -29,7 +29,7 @@ class WashingtonPostIE(InfoExtractor): 'upload_date': '20140322', }, }, { - 'md5': 'f645a07652c2950cd9134bb852c5f5eb', + 'md5': 'e1d5734c06865cc504ad99dc2de0d443', 'info_dict': { 'id': '41255e28-b14a-11e3-b8b3-44b1d1cd4c1f', 'ext': 'mp4', @@ -44,10 +44,9 @@ class WashingtonPostIE(InfoExtractor): } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - page_id = mobj.group('id') - + page_id = self._match_id(url) webpage = self._download_webpage(url, page_id) + title = self._og_search_title(webpage) uuids = re.findall(r'data-video-uuid="([^"]+)"', webpage) entries = 
[] diff --git a/youtube_dl/extractor/wdr.py b/youtube_dl/extractor/wdr.py index 8e25ecf28..45466e31b 100644 --- a/youtube_dl/extractor/wdr.py +++ b/youtube_dl/extractor/wdr.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- from __future__ import unicode_literals +import itertools import re from .common import InfoExtractor @@ -67,6 +68,10 @@ class WDRIE(InfoExtractor): 'upload_date': '20140717', }, }, + { + 'url': 'http://www1.wdr.de/mediathek/video/sendungen/quarks_und_co/filterseite-quarks-und-co100.html', + 'playlist_mincount': 146, + } ] def _real_extract(self, url): @@ -81,6 +86,27 @@ class WDRIE(InfoExtractor): self.url_result(page_url + href, 'WDR') for href in re.findall(r'<a href="/?(.+?%s\.html)" rel="nofollow"' % self._PLAYER_REGEX, webpage) ] + + if entries: # Playlist page + return self.playlist_result(entries, page_id) + + # Overview page + entries = [] + for page_num in itertools.count(2): + hrefs = re.findall( + r'<li class="mediathekvideo"\s*>\s*<img[^>]*>\s*<a href="(/mediathek/video/[^"]+)"', + webpage) + entries.extend( + self.url_result(page_url + href, 'WDR') + for href in hrefs) + next_url_m = re.search( + r'<li class="nextToLast">\s*<a href="([^"]+)"', webpage) + if not next_url_m: + break + next_url = page_url + next_url_m.group(1) + webpage = self._download_webpage( + next_url, page_id, + note='Downloading playlist page %d' % page_num) return self.playlist_result(entries, page_id) flashvars = compat_parse_qs( @@ -172,8 +198,7 @@ class WDRMausIE(InfoExtractor): }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) param_code = self._html_search_regex( @@ -224,5 +249,3 @@ class WDRMausIE(InfoExtractor): 'thumbnail': thumbnail, 'upload_date': upload_date, } - -# TODO test _1 diff --git a/youtube_dl/extractor/webofstories.py b/youtube_dl/extractor/webofstories.py new file mode 100644 index 000000000..396cf4e83 --- /dev/null +++ b/youtube_dl/extractor/webofstories.py @@ -0,0 +1,102 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import int_or_none + + +class WebOfStoriesIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?webofstories\.com/play/(?:[^/]+/)?(?P<id>[0-9]+)' + _VIDEO_DOMAIN = 'http://eu-mobile.webofstories.com/' + _GREAT_LIFE_STREAMER = 'rtmp://eu-cdn1.webofstories.com/cfx/st/' + _USER_STREAMER = 'rtmp://eu-users.webofstories.com/cfx/st/' + _TESTS = [ + { + 'url': 'http://www.webofstories.com/play/hans.bethe/71', + 'md5': '373e4dd915f60cfe3116322642ddf364', + 'info_dict': { + 'id': '4536', + 'ext': 'mp4', + 'title': 'The temperature of the sun', + 'thumbnail': 're:^https?://.*\.jpg$', + 'description': 'Hans Bethe talks about calculating the temperature of the sun', + 'duration': 238, + } + }, + { + 'url': 'http://www.webofstories.com/play/55908', + 'md5': '2985a698e1fe3211022422c4b5ed962c', + 'info_dict': { + 'id': '55908', + 'ext': 'mp4', + 'title': 'The story of Gemmata obscuriglobus', + 'thumbnail': 're:^https?://.*\.jpg$', + 'description': 'Planctomycete talks about The story of Gemmata obscuriglobus', + 'duration': 169, + } + }, + ] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + title = self._og_search_title(webpage) + description = self._html_search_meta('description', webpage) + thumbnail = self._og_search_thumbnail(webpage) + + story_filename = self._search_regex( + 
r'\.storyFileName\("([^"]+)"\)', webpage, 'story filename') + speaker_id = self._search_regex( + r'\.speakerId\("([^"]+)"\)', webpage, 'speaker ID') + story_id = self._search_regex( + r'\.storyId\((\d+)\)', webpage, 'story ID') + speaker_type = self._search_regex( + r'\.speakerType\("([^"]+)"\)', webpage, 'speaker type') + great_life = self._search_regex( + r'isGreatLifeStory\s*=\s*(true|false)', webpage, 'great life story') + is_great_life_series = great_life == 'true' + duration = int_or_none(self._search_regex( + r'\.duration\((\d+)\)', webpage, 'duration', fatal=False)) + + # URL building, see: http://www.webofstories.com/scripts/player.js + ms_prefix = '' + if speaker_type.lower() == 'ms': + ms_prefix = 'mini_sites/' + + if is_great_life_series: + mp4_url = '{0:}lives/{1:}/{2:}.mp4'.format( + self._VIDEO_DOMAIN, speaker_id, story_filename) + rtmp_ext = 'flv' + streamer = self._GREAT_LIFE_STREAMER + play_path = 'stories/{0:}/{1:}'.format( + speaker_id, story_filename) + else: + mp4_url = '{0:}{1:}{2:}/{3:}.mp4'.format( + self._VIDEO_DOMAIN, ms_prefix, speaker_id, story_filename) + rtmp_ext = 'mp4' + streamer = self._USER_STREAMER + play_path = 'mp4:{0:}{1:}/{2}.mp4'.format( + ms_prefix, speaker_id, story_filename) + + formats = [{ + 'format_id': 'mp4_sd', + 'url': mp4_url, + }, { + 'format_id': 'rtmp_sd', + 'page_url': url, + 'url': streamer, + 'ext': rtmp_ext, + 'play_path': play_path, + }] + + self._sort_formats(formats) + + return { + 'id': story_id, + 'title': title, + 'formats': formats, + 'thumbnail': thumbnail, + 'description': description, + 'duration': duration, + } diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py index 6b37bcbc9..4527567f8 100644 --- a/youtube_dl/extractor/xhamster.py +++ b/youtube_dl/extractor/xhamster.py @@ -14,7 +14,7 @@ from ..utils import ( class XHamsterIE(InfoExtractor): """Information Extractor for xHamster""" - _VALID_URL = r'http://(?:.+?\.)?xhamster\.com/movies/(?P<id>[0-9]+)/(?P<seo>.+?)\.html(?:\?.*)?' + _VALID_URL = r'(?P<proto>https?)://(?:.+?\.)?xhamster\.com/movies/(?P<id>[0-9]+)/(?P<seo>.+?)\.html(?:\?.*)?' _TESTS = [ { 'url': 'http://xhamster.com/movies/1509445/femaleagent_shy_beauty_takes_the_bait.html', @@ -39,7 +39,11 @@ class XHamsterIE(InfoExtractor): 'duration': 200, 'age_limit': 18, } - } + }, + { + 'url': 'https://xhamster.com/movies/2272726/amber_slayed_by_the_knight.html', + 'only_matching': True, + }, ] def _real_extract(self, url): @@ -57,7 +61,8 @@ class XHamsterIE(InfoExtractor): video_id = mobj.group('id') seo = mobj.group('seo') - mrss_url = 'http://xhamster.com/movies/%s/%s.html' % (video_id, seo) + proto = mobj.group('proto') + mrss_url = '%s://xhamster.com/movies/%s/%s.html' % (proto, video_id, seo) webpage = self._download_webpage(mrss_url, video_id) title = self._html_search_regex(r'<title>(?P<title>.+?) 
- xHamster\.com</title>', webpage, 'title') diff --git a/youtube_dl/extractor/xtube.py b/youtube_dl/extractor/xtube.py index cf74d4fd5..e8490b028 100644 --- a/youtube_dl/extractor/xtube.py +++ b/youtube_dl/extractor/xtube.py @@ -40,7 +40,7 @@ class XTubeIE(InfoExtractor): r'<p class="title">([^<]+)', webpage, 'title') video_uploader = self._html_search_regex( [r"var\s+contentOwnerId\s*=\s*'([^']+)", - r'By:\s*<a href="/community/profile\.php?user=([^"]+)'], + r'By:\s*<a href="/community/profile\.php\?user=([^"]+)'], webpage, 'uploader', fatal=False) video_description = self._html_search_regex( r'<p class="fieldsDesc">([^<]+)', @@ -95,6 +95,7 @@ class XTubeUserIE(InfoExtractor): 'url': 'http://www.xtube.com/community/profile.php?user=greenshowers', 'info_dict': { 'id': 'greenshowers', + 'age_limit': 18, }, 'playlist_mincount': 155, } @@ -124,6 +125,7 @@ class XTubeUserIE(InfoExtractor): return { '_type': 'playlist', 'id': username, + 'age_limit': 18, 'entries': [{ '_type': 'url', 'url': eurl, diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 7f5aeb25b..bc18276d6 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -256,7 +256,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, - '138': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, + '138': {'ext': 'mp4', 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, # Height can vary (https://github.com/rg3/youtube-dl/issues/4559) '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'}, @@ -264,9 +264,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'h264'}, # Dash mp4 audio - '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50}, - '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50}, - '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50}, + '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 48, 'preference': -50}, + '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 128, 'preference': -50}, + '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 256, 'preference': -50}, # Dash webm '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40}, @@ -287,7 +287,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 
'preference': -40, 'fps': 60, 'vcodec': 'VP9'}, '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'}, + '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'}, '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'VP9'}, + '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'}, # Dash webm audio '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50}, @@ -412,7 +414,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): 'id': 'HtVdAasjOgU', 'ext': 'mp4', 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer', - 'description': 'md5:eca57043abae25130f58f655ad9a7771', + 'description': 're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}', 'uploader': 'The Witcher', 'uploader_id': 'WitcherGame', 'upload_date': '20140605', @@ -736,6 +738,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): 'format_id': format_id, 'url': video_url, 'width': int_or_none(r.attrib.get('width')), + 'height': int_or_none(r.attrib.get('height')), 'tbr': int_or_none(r.attrib.get('bandwidth'), 1000), 'asr': int_or_none(r.attrib.get('audioSamplingRate')), 'filesize': filesize, @@ -746,7 +749,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): fo for fo in formats if fo['format_id'] == format_id) except StopIteration: - f.update(self._formats.get(format_id, {})) + f.update(self._formats.get(format_id, {}).items()) formats.append(f) else: existing_format.update(f) @@ -1040,6 +1043,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): self.report_warning( 'Skipping DASH manifest: %r' % e, video_id) else: + # Hide the formats we found through non-DASH + dash_keys = set(df['format_id'] for df in dash_formats) + for f in formats: + if f['format_id'] in dash_keys: + f['format_id'] = 'nondash-%s' % f['format_id'] + f['preference'] = f.get('preference', 0) - 10000 formats.extend(dash_formats) self._sort_formats(formats) @@ -1199,9 +1208,6 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): if playlist_id.startswith('RD'): # Mixes require a custom extraction process return self._extract_mix(playlist_id) - if playlist_id.startswith('TL'): - raise ExtractorError('For downloading YouTube.com top lists, use ' - 'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True) url = self._TEMPLATE_URL % playlist_id page = self._download_webpage(url, playlist_id) @@ -1247,49 +1253,6 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): return self.playlist_result(url_results, playlist_id, playlist_title) -class YoutubeTopListIE(YoutubePlaylistIE): - IE_NAME = 'youtube:toplist' - IE_DESC = ('YouTube.com top lists, "yttoplist:{channel}:{list title}"' - ' (Example: "yttoplist:music:Top Tracks")') - _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$' - _TESTS = [{ - 'url': 'yttoplist:music:Trending', - 'playlist_mincount': 5, - 'skip': 'Only works for logged-in users', - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - channel = mobj.group('chann') - title = mobj.group('title') - query = compat_urllib_parse.urlencode({'title': title}) - channel_page = self._download_webpage( - 'https://www.youtube.com/%s' % channel, title) - link = 
self._html_search_regex( - r'''(?x) - <a\s+href="([^"]+)".*?>\s* - <span\s+class="branded-page-module-title-text">\s* - <span[^>]*>.*?%s.*?</span>''' % re.escape(query), - channel_page, 'list') - url = compat_urlparse.urljoin('https://www.youtube.com/', link) - - video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"' - ids = [] - # sometimes the webpage doesn't contain the videos - # retry until we get them - for i in itertools.count(0): - msg = 'Downloading Youtube mix' - if i > 0: - msg += ', retry #%d' % i - - webpage = self._download_webpage(url, title, msg) - ids = orderedSet(re.findall(video_re, webpage)) - if ids: - break - url_results = self._ids_to_results(ids) - return self.playlist_result(url_results, playlist_title=title) - - class YoutubeChannelIE(InfoExtractor): IE_DESC = 'YouTube.com channels' _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)' @@ -1701,3 +1664,20 @@ class YoutubeTruncatedURLIE(InfoExtractor): '"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" ' ' or simply youtube-dl BaW_jenozKc .', expected=True) + + +class YoutubeTruncatedIDIE(InfoExtractor): + IE_NAME = 'youtube:truncated_id' + IE_DESC = False # Do not list + _VALID_URL = r'https?://(?:www\.)youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$' + + _TESTS = [{ + 'url': 'https://www.youtube.com/watch?v=N_708QY7Ob', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + raise ExtractorError( + 'Incomplete YouTube ID %s. URL %s looks truncated.' % (video_id, url), + expected=True) diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py index 74c76a9a0..98f15177b 100644 --- a/youtube_dl/extractor/zdf.py +++ b/youtube_dl/extractor/zdf.py @@ -119,7 +119,7 @@ class ZDFChannelIE(InfoExtractor): 'info_dict': { 'id': '1586442', }, - 'playlist_count': 4, + 'playlist_count': 3, } _PAGE_SIZE = 50