diff options
Diffstat (limited to 'youtube_dl')
-rwxr-xr-x | youtube_dl/YoutubeDL.py | 2 | ||||
-rw-r--r-- | youtube_dl/downloader/external.py | 3 | ||||
-rw-r--r-- | youtube_dl/extractor/__init__.py | 12 | ||||
-rw-r--r-- | youtube_dl/extractor/common.py | 8 | ||||
-rw-r--r-- | youtube_dl/extractor/empflix.py | 31 | ||||
-rw-r--r-- | youtube_dl/extractor/generic.py | 15 | ||||
-rw-r--r-- | youtube_dl/extractor/newstube.py | 2 | ||||
-rw-r--r-- | youtube_dl/extractor/nrk.py | 14 | ||||
-rw-r--r-- | youtube_dl/extractor/planetaplay.py | 3 | ||||
-rw-r--r-- | youtube_dl/extractor/quickvid.py | 1 | ||||
-rw-r--r-- | youtube_dl/extractor/smotri.py | 55 | ||||
-rw-r--r-- | youtube_dl/extractor/snagfilms.py | 171 | ||||
-rw-r--r-- | youtube_dl/extractor/tnaflix.py | 279 | ||||
-rw-r--r-- | youtube_dl/extractor/twitch.py | 18 | ||||
-rw-r--r-- | youtube_dl/extractor/twitter.py | 72 | ||||
-rw-r--r-- | youtube_dl/extractor/vk.py | 25 | ||||
-rw-r--r-- | youtube_dl/extractor/vube.py | 1 | ||||
-rw-r--r-- | youtube_dl/extractor/youtube.py | 95 | ||||
-rw-r--r-- | youtube_dl/options.py | 5 | ||||
-rw-r--r-- | youtube_dl/utils.py | 268 |
20 files changed, 918 insertions, 162 deletions
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index ef0f71bad..411de9ac9 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1008,7 +1008,7 @@ class YoutubeDL(object): t.get('preference'), t.get('width'), t.get('height'), t.get('id'), t.get('url'))) for i, t in enumerate(thumbnails): - if 'width' in t and 'height' in t: + if t.get('width') and t.get('height'): t['resolution'] = '%dx%d' % (t['width'], t['height']) if t.get('id') is None: t['id'] = '%d' % i diff --git a/youtube_dl/downloader/external.py b/youtube_dl/downloader/external.py index a57c15856..1d5cc9904 100644 --- a/youtube_dl/downloader/external.py +++ b/youtube_dl/downloader/external.py @@ -131,5 +131,6 @@ def list_external_downloaders(): def get_external_downloader(external_downloader): """ Given the name of the executable, see whether we support the given downloader . """ - bn = os.path.basename(external_downloader) + # Drop .exe extension on Windows + bn = os.path.splitext(os.path.basename(external_downloader))[0] return _BY_NAME[bn] diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 46cc4cd06..d44339200 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -144,7 +144,6 @@ from .ellentv import ( ) from .elpais import ElPaisIE from .embedly import EmbedlyIE -from .empflix import EMPFlixIE from .engadget import EngadgetIE from .eporner import EpornerIE from .eroprofile import EroProfileIE @@ -493,6 +492,10 @@ from .smotri import ( SmotriUserIE, SmotriBroadcastIE, ) +from .snagfilms import ( + SnagFilmsIE, + SnagFilmsEmbedIE, +) from .snotr import SnotrIE from .sohu import SohuIE from .soompi import ( @@ -573,7 +576,11 @@ from .tmz import ( TMZIE, TMZArticleIE, ) -from .tnaflix import TNAFlixIE +from .tnaflix import ( + TNAFlixIE, + EMPFlixIE, + MovieFapIE, +) from .thvideo import ( THVideoIE, THVideoPlaylistIE @@ -617,6 +624,7 @@ from .twitch import ( TwitchBookmarksIE, TwitchStreamIE, ) +from .twitter import TwitterCardIE from .ubu import UbuIE from .udemy import ( UdemyIE, diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 49e4dc710..81623bfe3 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -22,6 +22,7 @@ from ..compat import ( compat_str, ) from ..utils import ( + NO_DEFAULT, age_restricted, bug_reports_message, clean_html, @@ -33,7 +34,6 @@ from ..utils import ( sanitize_filename, unescapeHTML, ) -_NO_DEFAULT = object() class InfoExtractor(object): @@ -523,7 +523,7 @@ class InfoExtractor(object): video_info['description'] = playlist_description return video_info - def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None): + def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None): """ Perform a regex search on the given string, using a single or a list of patterns returning the first matching group. @@ -549,7 +549,7 @@ class InfoExtractor(object): return next(g for g in mobj.groups() if g is not None) else: return mobj.group(group) - elif default is not _NO_DEFAULT: + elif default is not NO_DEFAULT: return default elif fatal: raise RegexNotFoundError('Unable to extract %s' % _name) @@ -557,7 +557,7 @@ class InfoExtractor(object): self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message()) return None - def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None): + def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None): """ Like _search_regex, but strips HTML tags and unescapes entities. """ diff --git a/youtube_dl/extractor/empflix.py b/youtube_dl/extractor/empflix.py deleted file mode 100644 index 4827022e0..000000000 --- a/youtube_dl/extractor/empflix.py +++ /dev/null @@ -1,31 +0,0 @@ -from __future__ import unicode_literals - -from .tnaflix import TNAFlixIE - - -class EMPFlixIE(TNAFlixIE): - _VALID_URL = r'https?://(?:www\.)?empflix\.com/videos/(?P<display_id>.+?)-(?P<id>[0-9]+)\.html' - - _TITLE_REGEX = r'name="title" value="(?P<title>[^"]*)"' - _DESCRIPTION_REGEX = r'name="description" value="([^"]*)"' - _CONFIG_REGEX = r'flashvars\.config\s*=\s*escape\("([^"]+)"' - - _TESTS = [ - { - 'url': 'http://www.empflix.com/videos/Amateur-Finger-Fuck-33051.html', - 'md5': 'b1bc15b6412d33902d6e5952035fcabc', - 'info_dict': { - 'id': '33051', - 'display_id': 'Amateur-Finger-Fuck', - 'ext': 'mp4', - 'title': 'Amateur Finger Fuck', - 'description': 'Amateur solo finger fucking.', - 'thumbnail': 're:https?://.*\.jpg$', - 'age_limit': 18, - } - }, - { - 'url': 'http://www.empflix.com/videos/[AROMA][ARMD-718]-Aoi-Yoshino-Sawa-25826.html', - 'only_matching': True, - } - ] diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 42e4e7035..32e41d13e 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -47,6 +47,7 @@ from .xhamster import XHamsterEmbedIE from .vimeo import VimeoIE from .dailymotion import DailymotionCloudIE from .onionstudios import OnionStudiosIE +from .snagfilms import SnagFilmsEmbedIE class GenericIE(InfoExtractor): @@ -849,6 +850,15 @@ class GenericIE(InfoExtractor): 'uploader_id': 'clickhole', } }, + # SnagFilms embed + { + 'url': 'http://whilewewatch.blogspot.ru/2012/06/whilewewatch-whilewewatch-gripping.html', + 'info_dict': { + 'id': '74849a00-85a9-11e1-9660-123139220831', + 'ext': 'mp4', + 'title': '#whilewewatch', + } + }, # AdobeTVVideo embed { 'url': 'https://helpx.adobe.com/acrobat/how-to/new-experience-acrobat-dc.html?set=acrobat--get-started--essential-beginners', @@ -1550,6 +1560,11 @@ class GenericIE(InfoExtractor): if onionstudios_url: return self.url_result(onionstudios_url) + # Look for SnagFilms embeds + snagfilms_url = SnagFilmsEmbedIE._extract_url(webpage) + if snagfilms_url: + return self.url_result(snagfilms_url) + # Look for AdobeTVVideo embeds mobj = re.search( r'<iframe[^>]+src=[\'"]((?:https?:)?//video\.tv\.adobe\.com/v/\d+[^"]+)[\'"]', diff --git a/youtube_dl/extractor/newstube.py b/youtube_dl/extractor/newstube.py index 85fcad06b..5a9e73cd6 100644 --- a/youtube_dl/extractor/newstube.py +++ b/youtube_dl/extractor/newstube.py @@ -31,7 +31,7 @@ class NewstubeIE(InfoExtractor): page = self._download_webpage(url, video_id, 'Downloading page') video_guid = self._html_search_regex( - r'<meta property="og:video" content="https?://(?:www\.)?newstube\.ru/freshplayer\.swf\?guid=(?P<guid>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})', + r'<meta property="og:video:url" content="https?://(?:www\.)?newstube\.ru/freshplayer\.swf\?guid=(?P<guid>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})', page, 'video GUID') player = self._download_xml( diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index cc70c2950..9e4581cf9 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -13,7 +13,7 @@ from ..utils import ( class NRKIE(InfoExtractor): - _VALID_URL = r'(?:nrk:|http://(?:www\.)?nrk\.no/video/PS\*)(?P<id>\d+)' + _VALID_URL = r'(?:nrk:|https?://(?:www\.)?nrk\.no/video/PS\*)(?P<id>\d+)' _TESTS = [ { @@ -76,7 +76,7 @@ class NRKIE(InfoExtractor): class NRKPlaylistIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?nrk\.no/(?!video)(?:[^/]+/)+(?P<id>[^/]+)' + _VALID_URL = r'https?://(?:www\.)?nrk\.no/(?!video)(?:[^/]+/)+(?P<id>[^/]+)' _TESTS = [{ 'url': 'http://www.nrk.no/troms/gjenopplev-den-historiske-solformorkelsen-1.12270763', @@ -116,11 +116,11 @@ class NRKPlaylistIE(InfoExtractor): class NRKTVIE(InfoExtractor): - _VALID_URL = r'(?P<baseurl>http://tv\.nrk(?:super)?\.no/)(?:serie/[^/]+|program)/(?P<id>[a-zA-Z]{4}\d{8})(?:/\d{2}-\d{2}-\d{4})?(?:#del=(?P<part_id>\d+))?' + _VALID_URL = r'(?P<baseurl>https?://tv\.nrk(?:super)?\.no/)(?:serie/[^/]+|program)/(?P<id>[a-zA-Z]{4}\d{8})(?:/\d{2}-\d{2}-\d{4})?(?:#del=(?P<part_id>\d+))?' _TESTS = [ { - 'url': 'http://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014', + 'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014', 'md5': 'adf2c5454fa2bf032f47a9f8fb351342', 'info_dict': { 'id': 'MUHH48000314', @@ -132,7 +132,7 @@ class NRKTVIE(InfoExtractor): }, }, { - 'url': 'http://tv.nrk.no/program/mdfp15000514', + 'url': 'https://tv.nrk.no/program/mdfp15000514', 'md5': '383650ece2b25ecec996ad7b5bb2a384', 'info_dict': { 'id': 'mdfp15000514', @@ -145,7 +145,7 @@ class NRKTVIE(InfoExtractor): }, { # single playlist video - 'url': 'http://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015#del=2', + 'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015#del=2', 'md5': 'adbd1dbd813edaf532b0a253780719c2', 'info_dict': { 'id': 'MSPO40010515-part2', @@ -157,7 +157,7 @@ class NRKTVIE(InfoExtractor): 'skip': 'Only works from Norway', }, { - 'url': 'http://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015', + 'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015', 'playlist': [ { 'md5': '9480285eff92d64f06e02a5367970a7a', diff --git a/youtube_dl/extractor/planetaplay.py b/youtube_dl/extractor/planetaplay.py index 596c621d7..06505e96f 100644 --- a/youtube_dl/extractor/planetaplay.py +++ b/youtube_dl/extractor/planetaplay.py @@ -18,7 +18,8 @@ class PlanetaPlayIE(InfoExtractor): 'id': '3586', 'ext': 'flv', 'title': 'md5:e829428ee28b1deed00de90de49d1da1', - } + }, + 'skip': 'Not accessible from Travis CI server', } _SONG_FORMATS = { diff --git a/youtube_dl/extractor/quickvid.py b/youtube_dl/extractor/quickvid.py index af7d76cf4..f414e2384 100644 --- a/youtube_dl/extractor/quickvid.py +++ b/youtube_dl/extractor/quickvid.py @@ -24,6 +24,7 @@ class QuickVidIE(InfoExtractor): 'thumbnail': 're:^https?://.*\.(?:png|jpg|gif)$', 'view_count': int, }, + 'skip': 'Not accessible from Travis CI server', } def _real_extract(self, url): diff --git a/youtube_dl/extractor/smotri.py b/youtube_dl/extractor/smotri.py index 24746a09a..93a7cfe15 100644 --- a/youtube_dl/extractor/smotri.py +++ b/youtube_dl/extractor/smotri.py @@ -53,7 +53,7 @@ class SmotriIE(InfoExtractor): 'thumbnail': 'http://frame4.loadup.ru/03/ed/57591.2.3.jpg', }, }, - # video-password + # video-password, not approved by moderator { 'url': 'http://smotri.com/video/view/?id=v1390466a13c', 'md5': 'f6331cef33cad65a0815ee482a54440b', @@ -71,7 +71,24 @@ class SmotriIE(InfoExtractor): }, 'skip': 'Video is not approved by moderator', }, - # age limit + video-password + # video-password + { + 'url': 'http://smotri.com/video/view/?id=v6984858774#', + 'md5': 'f11e01d13ac676370fc3b95b9bda11b0', + 'info_dict': { + 'id': 'v6984858774', + 'ext': 'mp4', + 'title': 'Дача Солженицина ПАРОЛЬ 223322', + 'uploader': 'psavari1', + 'uploader_id': 'psavari1', + 'upload_date': '20081103', + 'thumbnail': 're:^https?://.*\.jpg$', + }, + 'params': { + 'videopassword': '223322', + }, + }, + # age limit + video-password, not approved by moderator { 'url': 'http://smotri.com/video/view/?id=v15408898bcf', 'md5': '91e909c9f0521adf5ee86fbe073aad70', @@ -90,19 +107,22 @@ class SmotriIE(InfoExtractor): }, 'skip': 'Video is not approved by moderator', }, - # not approved by moderator, but available + # age limit + video-password { - 'url': 'http://smotri.com/video/view/?id=v28888533b73', - 'md5': 'f44bc7adac90af518ef1ecf04893bb34', + 'url': 'http://smotri.com/video/view/?id=v7780025814', + 'md5': 'b4599b068422559374a59300c5337d72', 'info_dict': { - 'id': 'v28888533b73', + 'id': 'v7780025814', 'ext': 'mp4', - 'title': 'Russian Spies Killed By ISIL Child Soldier', - 'uploader': 'Mopeder', - 'uploader_id': 'mopeder', - 'duration': 71, - 'thumbnail': 'http://frame9.loadup.ru/d7/32/2888853.2.3.jpg', - 'upload_date': '20150114', + 'title': 'Sexy Beach (пароль 123)', + 'uploader': 'вАся', + 'uploader_id': 'asya_prosto', + 'upload_date': '20081218', + 'thumbnail': 're:^https?://.*\.jpg$', + 'age_limit': 18, + }, + 'params': { + 'videopassword': '123' }, }, # swf player @@ -152,6 +172,10 @@ class SmotriIE(InfoExtractor): 'getvideoinfo': '1', } + video_password = self._downloader.params.get('videopassword', None) + if video_password: + video_form['pass'] = hashlib.md5(video_password.encode('utf-8')).hexdigest() + request = compat_urllib_request.Request( 'http://smotri.com/video/view/url/bot/', compat_urllib_parse.urlencode(video_form)) request.add_header('Content-Type', 'application/x-www-form-urlencoded') @@ -161,13 +185,18 @@ class SmotriIE(InfoExtractor): video_url = video.get('_vidURL') or video.get('_vidURL_mp4') if not video_url: - if video.get('_moderate_no') or not video.get('moderated'): + if video.get('_moderate_no'): raise ExtractorError( 'Video %s has not been approved by moderator' % video_id, expected=True) if video.get('error'): raise ExtractorError('Video %s does not exist' % video_id, expected=True) + if video.get('_pass_protected') == 1: + msg = ('Invalid video password' if video_password + else 'This video is protected by a password, use the --video-password option') + raise ExtractorError(msg, expected=True) + title = video['title'] thumbnail = video['_imgURL'] upload_date = unified_strdate(video['added']) diff --git a/youtube_dl/extractor/snagfilms.py b/youtube_dl/extractor/snagfilms.py new file mode 100644 index 000000000..cf495f310 --- /dev/null +++ b/youtube_dl/extractor/snagfilms.py @@ -0,0 +1,171 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + clean_html, + determine_ext, + int_or_none, + js_to_json, + parse_duration, +) + + +class SnagFilmsEmbedIE(InfoExtractor): + _VALID_URL = r'https?://(?:(?:www|embed)\.)?snagfilms\.com/embed/player\?.*\bfilmId=(?P<id>[\da-f-]{36})' + _TESTS = [{ + 'url': 'http://embed.snagfilms.com/embed/player?filmId=74849a00-85a9-11e1-9660-123139220831&w=500', + 'md5': '2924e9215c6eff7a55ed35b72276bd93', + 'info_dict': { + 'id': '74849a00-85a9-11e1-9660-123139220831', + 'ext': 'mp4', + 'title': '#whilewewatch', + } + }, { + 'url': 'http://www.snagfilms.com/embed/player?filmId=0000014c-de2f-d5d6-abcf-ffef58af0017', + 'only_matching': True, + }] + + @staticmethod + def _extract_url(webpage): + mobj = re.search( + r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:embed\.)?snagfilms\.com/embed/player.+?)\1', + webpage) + if mobj: + return mobj.group('url') + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + if '>This film is not playable in your area.<' in webpage: + raise ExtractorError( + 'Film %s is not playable in your area.' % video_id, expected=True) + + formats = [] + for source in self._parse_json(js_to_json(self._search_regex( + r'(?s)sources:\s*(\[.+?\]),', webpage, 'json')), video_id): + file_ = source.get('file') + if not file_: + continue + type_ = source.get('type') + format_id = source.get('label') + ext = determine_ext(file_) + if any(_ == 'm3u8' for _ in (type_, ext)): + formats.extend(self._extract_m3u8_formats( + file_, video_id, 'mp4', m3u8_id='hls')) + else: + bitrate = int_or_none(self._search_regex( + r'(\d+)kbps', file_, 'bitrate', default=None)) + height = int_or_none(self._search_regex( + r'^(\d+)[pP]$', format_id, 'height', default=None)) + formats.append({ + 'url': file_, + 'format_id': format_id, + 'tbr': bitrate, + 'height': height, + }) + self._sort_formats(formats) + + title = self._search_regex( + [r"title\s*:\s*'([^']+)'", r'<title>([^<]+)</title>'], + webpage, 'title') + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + } + + +class SnagFilmsIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?snagfilms\.com/(?:films/title|show)/(?P<id>[^?#]+)' + _TESTS = [{ + 'url': 'http://www.snagfilms.com/films/title/lost_for_life', + 'md5': '19844f897b35af219773fd63bdec2942', + 'info_dict': { + 'id': '0000014c-de2f-d5d6-abcf-ffef58af0017', + 'display_id': 'lost_for_life', + 'ext': 'mp4', + 'title': 'Lost for Life', + 'description': 'md5:fbdacc8bb6b455e464aaf98bc02e1c82', + 'thumbnail': 're:^https?://.*\.jpg', + 'duration': 4489, + 'categories': ['Documentary', 'Crime', 'Award Winning', 'Festivals'] + } + }, { + 'url': 'http://www.snagfilms.com/show/the_world_cut_project/india', + 'md5': 'e6292e5b837642bbda82d7f8bf3fbdfd', + 'info_dict': { + 'id': '00000145-d75c-d96e-a9c7-ff5c67b20000', + 'display_id': 'the_world_cut_project/india', + 'ext': 'mp4', + 'title': 'India', + 'description': 'md5:5c168c5a8f4719c146aad2e0dfac6f5f', + 'thumbnail': 're:^https?://.*\.jpg', + 'duration': 979, + 'categories': ['Documentary', 'Sports', 'Politics'] + } + }, { + # Film is not playable in your area. + 'url': 'http://www.snagfilms.com/films/title/inside_mecca', + 'only_matching': True, + }, { + # Film is not available. + 'url': 'http://www.snagfilms.com/show/augie_alone/flirting', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + if ">Sorry, the Film you're looking for is not available.<" in webpage: + raise ExtractorError( + 'Film %s is not available.' % display_id, expected=True) + + film_id = self._search_regex(r'filmId=([\da-f-]{36})"', webpage, 'film id') + + snag = self._parse_json( + self._search_regex( + 'Snag\.page\.data\s*=\s*(\[.+?\]);', webpage, 'snag'), + display_id) + + for item in snag: + if item.get('data', {}).get('film', {}).get('id') == film_id: + data = item['data']['film'] + title = data['title'] + description = clean_html(data.get('synopsis')) + thumbnail = data.get('image') + duration = int_or_none(data.get('duration') or data.get('runtime')) + categories = [ + category['title'] for category in data.get('categories', []) + if category.get('title')] + break + else: + title = self._search_regex( + r'itemprop="title">([^<]+)<', webpage, 'title') + description = self._html_search_regex( + r'(?s)<div itemprop="description" class="film-synopsis-inner ">(.+?)</div>', + webpage, 'description', default=None) or self._og_search_description(webpage) + thumbnail = self._og_search_thumbnail(webpage) + duration = parse_duration(self._search_regex( + r'<span itemprop="duration" class="film-duration strong">([^<]+)<', + webpage, 'duration', fatal=False)) + categories = re.findall(r'<a href="/movies/[^"]+">([^<]+)</a>', webpage) + + return { + '_type': 'url_transparent', + 'url': 'http://embed.snagfilms.com/embed/player?filmId=%s' % film_id, + 'id': film_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'categories': categories, + } diff --git a/youtube_dl/extractor/tnaflix.py b/youtube_dl/extractor/tnaflix.py index c282865b2..49516abca 100644 --- a/youtube_dl/extractor/tnaflix.py +++ b/youtube_dl/extractor/tnaflix.py @@ -3,39 +3,70 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( - parse_duration, fix_xml_ampersands, + float_or_none, + int_or_none, + parse_duration, + str_to_int, + xpath_text, ) -class TNAFlixIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?tnaflix\.com/[^/]+/(?P<display_id>[^/]+)/video(?P<id>\d+)' - - _TITLE_REGEX = r'<title>(.+?) - TNAFlix Porn Videos</title>' - _DESCRIPTION_REGEX = r'<h3 itemprop="description">([^<]+)</h3>' - _CONFIG_REGEX = r'flashvars\.config\s*=\s*escape\("([^"]+)"' - - _TESTS = [ - { - 'url': 'http://www.tnaflix.com/porn-stars/Carmella-Decesare-striptease/video553878', - 'md5': 'ecf3498417d09216374fc5907f9c6ec0', - 'info_dict': { - 'id': '553878', - 'display_id': 'Carmella-Decesare-striptease', - 'ext': 'mp4', - 'title': 'Carmella Decesare - striptease', - 'description': '', - 'thumbnail': 're:https?://.*\.jpg$', - 'duration': 91, - 'age_limit': 18, - } - }, - { - 'url': 'https://www.tnaflix.com/amateur-porn/bunzHD-Ms.Donk/video358632', - 'only_matching': True, - } +class TNAFlixNetworkBaseIE(InfoExtractor): + # May be overridden in descendants if necessary + _CONFIG_REGEX = [ + r'flashvars\.config\s*=\s*escape\("([^"]+)"', + r'<input[^>]+name="config\d?" value="([^"]+)"', ] + _TITLE_REGEX = r'<input[^>]+name="title" value="([^"]+)"' + _DESCRIPTION_REGEX = r'<input[^>]+name="description" value="([^"]+)"' + _UPLOADER_REGEX = r'<input[^>]+name="username" value="([^"]+)"' + _VIEW_COUNT_REGEX = None + _COMMENT_COUNT_REGEX = None + _AVERAGE_RATING_REGEX = None + _CATEGORIES_REGEX = r'<li[^>]*>\s*<span[^>]+class="infoTitle"[^>]*>Categories:</span>\s*<span[^>]+class="listView"[^>]*>(.+?)</span>\s*</li>' + + def _extract_thumbnails(self, flix_xml): + + def get_child(elem, names): + for name in names: + child = elem.find(name) + if child is not None: + return child + + timeline = get_child(flix_xml, ['timeline', 'rolloverBarImage']) + if timeline is None: + return + + pattern_el = get_child(timeline, ['imagePattern', 'pattern']) + if pattern_el is None or not pattern_el.text: + return + + first_el = get_child(timeline, ['imageFirst', 'first']) + last_el = get_child(timeline, ['imageLast', 'last']) + if first_el is None or last_el is None: + return + + first_text = first_el.text + last_text = last_el.text + if not first_text.isdigit() or not last_text.isdigit(): + return + + first = int(first_text) + last = int(last_text) + if first > last: + return + + width = int_or_none(xpath_text(timeline, './imageWidth', 'thumbnail width')) + height = int_or_none(xpath_text(timeline, './imageHeight', 'thumbnail height')) + + return [{ + 'url': self._proto_relative_url(pattern_el.text.replace('#', compat_str(i)), 'http:'), + 'width': width, + 'height': height, + } for i in range(first, last + 1)] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -44,47 +75,195 @@ class TNAFlixIE(InfoExtractor): webpage = self._download_webpage(url, display_id) - title = self._html_search_regex( - self._TITLE_REGEX, webpage, 'title') if self._TITLE_REGEX else self._og_search_title(webpage) - description = self._html_search_regex( - self._DESCRIPTION_REGEX, webpage, 'description', fatal=False, default='') - - age_limit = self._rta_search(webpage) - - duration = parse_duration(self._html_search_meta( - 'duration', webpage, 'duration', default=None)) - cfg_url = self._proto_relative_url(self._html_search_regex( self._CONFIG_REGEX, webpage, 'flashvars.config'), 'http:') cfg_xml = self._download_xml( - cfg_url, display_id, note='Downloading metadata', + cfg_url, display_id, 'Downloading metadata', transform_source=fix_xml_ampersands) - thumbnail = self._proto_relative_url( - cfg_xml.find('./startThumb').text, 'http:') - formats = [] + + def extract_video_url(vl): + return re.sub('speed=\d+', 'speed=', vl.text) + + video_link = cfg_xml.find('./videoLink') + if video_link is not None: + formats.append({ + 'url': extract_video_url(video_link), + 'ext': xpath_text(cfg_xml, './videoConfig/type', 'type', default='flv'), + }) + for item in cfg_xml.findall('./quality/item'): - video_url = re.sub('speed=\d+', 'speed=', item.find('videoLink').text) - format_id = item.find('res').text - fmt = { - 'url': self._proto_relative_url(video_url, 'http:'), + video_link = item.find('./videoLink') + if video_link is None: + continue + res = item.find('res') + format_id = None if res is None else res.text + height = int_or_none(self._search_regex( + r'^(\d+)[pP]', format_id, 'height', default=None)) + formats.append({ + 'url': self._proto_relative_url(extract_video_url(video_link), 'http:'), 'format_id': format_id, - } - m = re.search(r'^(\d+)', format_id) - if m: - fmt['height'] = int(m.group(1)) - formats.append(fmt) + 'height': height, + }) + self._sort_formats(formats) + thumbnail = self._proto_relative_url( + xpath_text(cfg_xml, './startThumb', 'thumbnail'), 'http:') + thumbnails = self._extract_thumbnails(cfg_xml) + + title = self._html_search_regex( + self._TITLE_REGEX, webpage, 'title') if self._TITLE_REGEX else self._og_search_title(webpage) + + age_limit = self._rta_search(webpage) + + duration = parse_duration(self._html_search_meta( + 'duration', webpage, 'duration', default=None)) + + def extract_field(pattern, name): + return self._html_search_regex(pattern, webpage, name, default=None) if pattern else None + + description = extract_field(self._DESCRIPTION_REGEX, 'description') + uploader = extract_field(self._UPLOADER_REGEX, 'uploader') + view_count = str_to_int(extract_field(self._VIEW_COUNT_REGEX, 'view count')) + comment_count = str_to_int(extract_field(self._COMMENT_COUNT_REGEX, 'comment count')) + average_rating = float_or_none(extract_field(self._AVERAGE_RATING_REGEX, 'average rating')) + + categories_str = extract_field(self._CATEGORIES_REGEX, 'categories') + categories = categories_str.split(', ') if categories_str is not None else [] + return { 'id': video_id, 'display_id': display_id, 'title': title, 'description': description, 'thumbnail': thumbnail, + 'thumbnails': thumbnails, 'duration': duration, 'age_limit': age_limit, + 'uploader': uploader, + 'view_count': view_count, + 'comment_count': comment_count, + 'average_rating': average_rating, + 'categories': categories, 'formats': formats, } + + +class TNAFlixIE(TNAFlixNetworkBaseIE): + _VALID_URL = r'https?://(?:www\.)?tnaflix\.com/[^/]+/(?P<display_id>[^/]+)/video(?P<id>\d+)' + + _TITLE_REGEX = r'<title>(.+?) - TNAFlix Porn Videos</title>' + _DESCRIPTION_REGEX = r'<h3 itemprop="description">([^<]+)</h3>' + _UPLOADER_REGEX = r'(?s)<span[^>]+class="infoTitle"[^>]*>Uploaded By:</span>(.+?)<div' + + _TESTS = [{ + # anonymous uploader, no categories + 'url': 'http://www.tnaflix.com/porn-stars/Carmella-Decesare-striptease/video553878', + 'md5': 'ecf3498417d09216374fc5907f9c6ec0', + 'info_dict': { + 'id': '553878', + 'display_id': 'Carmella-Decesare-striptease', + 'ext': 'mp4', + 'title': 'Carmella Decesare - striptease', + 'thumbnail': 're:https?://.*\.jpg$', + 'duration': 91, + 'age_limit': 18, + 'uploader': 'Anonymous', + 'categories': [], + } + }, { + # non-anonymous uploader, categories + 'url': 'https://www.tnaflix.com/teen-porn/Educational-xxx-video/video6538', + 'md5': '0f5d4d490dbfd117b8607054248a07c0', + 'info_dict': { + 'id': '6538', + 'display_id': 'Educational-xxx-video', + 'ext': 'mp4', + 'title': 'Educational xxx video', + 'description': 'md5:b4fab8f88a8621c8fabd361a173fe5b8', + 'thumbnail': 're:https?://.*\.jpg$', + 'duration': 164, + 'age_limit': 18, + 'uploader': 'bobwhite39', + 'categories': ['Amateur Porn', 'Squirting Videos', 'Teen Girls 18+'], + } + }, { + 'url': 'https://www.tnaflix.com/amateur-porn/bunzHD-Ms.Donk/video358632', + 'only_matching': True, + }] + + +class EMPFlixIE(TNAFlixNetworkBaseIE): + _VALID_URL = r'https?://(?:www\.)?empflix\.com/videos/(?P<display_id>.+?)-(?P<id>[0-9]+)\.html' + + _UPLOADER_REGEX = r'<span[^>]+class="infoTitle"[^>]*>Uploaded By:</span>(.+?)</li>' + + _TESTS = [{ + 'url': 'http://www.empflix.com/videos/Amateur-Finger-Fuck-33051.html', + 'md5': 'b1bc15b6412d33902d6e5952035fcabc', + 'info_dict': { + 'id': '33051', + 'display_id': 'Amateur-Finger-Fuck', + 'ext': 'mp4', + 'title': 'Amateur Finger Fuck', + 'description': 'Amateur solo finger fucking.', + 'thumbnail': 're:https?://.*\.jpg$', + 'duration': 83, + 'age_limit': 18, + 'uploader': 'cwbike', + 'categories': ['Amateur', 'Anal', 'Fisting', 'Home made', 'Solo'], + } + }, { + 'url': 'http://www.empflix.com/videos/[AROMA][ARMD-718]-Aoi-Yoshino-Sawa-25826.html', + 'only_matching': True, + }] + + +class MovieFapIE(TNAFlixNetworkBaseIE): + _VALID_URL = r'https?://(?:www\.)?moviefap\.com/videos/(?P<id>[0-9a-f]+)/(?P<display_id>[^/]+)\.html' + + _VIEW_COUNT_REGEX = r'<br>Views\s*<strong>([\d,.]+)</strong>' + _COMMENT_COUNT_REGEX = r'<span[^>]+id="comCount"[^>]*>([\d,.]+)</span>' + _AVERAGE_RATING_REGEX = r'Current Rating\s*<br>\s*<strong>([\d.]+)</strong>' + _CATEGORIES_REGEX = r'(?s)<div[^>]+id="vid_info"[^>]*>\s*<div[^>]*>.+?</div>(.*?)<br>' + + _TESTS = [{ + # normal, multi-format video + 'url': 'http://www.moviefap.com/videos/be9867c9416c19f54a4a/experienced-milf-amazing-handjob.html', + 'md5': '26624b4e2523051b550067d547615906', + 'info_dict': { + 'id': 'be9867c9416c19f54a4a', + 'display_id': 'experienced-milf-amazing-handjob', + 'ext': 'mp4', + 'title': 'Experienced MILF Amazing Handjob', + 'description': 'Experienced MILF giving an Amazing Handjob', + 'thumbnail': 're:https?://.*\.jpg$', + 'age_limit': 18, + 'uploader': 'darvinfred06', + 'view_count': int, + 'comment_count': int, + 'average_rating': float, + 'categories': ['Amateur', 'Masturbation', 'Mature', 'Flashing'], + } + }, { + # quirky single-format case where the extension is given as fid, but the video is really an flv + 'url': 'http://www.moviefap.com/videos/e5da0d3edce5404418f5/jeune-couple-russe.html', + 'md5': 'fa56683e291fc80635907168a743c9ad', + 'info_dict': { + 'id': 'e5da0d3edce5404418f5', + 'display_id': 'jeune-couple-russe', + 'ext': 'flv', + 'title': 'Jeune Couple Russe', + 'description': 'Amateur', + 'thumbnail': 're:https?://.*\.jpg$', + 'age_limit': 18, + 'uploader': 'whiskeyjar', + 'view_count': int, + 'comment_count': int, + 'average_rating': float, + 'categories': ['Amateur', 'Teen'], + } + }] diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index 94bd6345d..b56ee2959 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -189,17 +189,17 @@ class TwitchVodIE(TwitchItemBaseIE): _ITEM_SHORTCUT = 'v' _TEST = { - 'url': 'http://www.twitch.tv/ksptv/v/3622000', + 'url': 'http://www.twitch.tv/riotgames/v/6528877', 'info_dict': { - 'id': 'v3622000', + 'id': 'v6528877', 'ext': 'mp4', - 'title': '''KSPTV: Squadcast: "Everyone's on vacation so here's Dahud" Edition!''', + 'title': 'LCK Summer Split - Week 6 Day 1', 'thumbnail': 're:^https?://.*\.jpg$', - 'duration': 6951, - 'timestamp': 1419028564, - 'upload_date': '20141219', - 'uploader': 'KSPTV', - 'uploader_id': 'ksptv', + 'duration': 17208, + 'timestamp': 1435131709, + 'upload_date': '20150624', + 'uploader': 'Riot Games', + 'uploader_id': 'riotgames', 'view_count': int, }, 'params': { @@ -215,7 +215,7 @@ class TwitchVodIE(TwitchItemBaseIE): '%s/api/vods/%s/access_token' % (self._API_BASE, item_id), item_id, 'Downloading %s access token' % self._ITEM_TYPE) formats = self._extract_m3u8_formats( - '%s/vod/%s?nauth=%s&nauthsig=%s' + '%s/vod/%s?nauth=%s&nauthsig=%s&allow_source=true' % (self._USHER_BASE, item_id, access_token['token'], access_token['sig']), item_id, 'mp4') self._prefer_source(formats) diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py new file mode 100644 index 000000000..1aaa06305 --- /dev/null +++ b/youtube_dl/extractor/twitter.py @@ -0,0 +1,72 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_urllib_request +from ..utils import ( + float_or_none, + unescapeHTML, +) + + +class TwitterCardIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?twitter\.com/i/cards/tfw/v1/(?P<id>\d+)' + _TEST = { + 'url': 'https://twitter.com/i/cards/tfw/v1/560070183650213889', + 'md5': 'a74f50b310c83170319ba16de6955192', + 'info_dict': { + 'id': '560070183650213889', + 'ext': 'mp4', + 'title': 'TwitterCard', + 'thumbnail': 're:^https?://.*\.jpg$', + 'duration': 30.033, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + # Different formats served for different User-Agents + USER_AGENTS = [ + 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)', # mp4 + 'Mozilla/5.0 (Windows NT 5.2; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0', # webm + ] + + config = None + formats = [] + for user_agent in USER_AGENTS: + request = compat_urllib_request.Request(url) + request.add_header('User-Agent', user_agent) + webpage = self._download_webpage(request, video_id) + + config = self._parse_json( + unescapeHTML(self._search_regex( + r'data-player-config="([^"]+)"', webpage, 'data player config')), + video_id) + + video_url = config['playlist'][0]['source'] + + f = { + 'url': video_url, + } + + m = re.search(r'/(?P<width>\d+)x(?P<height>\d+)/', video_url) + if m: + f.update({ + 'width': int(m.group('width')), + 'height': int(m.group('height')), + }) + formats.append(f) + self._sort_formats(formats) + + thumbnail = config.get('posterImageUrl') + duration = float_or_none(config.get('duration')) + + return { + 'id': video_id, + 'title': 'TwitterCard', + 'thumbnail': thumbnail, + 'duration': duration, + 'formats': formats, + } diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index 38ff3c1a9..f2ae109f9 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -121,20 +121,27 @@ class VKIE(InfoExtractor): if username is None: return - login_form = { - 'act': 'login', - 'role': 'al_frame', - 'expire': '1', + login_page = self._download_webpage( + 'https://vk.com', None, 'Downloading login page') + + login_form = dict(re.findall( + r'<input\s+type="hidden"\s+name="([^"]+)"\s+(?:id="[^"]+"\s+)?value="([^"]*)"', + login_page)) + + login_form.update({ 'email': username.encode('cp1251'), 'pass': password.encode('cp1251'), - } + }) - request = compat_urllib_request.Request('https://login.vk.com/?act=login', - compat_urllib_parse.urlencode(login_form).encode('utf-8')) - login_page = self._download_webpage(request, None, note='Logging in as %s' % username) + request = compat_urllib_request.Request( + 'https://login.vk.com/?act=login', + compat_urllib_parse.urlencode(login_form).encode('utf-8')) + login_page = self._download_webpage( + request, None, note='Logging in as %s' % username) if re.search(r'onLoginFailed', login_page): - raise ExtractorError('Unable to login, incorrect username and/or password', expected=True) + raise ExtractorError( + 'Unable to login, incorrect username and/or password', expected=True) def _real_initialize(self): self._login() diff --git a/youtube_dl/extractor/vube.py b/youtube_dl/extractor/vube.py index 405cb9db4..149e36467 100644 --- a/youtube_dl/extractor/vube.py +++ b/youtube_dl/extractor/vube.py @@ -36,6 +36,7 @@ class VubeIE(InfoExtractor): 'comment_count': int, 'categories': ['amazing', 'hd', 'best drummer ever', 'william wei', 'bucket drumming', 'street drummer', 'epic street drumming'], }, + 'skip': 'Not accessible from Travis CI server', }, { 'url': 'http://vube.com/Chiara+Grispo+Video+Channel/YL2qNPkqon', 'md5': 'db7aba89d4603dadd627e9d1973946fe', diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index c9d8e5125..c28ca9319 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -29,9 +29,11 @@ from ..utils import ( get_element_by_id, int_or_none, orderedSet, + str_to_int, unescapeHTML, unified_strdate, uppercase_escape, + ISO3166Utils, ) @@ -518,6 +520,20 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'skip_download': 'requires avconv', } }, + # Extraction from multiple DASH manifests (https://github.com/rg3/youtube-dl/pull/6097) + { + 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y', + 'info_dict': { + 'id': 'FIl7x6_3R5Y', + 'ext': 'mp4', + 'title': 'md5:7b81415841e02ecd4313668cde88737a', + 'description': 'md5:116377fd2963b81ec4ce64b542173306', + 'upload_date': '20150625', + 'uploader_id': 'dorappi2000', + 'uploader': 'dorappi2000', + 'formats': 'mincount:33', + }, + } ] def __init__(self, *args, **kwargs): @@ -859,6 +875,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor): else: player_url = None + dash_mpds = [] + + def add_dash_mpd(video_info): + dash_mpd = video_info.get('dashmpd') + if dash_mpd and dash_mpd[0] not in dash_mpds: + dash_mpds.append(dash_mpd[0]) + # Get video info embed_webpage = None if re.search(r'player-age-gate-content">', video_webpage) is not None: @@ -879,24 +902,29 @@ class YoutubeIE(YoutubeBaseInfoExtractor): note='Refetching age-gated info webpage', errnote='unable to download video info webpage') video_info = compat_parse_qs(video_info_webpage) + add_dash_mpd(video_info) else: age_gate = False - try: - # Try looking directly into the video webpage - mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage) - if not mobj: - raise ValueError('Could not find ytplayer.config') # caught below + video_info = None + # Try looking directly into the video webpage + mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage) + if mobj: json_code = uppercase_escape(mobj.group(1)) ytplayer_config = json.loads(json_code) args = ytplayer_config['args'] - # Convert to the same format returned by compat_parse_qs - video_info = dict((k, [v]) for k, v in args.items()) - if not args.get('url_encoded_fmt_stream_map'): - raise ValueError('No stream_map present') # caught below - except ValueError: - # We fallback to the get_video_info pages (used by the embed page) + if args.get('url_encoded_fmt_stream_map'): + # Convert to the same format returned by compat_parse_qs + video_info = dict((k, [v]) for k, v in args.items()) + add_dash_mpd(video_info) + if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True): + # We also try looking in get_video_info since it may contain different dashmpd + # URL that points to a DASH manifest with possibly different itag set (some itags + # are missing from DASH manifest pointed by webpage's dashmpd, some - from DASH + # manifest pointed by get_video_info's dashmpd). + # The general idea is to take a union of itags of both DASH manifests (for example + # video with such 'manifest behavior' see https://github.com/rg3/youtube-dl/issues/6093) self.report_video_info_webpage_download(video_id) - for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']: + for el_type in ['&el=info', '&el=embedded', '&el=detailpage', '&el=vevo', '']: video_info_url = ( '%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en' % (proto, video_id, el_type)) @@ -904,11 +932,20 @@ class YoutubeIE(YoutubeBaseInfoExtractor): video_info_url, video_id, note=False, errnote='unable to download video info webpage') - video_info = compat_parse_qs(video_info_webpage) - if 'token' in video_info: + get_video_info = compat_parse_qs(video_info_webpage) + add_dash_mpd(get_video_info) + if not video_info: + video_info = get_video_info + if 'token' in get_video_info: break if 'token' not in video_info: if 'reason' in video_info: + if 'The uploader has not made this video available in your country.' in video_info['reason']: + regions_allowed = self._html_search_meta('regionsAllowed', video_webpage, default=None) + if regions_allowed is not None: + raise ExtractorError('YouTube said: This video is available in %s only' % ( + ', '.join(map(ISO3166Utils.short2full, regions_allowed.split(',')))), + expected=True) raise ExtractorError( 'YouTube said: %s' % video_info['reason'][0], expected=True, video_id=video_id) @@ -1004,12 +1041,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): video_description = '' def _extract_count(count_name): - count = self._search_regex( - r'id="watch-%s"[^>]*>.*?([\d,]+)\s*</span>' % re.escape(count_name), - video_webpage, count_name, default=None) - if count is not None: - return int(count.replace(',', '')) - return None + return str_to_int(self._search_regex( + r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>' + % re.escape(count_name), + video_webpage, count_name, default=None)) + like_count = _extract_count('like') dislike_count = _extract_count('dislike') @@ -1124,24 +1160,25 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # Look for the DASH manifest if self._downloader.params.get('youtube_include_dash_manifest', True): - dash_mpd = video_info.get('dashmpd') - if dash_mpd: - dash_manifest_url = dash_mpd[0] + for dash_manifest_url in dash_mpds: + dash_formats = {} try: - dash_formats = self._parse_dash_manifest( - video_id, dash_manifest_url, player_url, age_gate) + for df in self._parse_dash_manifest( + video_id, dash_manifest_url, player_url, age_gate): + # Do not overwrite DASH format found in some previous DASH manifest + if df['format_id'] not in dash_formats: + dash_formats[df['format_id']] = df except (ExtractorError, KeyError) as e: self.report_warning( 'Skipping DASH manifest: %r' % e, video_id) - else: + if dash_formats: # Remove the formats we found through non-DASH, they # contain less info and it can be wrong, because we use # fixed values (for example the resolution). See # https://github.com/rg3/youtube-dl/issues/5774 for an # example. - dash_keys = set(df['format_id'] for df in dash_formats) - formats = [f for f in formats if f['format_id'] not in dash_keys] - formats.extend(dash_formats) + formats = [f for f in formats if f['format_id'] not in dash_formats.keys()] + formats.extend(dash_formats.values()) # Check for malformed aspect ratio stretched_m = re.search( diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 6aeca61ee..4762e1e3c 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -346,12 +346,13 @@ def parseOpts(overrideArguments=None): video_format.add_option( '--youtube-skip-dash-manifest', action='store_false', dest='youtube_include_dash_manifest', - help='Do not download the DASH manifest on YouTube videos') + help='Do not download the DASH manifests and related data on YouTube videos') video_format.add_option( '--merge-output-format', action='store', dest='merge_output_format', metavar='FORMAT', default=None, help=( - 'If a merge is required (e.g. bestvideo+bestaudio), output to given container format. One of mkv, mp4, ogg, webm, flv.' + 'If a merge is required (e.g. bestvideo+bestaudio), ' + 'output to given container format. One of mkv, mp4, ogg, webm, flv. ' 'Ignored if no merge is required')) subtitles = optparse.OptionGroup(parser, 'Subtitle Options') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index a2746b2d1..942f76d24 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -62,6 +62,8 @@ std_headers = { } +NO_DEFAULT = object() + ENGLISH_MONTH_NAMES = [ 'January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December'] @@ -171,13 +173,15 @@ def xpath_with_ns(path, ns_map): return '/'.join(replaced) -def xpath_text(node, xpath, name=None, fatal=False): +def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT): if sys.version_info < (2, 7): # Crazy 2.6 xpath = xpath.encode('ascii') n = node.find(xpath) if n is None or n.text is None: - if fatal: + if default is not NO_DEFAULT: + return default + elif fatal: name = xpath if name is None else name raise ExtractorError('Could not find XML element %s' % name) else: @@ -2084,6 +2088,266 @@ class ISO639Utils(object): return short_name +class ISO3166Utils(object): + # From http://data.okfn.org/data/core/country-list + _country_map = { + 'AF': 'Afghanistan', + 'AX': 'Åland Islands', + 'AL': 'Albania', + 'DZ': 'Algeria', + 'AS': 'American Samoa', + 'AD': 'Andorra', + 'AO': 'Angola', + 'AI': 'Anguilla', + 'AQ': 'Antarctica', + 'AG': 'Antigua and Barbuda', + 'AR': 'Argentina', + 'AM': 'Armenia', + 'AW': 'Aruba', + 'AU': 'Australia', + 'AT': 'Austria', + 'AZ': 'Azerbaijan', + 'BS': 'Bahamas', + 'BH': 'Bahrain', + 'BD': 'Bangladesh', + 'BB': 'Barbados', + 'BY': 'Belarus', + 'BE': 'Belgium', + 'BZ': 'Belize', + 'BJ': 'Benin', + 'BM': 'Bermuda', + 'BT': 'Bhutan', + 'BO': 'Bolivia, Plurinational State of', + 'BQ': 'Bonaire, Sint Eustatius and Saba', + 'BA': 'Bosnia and Herzegovina', + 'BW': 'Botswana', + 'BV': 'Bouvet Island', + 'BR': 'Brazil', + 'IO': 'British Indian Ocean Territory', + 'BN': 'Brunei Darussalam', + 'BG': 'Bulgaria', + 'BF': 'Burkina Faso', + 'BI': 'Burundi', + 'KH': 'Cambodia', + 'CM': 'Cameroon', + 'CA': 'Canada', + 'CV': 'Cape Verde', + 'KY': 'Cayman Islands', + 'CF': 'Central African Republic', + 'TD': 'Chad', + 'CL': 'Chile', + 'CN': 'China', + 'CX': 'Christmas Island', + 'CC': 'Cocos (Keeling) Islands', + 'CO': 'Colombia', + 'KM': 'Comoros', + 'CG': 'Congo', + 'CD': 'Congo, the Democratic Republic of the', + 'CK': 'Cook Islands', + 'CR': 'Costa Rica', + 'CI': 'Côte d\'Ivoire', + 'HR': 'Croatia', + 'CU': 'Cuba', + 'CW': 'Curaçao', + 'CY': 'Cyprus', + 'CZ': 'Czech Republic', + 'DK': 'Denmark', + 'DJ': 'Djibouti', + 'DM': 'Dominica', + 'DO': 'Dominican Republic', + 'EC': 'Ecuador', + 'EG': 'Egypt', + 'SV': 'El Salvador', + 'GQ': 'Equatorial Guinea', + 'ER': 'Eritrea', + 'EE': 'Estonia', + 'ET': 'Ethiopia', + 'FK': 'Falkland Islands (Malvinas)', + 'FO': 'Faroe Islands', + 'FJ': 'Fiji', + 'FI': 'Finland', + 'FR': 'France', + 'GF': 'French Guiana', + 'PF': 'French Polynesia', + 'TF': 'French Southern Territories', + 'GA': 'Gabon', + 'GM': 'Gambia', + 'GE': 'Georgia', + 'DE': 'Germany', + 'GH': 'Ghana', + 'GI': 'Gibraltar', + 'GR': 'Greece', + 'GL': 'Greenland', + 'GD': 'Grenada', + 'GP': 'Guadeloupe', + 'GU': 'Guam', + 'GT': 'Guatemala', + 'GG': 'Guernsey', + 'GN': 'Guinea', + 'GW': 'Guinea-Bissau', + 'GY': 'Guyana', + 'HT': 'Haiti', + 'HM': 'Heard Island and McDonald Islands', + 'VA': 'Holy See (Vatican City State)', + 'HN': 'Honduras', + 'HK': 'Hong Kong', + 'HU': 'Hungary', + 'IS': 'Iceland', + 'IN': 'India', + 'ID': 'Indonesia', + 'IR': 'Iran, Islamic Republic of', + 'IQ': 'Iraq', + 'IE': 'Ireland', + 'IM': 'Isle of Man', + 'IL': 'Israel', + 'IT': 'Italy', + 'JM': 'Jamaica', + 'JP': 'Japan', + 'JE': 'Jersey', + 'JO': 'Jordan', + 'KZ': 'Kazakhstan', + 'KE': 'Kenya', + 'KI': 'Kiribati', + 'KP': 'Korea, Democratic People\'s Republic of', + 'KR': 'Korea, Republic of', + 'KW': 'Kuwait', + 'KG': 'Kyrgyzstan', + 'LA': 'Lao People\'s Democratic Republic', + 'LV': 'Latvia', + 'LB': 'Lebanon', + 'LS': 'Lesotho', + 'LR': 'Liberia', + 'LY': 'Libya', + 'LI': 'Liechtenstein', + 'LT': 'Lithuania', + 'LU': 'Luxembourg', + 'MO': 'Macao', + 'MK': 'Macedonia, the Former Yugoslav Republic of', + 'MG': 'Madagascar', + 'MW': 'Malawi', + 'MY': 'Malaysia', + 'MV': 'Maldives', + 'ML': 'Mali', + 'MT': 'Malta', + 'MH': 'Marshall Islands', + 'MQ': 'Martinique', + 'MR': 'Mauritania', + 'MU': 'Mauritius', + 'YT': 'Mayotte', + 'MX': 'Mexico', + 'FM': 'Micronesia, Federated States of', + 'MD': 'Moldova, Republic of', + 'MC': 'Monaco', + 'MN': 'Mongolia', + 'ME': 'Montenegro', + 'MS': 'Montserrat', + 'MA': 'Morocco', + 'MZ': 'Mozambique', + 'MM': 'Myanmar', + 'NA': 'Namibia', + 'NR': 'Nauru', + 'NP': 'Nepal', + 'NL': 'Netherlands', + 'NC': 'New Caledonia', + 'NZ': 'New Zealand', + 'NI': 'Nicaragua', + 'NE': 'Niger', + 'NG': 'Nigeria', + 'NU': 'Niue', + 'NF': 'Norfolk Island', + 'MP': 'Northern Mariana Islands', + 'NO': 'Norway', + 'OM': 'Oman', + 'PK': 'Pakistan', + 'PW': 'Palau', + 'PS': 'Palestine, State of', + 'PA': 'Panama', + 'PG': 'Papua New Guinea', + 'PY': 'Paraguay', + 'PE': 'Peru', + 'PH': 'Philippines', + 'PN': 'Pitcairn', + 'PL': 'Poland', + 'PT': 'Portugal', + 'PR': 'Puerto Rico', + 'QA': 'Qatar', + 'RE': 'Réunion', + 'RO': 'Romania', + 'RU': 'Russian Federation', + 'RW': 'Rwanda', + 'BL': 'Saint Barthélemy', + 'SH': 'Saint Helena, Ascension and Tristan da Cunha', + 'KN': 'Saint Kitts and Nevis', + 'LC': 'Saint Lucia', + 'MF': 'Saint Martin (French part)', + 'PM': 'Saint Pierre and Miquelon', + 'VC': 'Saint Vincent and the Grenadines', + 'WS': 'Samoa', + 'SM': 'San Marino', + 'ST': 'Sao Tome and Principe', + 'SA': 'Saudi Arabia', + 'SN': 'Senegal', + 'RS': 'Serbia', + 'SC': 'Seychelles', + 'SL': 'Sierra Leone', + 'SG': 'Singapore', + 'SX': 'Sint Maarten (Dutch part)', + 'SK': 'Slovakia', + 'SI': 'Slovenia', + 'SB': 'Solomon Islands', + 'SO': 'Somalia', + 'ZA': 'South Africa', + 'GS': 'South Georgia and the South Sandwich Islands', + 'SS': 'South Sudan', + 'ES': 'Spain', + 'LK': 'Sri Lanka', + 'SD': 'Sudan', + 'SR': 'Suriname', + 'SJ': 'Svalbard and Jan Mayen', + 'SZ': 'Swaziland', + 'SE': 'Sweden', + 'CH': 'Switzerland', + 'SY': 'Syrian Arab Republic', + 'TW': 'Taiwan, Province of China', + 'TJ': 'Tajikistan', + 'TZ': 'Tanzania, United Republic of', + 'TH': 'Thailand', + 'TL': 'Timor-Leste', + 'TG': 'Togo', + 'TK': 'Tokelau', + 'TO': 'Tonga', + 'TT': 'Trinidad and Tobago', + 'TN': 'Tunisia', + 'TR': 'Turkey', + 'TM': 'Turkmenistan', + 'TC': 'Turks and Caicos Islands', + 'TV': 'Tuvalu', + 'UG': 'Uganda', + 'UA': 'Ukraine', + 'AE': 'United Arab Emirates', + 'GB': 'United Kingdom', + 'US': 'United States', + 'UM': 'United States Minor Outlying Islands', + 'UY': 'Uruguay', + 'UZ': 'Uzbekistan', + 'VU': 'Vanuatu', + 'VE': 'Venezuela, Bolivarian Republic of', + 'VN': 'Viet Nam', + 'VG': 'Virgin Islands, British', + 'VI': 'Virgin Islands, U.S.', + 'WF': 'Wallis and Futuna', + 'EH': 'Western Sahara', + 'YE': 'Yemen', + 'ZM': 'Zambia', + 'ZW': 'Zimbabwe', + } + + @classmethod + def short2full(cls, code): + """Convert an ISO 3166-2 country code to the corresponding full name""" + return cls._country_map.get(code.upper()) + + class PerRequestProxyHandler(compat_urllib_request.ProxyHandler): def __init__(self, proxies=None): # Set default handlers |