Diffstat (limited to 'youtube_dl')
-rwxr-xr-x | youtube_dl/YoutubeDL.py                   |   9
-rw-r--r-- | youtube_dl/extractor/__init__.py          |   6
-rw-r--r-- | youtube_dl/extractor/brightcove.py        |   9
-rw-r--r-- | youtube_dl/extractor/comedycentral.py     |   2
-rw-r--r-- | youtube_dl/extractor/common.py            |  38
-rw-r--r-- | youtube_dl/extractor/generic.py           |  79
-rw-r--r-- | youtube_dl/extractor/miomio.py            |  17
-rw-r--r-- | youtube_dl/extractor/mixcloud.py          |  72
-rw-r--r-- | youtube_dl/extractor/mtv.py               |   8
-rw-r--r-- | youtube_dl/extractor/pladform.py          |   2
-rw-r--r-- | youtube_dl/extractor/qqmusic.py           | 170
-rw-r--r-- | youtube_dl/extractor/soundcloud.py        |  12
-rw-r--r-- | youtube_dl/extractor/spike.py             |   2
-rw-r--r-- | youtube_dl/extractor/srf.py               | 104
-rw-r--r-- | youtube_dl/extractor/teamcoco.py          |  31
-rw-r--r-- | youtube_dl/extractor/tumblr.py            |   4
-rw-r--r-- | youtube_dl/extractor/udn.py               |  18
-rw-r--r-- | youtube_dl/extractor/vimple.py            |  71
-rw-r--r-- | youtube_dl/postprocessor/atomicparsley.py |   9
-rw-r--r-- | youtube_dl/postprocessor/ffmpeg.py        |  15
-rw-r--r-- | youtube_dl/utils.py                       |  33
-rw-r--r-- | youtube_dl/version.py                     |   2
22 files changed, 525 insertions, 188 deletions
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py
index a68b24ab4..6ac85f4e7 100755
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@@ -1486,16 +1486,9 @@ class YoutubeDL(object):
             pps_chain.extend(ie_info['__postprocessors'])
         pps_chain.extend(self._pps)
         for pp in pps_chain:
-            keep_video = None
             old_filename = info['filepath']
             try:
-                keep_video_wish, info = pp.run(info)
-                if keep_video_wish is not None:
-                    if keep_video_wish:
-                        keep_video = keep_video_wish
-                    elif keep_video is None:
-                        # No clear decision yet, let IE decide
-                        keep_video = keep_video_wish
+                keep_video, info = pp.run(info)
            except PostProcessingError as e:
                self.report_error(e.msg)
            if keep_video is False and not self.params.get('keepvideo', False):
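The YoutubeDL.py hunk above simplifies the postprocessor protocol: pp.run(info) now returns a plain (keep_video, info) pair, and the old "wish" merging across the chain is gone. A minimal sketch of a postprocessor honoring the simplified contract; the class name NoopPP is made up for illustration and is not part of this patch:

    from youtube_dl.postprocessor.common import PostProcessor


    class NoopPP(PostProcessor):
        """Hypothetical postprocessor illustrating the new return contract."""

        def run(self, information):
            # First element: keep_video. Returning False asks YoutubeDL to
            # delete the original file (unless --keep-video was given);
            # True or None keeps it.
            # Second element: the (possibly updated) info dict.
            return True, information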
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index d32f1cbd2..9e9e20589 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -397,6 +397,11 @@ from .promptfile import PromptFileIE
 from .prosiebensat1 import ProSiebenSat1IE
 from .puls4 import Puls4IE
 from .pyvideo import PyvideoIE
+from .qqmusic import (
+    QQMusicIE,
+    QQMusicSingerIE,
+    QQMusicAlbumIE,
+)
 from .quickvid import QuickVidIE
 from .r7 import R7IE
 from .radiode import RadioDeIE
@@ -481,6 +486,7 @@ from .spike import SpikeIE
 from .sport5 import Sport5IE
 from .sportbox import SportBoxIE
 from .sportdeutschland import SportDeutschlandIE
+from .srf import SrfIE
 from .srmediathek import SRMediathekIE
 from .ssa import SSAIE
 from .stanfordoc import StanfordOpenClassroomIE
diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py
index 0733bece7..4f60d5366 100644
--- a/youtube_dl/extractor/brightcove.py
+++ b/youtube_dl/extractor/brightcove.py
@@ -117,7 +117,10 @@ class BrightcoveIE(InfoExtractor):
         object_str = re.sub(r'(<object[^>]*)(xmlns=".*?")', r'\1', object_str)
         object_str = fix_xml_ampersands(object_str)
 
-        object_doc = xml.etree.ElementTree.fromstring(object_str.encode('utf-8'))
+        try:
+            object_doc = xml.etree.ElementTree.fromstring(object_str.encode('utf-8'))
+        except xml.etree.ElementTree.ParseError:
+            return
 
         fv_el = find_xpath_attr(object_doc, './param', 'name', 'flashVars')
         if fv_el is not None:
@@ -183,9 +186,9 @@ class BrightcoveIE(InfoExtractor):
             (?:
                 [^>]+?class=[\'"][^>]*?BrightcoveExperience.*?[\'"] |
                 [^>]*?>\s*<param\s+name="movie"\s+value="https?://[^/]*brightcove\.com/
-            ).+?</object>''',
+            ).+?>\s*</object>''',
             webpage)
-        return [cls._build_brighcove_url(m) for m in matches]
+        return list(filter(None, [cls._build_brighcove_url(m) for m in matches]))
 
     def _real_extract(self, url):
         url, smuggled_data = unsmuggle_url(url, {})
diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py
index e5edcc84b..91ebb0ce5 100644
--- a/youtube_dl/extractor/comedycentral.py
+++ b/youtube_dl/extractor/comedycentral.py
@@ -201,7 +201,7 @@ class ComedyCentralShowsIE(MTVServicesInfoExtractor):
 
         uri = mMovieParams[0][1]
         # Correct cc.com in uri
-        uri = re.sub(r'(episode:[^.]+)(\.cc)?\.com', r'\1.cc.com', uri)
+        uri = re.sub(r'(episode:[^.]+)(\.cc)?\.com', r'\1.com', uri)
 
         index_url = 'http://%s.cc.com/feeds/mrss?%s' % (show_name, compat_urllib_parse.urlencode({'uri': uri}))
         idoc = self._download_xml(
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index 8ed97f8dd..7757bf950 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -23,6 +23,7 @@ from ..compat import (
 )
 from ..utils import (
     age_restricted,
+    bug_reports_message,
     clean_html,
     compiled_regex_type,
     ExtractorError,
@@ -324,7 +325,7 @@ class InfoExtractor(object):
             self._downloader.report_warning(errmsg)
             return False
 
-    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
+    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None):
         """ Returns a tuple (page content as string, URL handle) """
         # Strip hashes from the URL (#1038)
         if isinstance(url_or_request, (compat_str, str)):
@@ -334,14 +335,11 @@ class InfoExtractor(object):
         if urlh is False:
             assert not fatal
             return False
-        content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal)
+        content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
         return (content, urlh)
 
-    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None):
-        content_type = urlh.headers.get('Content-Type', '')
-        webpage_bytes = urlh.read()
-        if prefix is not None:
-            webpage_bytes = prefix + webpage_bytes
+    @staticmethod
+    def _guess_encoding_from_content(content_type, webpage_bytes):
         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
         if m:
             encoding = m.group(1)
@@ -354,6 +352,16 @@ class InfoExtractor(object):
             encoding = 'utf-16'
         else:
             encoding = 'utf-8'
+
+        return encoding
+
+    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
+        content_type = urlh.headers.get('Content-Type', '')
+        webpage_bytes = urlh.read()
+        if prefix is not None:
+            webpage_bytes = prefix + webpage_bytes
+        if not encoding:
+            encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
         if self._downloader.params.get('dump_intermediate_pages', False):
             try:
                 url = url_or_request.get_full_url()
@@ -410,13 +418,13 @@ class InfoExtractor(object):
 
         return content
 
-    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5):
+    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None):
         """ Returns the data of the page as a string """
         success = False
         try_count = 0
         while success is False:
             try:
-                res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
+                res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding)
                 success = True
             except compat_http_client.IncompleteRead as e:
                 try_count += 1
@@ -431,10 +439,10 @@ class InfoExtractor(object):
 
     def _download_xml(self, url_or_request, video_id,
                       note='Downloading XML', errnote='Unable to download XML',
-                      transform_source=None, fatal=True):
+                      transform_source=None, fatal=True, encoding=None):
         """Return the xml as an xml.etree.ElementTree.Element"""
         xml_string = self._download_webpage(
-            url_or_request, video_id, note, errnote, fatal=fatal)
+            url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding)
         if xml_string is False:
             return xml_string
         if transform_source:
@@ -445,9 +453,10 @@ class InfoExtractor(object):
                        note='Downloading JSON metadata',
                        errnote='Unable to download JSON metadata',
                        transform_source=None,
-                       fatal=True):
+                       fatal=True, encoding=None):
         json_string = self._download_webpage(
-            url_or_request, video_id, note, errnote, fatal=fatal)
+            url_or_request, video_id, note, errnote, fatal=fatal,
+            encoding=encoding)
         if (not fatal) and json_string is False:
             return None
         return self._parse_json(
@@ -548,8 +557,7 @@ class InfoExtractor(object):
         elif fatal:
             raise RegexNotFoundError('Unable to extract %s' % _name)
         else:
-            self._downloader.report_warning('unable to extract %s; '
-                'please report this issue on http://yt-dl.org/bug' % _name)
+            self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
             return None
 
     def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
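The common.py changes above thread a new optional encoding argument through _download_webpage, _download_xml and _download_json, and factor the charset sniffing out into _guess_encoding_from_content (the new QQ Music extractor below passes encoding='gbk'). A standalone sketch of the detection order; the function name guess_encoding is illustrative, and only the branches visible in the hunks are reproduced, since the middle of the helper is elided between hunks:

    import re


    def guess_encoding(content_type, webpage_bytes, encoding=None):
        # Detection order per the patch: explicit override first, then a
        # charset= parameter in the Content-Type header, then a UTF-16 BOM,
        # finally a UTF-8 default.
        if encoding:
            return encoding
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            return m.group(1)
        if webpage_bytes.startswith(b'\xff\xfe'):
            return 'utf-16'
        return 'utf-8'


    print(guess_encoding('text/html; charset=GBK', b''))  # GBK
    print(guess_encoding('text/html', b'\xff\xfe\x00h'))  # utf-16
    print(guess_encoding('text/html', b'', 'gbk'))        # gbk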
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index 7ad555e9f..e645d1bb3 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -615,13 +615,24 @@ class GenericIE(InfoExtractor):
             'info_dict': {
                 'id': '100183293',
                 'ext': 'mp4',
-                'title': 'Тайны перевала Дятлова • Тайна перевала Дятлова 1 серия 2 часть',
+                'title': 'Тайны перевала Дятлова • 1 серия 2 часть',
                 'description': 'Документальный сериал-расследование одной из самых жутких тайн ХХ века',
                 'thumbnail': 're:^https?://.*\.jpg$',
                 'duration': 694,
                 'age_limit': 0,
             },
         },
+        # Playwire embed
+        {
+            'url': 'http://www.cinemablend.com/new/First-Joe-Dirt-2-Trailer-Teaser-Stupid-Greatness-70874.html',
+            'info_dict': {
+                'id': '3519514',
+                'ext': 'mp4',
+                'title': 'Joe Dirt 2 Beautiful Loser Teaser Trailer',
+                'thumbnail': 're:^https?://.*\.png$',
+                'duration': 45.115,
+            },
+        },
         # 5min embed
         {
             'url': 'http://techcrunch.com/video/facebook-creates-on-this-day-crunch-report/518726732/',
@@ -681,13 +692,41 @@ class GenericIE(InfoExtractor):
         # UDN embed
         {
             'url': 'http://www.udn.com/news/story/7314/822787',
-            'md5': 'de06b4c90b042c128395a88f0384817e',
+            'md5': 'fd2060e988c326991037b9aff9df21a6',
             'info_dict': {
-                'id': '300040',
+                'id': '300346',
                 'ext': 'mp4',
-                'title': '生物老師男變女 全校挺"做自己"',
+                'title': '中一中男師變性 全校師生力挺',
                 'thumbnail': 're:^https?://.*\.jpg$',
             }
+        },
+        # Ooyala embed
+        {
+            'url': 'http://www.businessinsider.com/excel-index-match-vlookup-video-how-to-2015-2?IR=T',
+            'info_dict': {
+                'id': '50YnY4czr4ms1vJ7yz3xzq0excz_pUMs',
+                'ext': 'mp4',
+                'description': 'VIDEO: Index/Match versus VLOOKUP.',
+                'title': 'This is what separates the Excel masters from the wannabes',
+            },
+            'params': {
+                # m3u8 downloads
+                'skip_download': True,
+            }
+        },
+        # Contains a SMIL manifest
+        {
+            'url': 'http://www.telewebion.com/fa/1263668/%D9%82%D8%B1%D8%B9%D9%87%E2%80%8C%DA%A9%D8%B4%DB%8C-%D9%84%DB%8C%DA%AF-%D9%82%D9%87%D8%B1%D9%85%D8%A7%D9%86%D8%A7%D9%86-%D8%A7%D8%B1%D9%88%D9%BE%D8%A7/%2B-%D9%81%D9%88%D8%AA%D8%A8%D8%A7%D9%84.html',
+            'info_dict': {
+                'id': 'file',
+                'ext': 'flv',
+                'title': '+ Football: Lottery Champions League Europe',
+                'uploader': 'www.telewebion.com',
+            },
+            'params': {
+                # rtmpe downloads
+                'skip_download': True,
+            }
         }
     ]
 
@@ -1092,7 +1131,8 @@ class GenericIE(InfoExtractor):
         # Look for Ooyala videos
         mobj = (re.search(r'player\.ooyala\.com/[^"?]+\?[^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or
                 re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage) or
-                re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P<ec>.{32})[\'"]\)', webpage))
+                re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P<ec>.{32})[\'"]\)', webpage) or
+                re.search(r'data-ooyala-video-id\s*=\s*[\'"](?P<ec>.{32})[\'"]', webpage))
         if mobj is not None:
             return OoyalaIE._build_url_result(mobj.group('ec'))
 
@@ -1295,6 +1335,12 @@ class GenericIE(InfoExtractor):
         if mobj is not None:
             return self.url_result(mobj.group('url'), 'Pladform')
 
+        # Look for Playwire embeds
+        mobj = re.search(
+            r'<script[^>]+data-config=(["\'])(?P<url>(?:https?:)?//config\.playwire\.com/.+?)\1', webpage)
+        if mobj is not None:
+            return self.url_result(mobj.group('url'))
+
         # Look for 5min embeds
         mobj = re.search(
             r'<meta[^>]+property="og:video"[^>]+content="https?://embed\.5min\.com/(?P<id>[0-9]+)/?', webpage)
@@ -1408,13 +1454,22 @@ class GenericIE(InfoExtractor):
             # here's a fun little line of code for you:
             video_id = os.path.splitext(video_id)[0]
 
-            entries.append({
-                'id': video_id,
-                'url': video_url,
-                'uploader': video_uploader,
-                'title': video_title,
-                'age_limit': age_limit,
-            })
+            if determine_ext(video_url) == 'smil':
+                entries.append({
+                    'id': video_id,
+                    'formats': self._extract_smil_formats(video_url, video_id),
+                    'uploader': video_uploader,
+                    'title': video_title,
+                    'age_limit': age_limit,
+                })
+            else:
+                entries.append({
+                    'id': video_id,
+                    'url': video_url,
+                    'uploader': video_uploader,
+                    'title': video_title,
+                    'age_limit': age_limit,
+                })
 
         if len(entries) == 1:
             return entries[0]
diff --git a/youtube_dl/extractor/miomio.py b/youtube_dl/extractor/miomio.py
index cc3f27194..d41195a96 100644
--- a/youtube_dl/extractor/miomio.py
+++ b/youtube_dl/extractor/miomio.py
@@ -7,6 +7,7 @@ from .common import InfoExtractor
 from ..utils import (
     xpath_text,
     int_or_none,
+    ExtractorError,
 )
 
 
@@ -14,13 +15,14 @@ class MioMioIE(InfoExtractor):
     IE_NAME = 'miomio.tv'
     _VALID_URL = r'https?://(?:www\.)?miomio\.tv/watch/cc(?P<id>[0-9]+)'
     _TESTS = [{
-        'url': 'http://www.miomio.tv/watch/cc179734/',
-        'md5': '48de02137d0739c15b440a224ad364b9',
+        # "type=video" in flashvars
+        'url': 'http://www.miomio.tv/watch/cc88912/',
+        'md5': '317a5f7f6b544ce8419b784ca8edae65',
         'info_dict': {
-            'id': '179734',
+            'id': '88912',
             'ext': 'flv',
-            'title': '手绘动漫鬼泣但丁全程画法',
-            'duration': 354,
+            'title': '【SKY】字幕 铠武昭和VS平成 假面骑士大战FEAT战队 魔星字幕组 字幕',
+            'duration': 5923,
         },
     }, {
         'url': 'http://www.miomio.tv/watch/cc184024/',
@@ -42,7 +44,7 @@ class MioMioIE(InfoExtractor):
             r'src="(/mioplayer/[^"]+)"', webpage, 'ref_path')
 
         xml_config = self._search_regex(
-            r'flashvars="type=sina&amp;(.+?)&amp;',
+            r'flashvars="type=(?:sina|video)&amp;(.+?)&amp;',
             webpage, 'xml config')
 
         # skipping the following page causes lags and eventually connection drop-outs
@@ -59,6 +61,9 @@ class MioMioIE(InfoExtractor):
             'Referer': 'http://www.miomio.tv%s' % mioplayer_path,
         }
 
+        if not int_or_none(xpath_text(vid_config, 'timelength')):
+            raise ExtractorError('Unable to load videos!', expected=True)
+
         entries = []
         for f in vid_config.findall('./durl'):
             segment_url = xpath_text(f, 'url', 'video url')
diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py
index 84f291558..425a4ccf1 100644
--- a/youtube_dl/extractor/mixcloud.py
+++ b/youtube_dl/extractor/mixcloud.py
@@ -1,7 +1,6 @@
 from __future__ import unicode_literals
 
 import re
-import itertools
 
 from .common import InfoExtractor
 from ..compat import (
@@ -46,20 +45,16 @@ class MixcloudIE(InfoExtractor):
         },
     }]
 
-    def _get_url(self, track_id, template_url, server_number):
-        boundaries = (1, 30)
-        for nr in server_numbers(server_number, boundaries):
-            url = template_url % nr
-            try:
-                # We only want to know if the request succeed
-                # don't download the whole file
-                self._request_webpage(
-                    HEADRequest(url), track_id,
-                    'Checking URL %d/%d ...' % (nr, boundaries[-1]))
-                return url
-            except ExtractorError:
-                pass
-        return None
+    def _check_url(self, url, track_id, ext):
+        try:
+            # We only want to know if the request succeed
+            # don't download the whole file
+            self._request_webpage(
+                HEADRequest(url), track_id,
+                'Trying %s URL' % ext)
+            return True
+        except ExtractorError:
+            return False
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
@@ -72,15 +67,10 @@ class MixcloudIE(InfoExtractor):
         preview_url = self._search_regex(
             r'\s(?:data-preview-url|m-preview)="([^"]+)"', webpage, 'preview url')
         song_url = preview_url.replace('/previews/', '/c/originals/')
-        server_number = int(self._search_regex(r'stream(\d+)', song_url, 'server number'))
-        template_url = re.sub(r'(stream\d*)', 'stream%d', song_url)
-        final_song_url = self._get_url(track_id, template_url, server_number)
-        if final_song_url is None:
-            self.to_screen('Trying with m4a extension')
-            template_url = template_url.replace('.mp3', '.m4a').replace('originals/', 'm4a/64/')
-            final_song_url = self._get_url(track_id, template_url, server_number)
-        if final_song_url is None:
-            raise ExtractorError('Unable to extract track url')
+        if not self._check_url(song_url, track_id, 'mp3'):
+            song_url = song_url.replace('.mp3', '.m4a').replace('originals/', 'm4a/64/')
+            if not self._check_url(song_url, track_id, 'm4a'):
+                raise ExtractorError('Unable to extract track url')
 
         PREFIX = (
             r'm-play-on-spacebar[^>]+'
@@ -107,7 +97,7 @@ class MixcloudIE(InfoExtractor):
         return {
             'id': track_id,
             'title': title,
-            'url': final_song_url,
+            'url': song_url,
             'description': description,
             'thumbnail': thumbnail,
             'uploader': uploader,
@@ -115,35 +105,3 @@ class MixcloudIE(InfoExtractor):
             'view_count': view_count,
             'like_count': like_count,
         }
-
-
-def server_numbers(first, boundaries):
-    """ Server numbers to try in descending order of probable availability.
-    Starting from first (i.e. the number of the server hosting the preview file)
-    and going further and further up to the higher boundary and down to the
-    lower one in an alternating fashion. Namely:
-
-        server_numbers(2, (1, 5))
-
-        # Where the preview server is 2, min number is 1 and max is 5.
-        # Yields: 2, 3, 1, 4, 5
-
-    Why not random numbers or increasing sequences? Since from what I've seen,
-    full length files seem to be hosted on servers whose number is closer to
-    that of the preview; to be confirmed.
-    """
-    zip_longest = getattr(itertools, 'zip_longest', None)
-    if zip_longest is None:
-        # python 2.x
-        zip_longest = itertools.izip_longest
-
-    if len(boundaries) != 2:
-        raise ValueError("boundaries should be a two-element tuple")
-    min, max = boundaries
-    highs = range(first + 1, max + 1)
-    lows = range(first - 1, min - 1, -1)
-    rest = filter(
-        None, itertools.chain.from_iterable(zip_longest(highs, lows)))
-    yield first
-    for n in rest:
-        yield n
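The mixcloud.py rewrite drops the server_numbers probing generator entirely: rather than templating stream-server numbers and walking outward from the preview server, the extractor now issues a single HEAD request against the mp3 URL and, on failure, retries an m4a variant. A standard-library sketch of that check-then-fallback pattern; the URL is a placeholder, not a real Mixcloud stream:

    try:
        from urllib.request import Request, urlopen  # Python 3
        from urllib.error import URLError
    except ImportError:  # Python 2
        from urllib2 import Request, urlopen, URLError


    def url_exists(url):
        # HEAD only: we want to know whether the request succeeds,
        # not download the file.
        req = Request(url)
        req.get_method = lambda: 'HEAD'
        try:
            urlopen(req)
            return True
        except URLError:
            return False


    song_url = 'http://stream1.example.com/c/originals/track.mp3'  # placeholder
    if not url_exists(song_url):
        # the same rewrite the extractor applies to fall back to m4a
        song_url = song_url.replace('.mp3', '.m4a').replace('originals/', 'm4a/64/')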
diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py
index c11de1cb6..4430b3416 100644
--- a/youtube_dl/extractor/mtv.py
+++ b/youtube_dl/extractor/mtv.py
@@ -118,6 +118,14 @@ class MTVServicesInfoExtractor(InfoExtractor):
         mediagen_doc = self._download_xml(mediagen_url, video_id,
                                           'Downloading video urls')
 
+        item = mediagen_doc.find('./video/item')
+        if item is not None and item.get('type') == 'text':
+            message = '%s returned error: ' % self.IE_NAME
+            if item.get('code') is not None:
+                message += '%s - ' % item.get('code')
+            message += item.text
+            raise ExtractorError(message, expected=True)
+
         description_node = itemdoc.find('description')
         if description_node is not None:
             description = description_node.text.strip()
diff --git a/youtube_dl/extractor/pladform.py b/youtube_dl/extractor/pladform.py
index abde34b94..551c8c9f0 100644
--- a/youtube_dl/extractor/pladform.py
+++ b/youtube_dl/extractor/pladform.py
@@ -30,7 +30,7 @@ class PladformIE(InfoExtractor):
         'info_dict': {
             'id': '100183293',
             'ext': 'mp4',
-            'title': 'Тайны перевала Дятлова • Тайна перевала Дятлова 1 серия 2 часть',
+            'title': 'Тайны перевала Дятлова • 1 серия 2 часть',
             'description': 'Документальный сериал-расследование одной из самых жутких тайн ХХ века',
             'thumbnail': 're:^https?://.*\.jpg$',
             'duration': 694,
diff --git a/youtube_dl/extractor/qqmusic.py b/youtube_dl/extractor/qqmusic.py
new file mode 100644
index 000000000..174c8e0ae
--- /dev/null
+++ b/youtube_dl/extractor/qqmusic.py
@@ -0,0 +1,170 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import random
+import time
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    strip_jsonp,
+    unescapeHTML,
+)
+from ..compat import compat_urllib_request
+
+
+class QQMusicIE(InfoExtractor):
+    _VALID_URL = r'http://y.qq.com/#type=song&mid=(?P<id>[0-9A-Za-z]+)'
+    _TESTS = [{
+        'url': 'http://y.qq.com/#type=song&mid=004295Et37taLD',
+        'md5': 'bed90b6db2a7a7a7e11bc585f471f63a',
+        'info_dict': {
+            'id': '004295Et37taLD',
+            'ext': 'm4a',
+            'title': '可惜没如果',
+            'upload_date': '20141227',
+            'creator': '林俊杰',
+            'description': 'md5:4348ff1dd24036906baa7b6f973f8d30',
+        }
+    }]
+
+    # Reference: m_r_GetRUin() in top_player.js
+    # http://imgcache.gtimg.cn/music/portal_v3/y/top_player.js
+    @staticmethod
+    def m_r_get_ruin():
+        curMs = int(time.time() * 1000) % 1000
+        return int(round(random.random() * 2147483647) * curMs % 1E10)
+
+    def _real_extract(self, url):
+        mid = self._match_id(url)
+
+        detail_info_page = self._download_webpage(
+            'http://s.plcloud.music.qq.com/fcgi-bin/fcg_yqq_song_detail_info.fcg?songmid=%s&play=0' % mid,
+            mid, note='Download song detail info',
+            errnote='Unable to get song detail info', encoding='gbk')
+
+        song_name = self._html_search_regex(
+            r"songname:\s*'([^']+)'", detail_info_page, 'song name')
+
+        publish_time = self._html_search_regex(
+            r'发行时间:(\d{4}-\d{2}-\d{2})', detail_info_page,
+            'publish time', default=None)
+        if publish_time:
+            publish_time = publish_time.replace('-', '')
+
+        singer = self._html_search_regex(
+            r"singer:\s*'([^']+)", detail_info_page, 'singer', default=None)
+
+        lrc_content = self._html_search_regex(
+            r'<div class="content" id="lrc_content"[^<>]*>([^<>]+)</div>',
+            detail_info_page, 'LRC lyrics', default=None)
+
+        guid = self.m_r_get_ruin()
+
+        vkey = self._download_json(
+            'http://base.music.qq.com/fcgi-bin/fcg_musicexpress.fcg?json=3&guid=%s' % guid,
+            mid, note='Retrieve vkey', errnote='Unable to get vkey',
+            transform_source=strip_jsonp)['key']
+        song_url = 'http://cc.stream.qqmusic.qq.com/C200%s.m4a?vkey=%s&guid=%s&fromtag=0' % (mid, vkey, guid)
+
+        return {
+            'id': mid,
+            'url': song_url,
+            'title': song_name,
+            'upload_date': publish_time,
+            'creator': singer,
+            'description': lrc_content,
+        }
+
+
+class QQPlaylistBaseIE(InfoExtractor):
+    @staticmethod
+    def qq_static_url(category, mid):
+        return 'http://y.qq.com/y/static/%s/%s/%s/%s.html' % (category, mid[-2], mid[-1], mid)
+
+    @classmethod
+    def get_entries_from_page(cls, page):
+        entries = []
+
+        for item in re.findall(r'class="data"[^<>]*>([^<>]+)</', page):
+            song_mid = unescapeHTML(item).split('|')[-5]
+            entries.append(cls.url_result(
+                'http://y.qq.com/#type=song&mid=' + song_mid, 'QQMusic',
+                song_mid))
+
+        return entries
+
+
+class QQMusicSingerIE(QQPlaylistBaseIE):
+    _VALID_URL = r'http://y.qq.com/#type=singer&mid=(?P<id>[0-9A-Za-z]+)'
+    _TEST = {
+        'url': 'http://y.qq.com/#type=singer&mid=001BLpXF2DyJe2',
+        'info_dict': {
+            'id': '001BLpXF2DyJe2',
+            'title': '林俊杰',
+            'description': 'md5:2a222d89ba4455a3af19940c0481bb78',
+        },
+        'playlist_count': 12,
+    }
+
+    def _real_extract(self, url):
+        mid = self._match_id(url)
+
+        singer_page = self._download_webpage(
+            self.qq_static_url('singer', mid), mid, 'Download singer page')
+
+        entries = self.get_entries_from_page(singer_page)
+
+        singer_name = self._html_search_regex(
+            r"singername\s*:\s*'([^']+)'", singer_page, 'singer name',
+            default=None)
+
+        singer_id = self._html_search_regex(
+            r"singerid\s*:\s*'([0-9]+)'", singer_page, 'singer id',
+            default=None)
+
+        singer_desc = None
+
+        if singer_id:
+            req = compat_urllib_request.Request(
+                'http://s.plcloud.music.qq.com/fcgi-bin/fcg_get_singer_desc.fcg?utf8=1&outCharset=utf-8&format=xml&singerid=%s' % singer_id)
+            req.add_header(
+                'Referer', 'http://s.plcloud.music.qq.com/xhr_proxy_utf8.html')
+            singer_desc_page = self._download_xml(
+                req, mid, 'Donwload singer description XML')
+
+            singer_desc = singer_desc_page.find('./data/info/desc').text
+
+        return self.playlist_result(entries, mid, singer_name, singer_desc)
+
+
+class QQMusicAlbumIE(QQPlaylistBaseIE):
+    _VALID_URL = r'http://y.qq.com/#type=album&mid=(?P<id>[0-9A-Za-z]+)'
+
+    _TEST = {
+        'url': 'http://y.qq.com/#type=album&mid=000gXCTb2AhRR1&play=0',
+        'info_dict': {
+            'id': '000gXCTb2AhRR1',
+            'title': '我们都是这样长大的',
+            'description': 'md5:d216c55a2d4b3537fe4415b8767d74d6',
+        },
+        'playlist_count': 4,
+    }
+
+    def _real_extract(self, url):
+        mid = self._match_id(url)
+
+        album_page = self._download_webpage(
+            self.qq_static_url('album', mid), mid, 'Download album page')
+
+        entries = self.get_entries_from_page(album_page)
+
+        album_name = self._html_search_regex(
+            r"albumname\s*:\s*'([^']+)',", album_page, 'album name',
+            default=None)
+
+        album_detail = self._html_search_regex(
+            r'<div class="album_detail close_detail">\s*<p>((?:[^<>]+(?:<br />)?)+)</p>',
+            album_page, 'album details', default=None)
+
+        return self.playlist_result(entries, mid, album_name, album_detail)
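With QQMusicIE, QQMusicSingerIE and QQMusicAlbumIE registered in extractor/__init__.py, the fragment-style QQ Music URLs resolve directly from the command line, e.g. (the song URL is the patch's own test case):

    youtube-dl 'http://y.qq.com/#type=song&mid=004295Et37taLD'

The quotes matter: # and & would otherwise be interpreted by the shell.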
diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py
index 316b2c90f..183ff50f4 100644
--- a/youtube_dl/extractor/soundcloud.py
+++ b/youtube_dl/extractor/soundcloud.py
@@ -221,7 +221,12 @@ class SoundcloudIE(InfoExtractor):
                 info_json_url += "&secret_token=" + token
         elif mobj.group('player'):
             query = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
-            return self.url_result(query['url'][0])
+            real_url = query['url'][0]
+            # If the token is in the query of the original url we have to
+            # manually add it
+            if 'secret_token' in query:
+                real_url += '?secret_token=' + query['secret_token'][0]
+            return self.url_result(real_url)
         else:
             # extract uploader (which is in the url)
             uploader = mobj.group('uploader')
@@ -274,9 +279,8 @@ class SoundcloudSetIE(SoundcloudIE):
         info = self._download_json(resolv_url, full_title)
 
         if 'errors' in info:
-            for err in info['errors']:
-                self._downloader.report_error('unable to download video webpage: %s' % compat_str(err['error_message']))
-            return
+            msgs = (compat_str(err['error_message']) for err in info['errors'])
+            raise ExtractorError('unable to download video webpage: %s' % ','.join(msgs))
 
         return {
             '_type': 'playlist',
diff --git a/youtube_dl/extractor/spike.py b/youtube_dl/extractor/spike.py
index e529bb55c..182f286df 100644
--- a/youtube_dl/extractor/spike.py
+++ b/youtube_dl/extractor/spike.py
@@ -5,7 +5,7 @@ from .mtv import MTVServicesInfoExtractor
 
 class SpikeIE(MTVServicesInfoExtractor):
     _VALID_URL = r'''(?x)https?://
-        (?:www\.spike\.com/(?:video-clips|(?:full-)?episodes)/.+|
+        (?:www\.spike\.com/(?:video-(?:clips|playlists)|(?:full-)?episodes)/.+|
          m\.spike\.com/videos/video\.rbml\?id=(?P<id>[^&]+))
     '''
     _TEST = {
diff --git a/youtube_dl/extractor/srf.py b/youtube_dl/extractor/srf.py
new file mode 100644
index 000000000..77eec0bc7
--- /dev/null
+++ b/youtube_dl/extractor/srf.py
@@ -0,0 +1,104 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+    parse_iso8601,
+    xpath_text,
+)
+
+
+class SrfIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.srf\.ch/play(?:er)?/tv/[^/]+/video/(?P<display_id>[^?]+)\?id=|tp\.srgssr\.ch/p/flash\?urn=urn:srf:ais:video:)(?P<id>[0-9a-f\-]{36})'
+    _TESTS = [{
+        'url': 'http://www.srf.ch/play/tv/10vor10/video/snowden-beantragt-asyl-in-russland?id=28e1a57d-5b76-4399-8ab3-9097f071e6c5',
+        'md5': '4cd93523723beff51bb4bee974ee238d',
+        'info_dict': {
+            'id': '28e1a57d-5b76-4399-8ab3-9097f071e6c5',
+            'display_id': 'snowden-beantragt-asyl-in-russland',
+            'ext': 'm4v',
+            'upload_date': '20130701',
+            'title': 'Snowden beantragt Asyl in Russland',
+            'timestamp': 1372713995,
+        }
+    }, {
+        # No Speichern (Save) button
+        'url': 'http://www.srf.ch/play/tv/top-gear/video/jaguar-xk120-shadow-und-tornado-dampflokomotive?id=677f5829-e473-4823-ac83-a1087fe97faa',
+        'md5': 'd97e236e80d1d24729e5d0953d276a4f',
+        'info_dict': {
+            'id': '677f5829-e473-4823-ac83-a1087fe97faa',
+            'display_id': 'jaguar-xk120-shadow-und-tornado-dampflokomotive',
+            'ext': 'flv',
+            'upload_date': '20130710',
+            'title': 'Jaguar XK120, Shadow und Tornado-Dampflokomotive',
+            'timestamp': 1373493600,
+        },
+    }, {
+        'url': 'http://www.srf.ch/player/tv/10vor10/video/snowden-beantragt-asyl-in-russland?id=28e1a57d-5b76-4399-8ab3-9097f071e6c5',
+        'only_matching': True,
+    }, {
+        'url': 'https://tp.srgssr.ch/p/flash?urn=urn:srf:ais:video:28e1a57d-5b76-4399-8ab3-9097f071e6c5',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        display_id = re.match(self._VALID_URL, url).group('display_id') or video_id
+
+        video_data = self._download_xml(
+            'http://il.srgssr.ch/integrationlayer/1.0/ue/srf/video/play/%s.xml' % video_id,
+            display_id)
+
+        title = xpath_text(
+            video_data, './AssetMetadatas/AssetMetadata/title', fatal=True)
+        thumbnails = [{
+            'url': s.text
+        } for s in video_data.findall('.//ImageRepresentation/url')]
+        timestamp = parse_iso8601(xpath_text(video_data, './createdDate'))
+        # The <duration> field in XML is different from the exact duration, skipping
+
+        formats = []
+        for item in video_data.findall('./Playlists/Playlist') + video_data.findall('./Downloads/Download'):
+            for url_node in item.findall('url'):
+                quality = url_node.attrib['quality']
+                full_url = url_node.text
+                original_ext = determine_ext(full_url)
+                format_id = '%s-%s' % (quality, item.attrib['protocol'])
+                if original_ext == 'f4m':
+                    formats.extend(self._extract_f4m_formats(
+                        full_url + '?hdcore=3.4.0', display_id, f4m_id=format_id))
+                elif original_ext == 'm3u8':
+                    formats.extend(self._extract_m3u8_formats(
+                        full_url, display_id, 'mp4', m3u8_id=format_id))
+                else:
+                    formats.append({
+                        'url': full_url,
+                        'ext': original_ext,
+                        'format_id': format_id,
+                        'quality': 0 if 'HD' in quality else -1,
+                        'preference': 1,
+                    })
+
+        self._sort_formats(formats)
+
+        subtitles = {}
+        subtitles_data = video_data.find('Subtitles')
+        if subtitles_data is not None:
+            subtitles_list = [{
+                'url': sub.text,
+                'ext': determine_ext(sub.text),
+            } for sub in subtitles_data]
+            if subtitles_list:
+                subtitles['de'] = subtitles_list
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'formats': formats,
+            'title': title,
+            'thumbnails': thumbnails,
+            'timestamp': timestamp,
+            'subtitles': subtitles,
+        }
diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py
index 1caf08cb7..2381676b4 100644
--- a/youtube_dl/extractor/teamcoco.py
+++ b/youtube_dl/extractor/teamcoco.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 from __future__ import unicode_literals
 
 import base64
@@ -35,6 +36,17 @@ class TeamcocoIE(InfoExtractor):
             'duration': 288,
             'age_limit': 0,
         }
+    }, {
+        'url': 'http://teamcoco.com/video/timothy-olyphant-drinking-whiskey',
+        'info_dict': {
+            'id': '88748',
+            'ext': 'mp4',
+            'title': 'Timothy Olyphant Raises A Toast To “Justified”',
+            'description': 'md5:15501f23f020e793aeca761205e42c24',
+        },
+        'params': {
+            'skip_download': True,  # m3u8 downloads
+        }
     }
     ]
     _VIDEO_ID_REGEXES = (
@@ -54,10 +66,23 @@ class TeamcocoIE(InfoExtractor):
         video_id = self._html_search_regex(
             self._VIDEO_ID_REGEXES, webpage, 'video id')
 
+        preload = None
         preloads = re.findall(r'"preload":\s*"([^"]+)"', webpage)
-        if not preloads:
-            raise ExtractorError('Preload information could not be extracted')
-        preload = max([(len(p), p) for p in preloads])[1]
+        if preloads:
+            preload = max([(len(p), p) for p in preloads])[1]
+
+        if not preload:
+            preload = ''.join(re.findall(r'this\.push\("([^"]+)"\);', webpage))
+
+        if not preload:
+            preload = self._html_search_regex([
+                r'player,\[?"([^"]+)"\]?', r'player.init\(\[?"([^"]+)"\]?\)'
+            ], webpage.replace('","', ''), 'preload data', default=None)
+
+        if not preload:
+            raise ExtractorError(
+                'Preload information could not be extracted', expected=True)
+
         data = self._parse_json(
             base64.b64decode(preload.encode('ascii')).decode('utf-8'), video_id)
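The teamcoco.py fallback chain above ends by base64-decoding whichever preload blob was found and parsing it as JSON. A toy round trip showing that decode step in isolation; the payload here is synthetic, not a real Team Coco response:

    import base64
    import json

    # Synthetic stand-in for the "preload" blob scraped from the page
    preload = base64.b64encode(json.dumps({'id': '88748'}).encode('utf-8')).decode('ascii')

    # The same decode chain the extractor feeds into _parse_json
    data = json.loads(base64.b64decode(preload.encode('ascii')).decode('utf-8'))
    print(data['id'])  # 88748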
diff --git a/youtube_dl/extractor/tumblr.py b/youtube_dl/extractor/tumblr.py
index 2a1ae5a71..828c808a6 100644
--- a/youtube_dl/extractor/tumblr.py
+++ b/youtube_dl/extractor/tumblr.py
@@ -56,6 +56,6 @@ class TumblrIE(InfoExtractor):
             'url': video_url,
             'ext': 'mp4',
             'title': video_title,
-            'description': self._og_search_description(webpage),
-            'thumbnail': self._og_search_thumbnail(webpage),
+            'description': self._og_search_description(webpage, default=None),
+            'thumbnail': self._og_search_thumbnail(webpage, default=None),
         }
diff --git a/youtube_dl/extractor/udn.py b/youtube_dl/extractor/udn.py
index bba25bb58..c08428acf 100644
--- a/youtube_dl/extractor/udn.py
+++ b/youtube_dl/extractor/udn.py
@@ -3,12 +3,15 @@ from __future__ import unicode_literals
 import json
 
 from .common import InfoExtractor
-from ..utils import js_to_json
+from ..utils import (
+    js_to_json,
+    ExtractorError,
+)
 from ..compat import compat_urlparse
 
 
 class UDNEmbedIE(InfoExtractor):
-    _VALID_URL = r'(?:https?:)?//video\.udn\.com/embed/news/(?P<id>\d+)'
+    _VALID_URL = r'https?://video\.udn\.com/(?:embed|play)/news/(?P<id>\d+)'
     _TESTS = [{
         'url': 'http://video.udn.com/embed/news/300040',
         'md5': 'de06b4c90b042c128395a88f0384817e',
@@ -19,7 +22,11 @@ class UDNEmbedIE(InfoExtractor):
             'thumbnail': 're:^https?://.*\.jpg$',
         }
     }, {
-        'url': '//video.udn.com/embed/news/300040',
+        'url': 'https://video.udn.com/embed/news/300040',
+        'only_matching': True,
+    }, {
+        # From https://video.udn.com/news/303776
+        'url': 'https://video.udn.com/play/news/303776',
         'only_matching': True,
     }]
 
@@ -47,7 +54,10 @@ class UDNEmbedIE(InfoExtractor):
                 'retrieve url for %s video' % video_type),
             'format_id': video_type,
             'preference': 0 if video_type == 'mp4' else -1,
-        } for video_type, api_url in video_urls.items()]
+        } for video_type, api_url in video_urls.items() if api_url]
+
+        if not formats:
+            raise ExtractorError('No videos found', expected=True)
 
         self._sort_formats(formats)
diff --git a/youtube_dl/extractor/vimple.py b/youtube_dl/extractor/vimple.py
index ee3d86117..aa3d6ddfd 100644
--- a/youtube_dl/extractor/vimple.py
+++ b/youtube_dl/extractor/vimple.py
@@ -1,75 +1,54 @@
-# coding: utf-8
 from __future__ import unicode_literals
 
-import base64
-import re
-import xml.etree.ElementTree
-import zlib
-
 from .common import InfoExtractor
 from ..utils import int_or_none
 
 
 class VimpleIE(InfoExtractor):
-    IE_DESC = 'Vimple.ru'
-    _VALID_URL = r'https?://(player.vimple.ru/iframe|vimple.ru)/(?P<id>[a-f0-9]{10,})'
+    IE_DESC = 'Vimple - one-click video hosting'
+    _VALID_URL = r'https?://(?:player\.vimple\.ru/iframe|vimple\.ru)/(?P<id>[\da-f-]{32,36})'
     _TESTS = [
         {
             'url': 'http://vimple.ru/c0f6b1687dcd4000a97ebe70068039cf',
             'md5': '2e750a330ed211d3fd41821c6ad9a279',
             'info_dict': {
-                'id': 'c0f6b1687dcd4000a97ebe70068039cf',
+                'id': 'c0f6b168-7dcd-4000-a97e-be70068039cf',
                 'ext': 'mp4',
                 'title': 'Sunset',
                 'duration': 20,
                 'thumbnail': 're:https?://.*?\.jpg',
             },
-        },
+        }, {
+            'url': 'http://player.vimple.ru/iframe/52e1beec-1314-4a83-aeac-c61562eadbf9',
+            'only_matching': True,
+        }
     ]
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
-
-        iframe_url = 'http://player.vimple.ru/iframe/%s' % video_id
-
-        iframe = self._download_webpage(
-            iframe_url, video_id,
-            note='Downloading iframe', errnote='unable to fetch iframe')
-        player_url = self._html_search_regex(
-            r'"(http://player.vimple.ru/flash/.+?)"', iframe, 'player url')
+        video_id = self._match_id(url)
 
-        player = self._request_webpage(
-            player_url, video_id, note='Downloading swf player').read()
+        webpage = self._download_webpage(
+            'http://player.vimple.ru/iframe/%s' % video_id, video_id)
 
-        player = zlib.decompress(player[8:])
+        playlist = self._parse_json(
+            self._search_regex(
+                r'sprutoData\s*:\s*({.+?}),\r\n', webpage, 'spruto data'),
+            video_id)['playlist'][0]
 
-        xml_pieces = re.findall(b'([a-zA-Z0-9 =+/]{500})', player)
-        xml_pieces = [piece[1:-1] for piece in xml_pieces]
+        title = playlist['title']
+        video_id = playlist.get('videoId') or video_id
+        thumbnail = playlist.get('posterUrl') or playlist.get('thumbnailUrl')
+        duration = int_or_none(playlist.get('duration'))
 
-        xml_data = b''.join(xml_pieces)
-        xml_data = base64.b64decode(xml_data)
-
-        xml_data = xml.etree.ElementTree.fromstring(xml_data)
-
-        video = xml_data.find('Video')
-        quality = video.get('quality')
-        q_tag = video.find(quality.capitalize())
-
-        formats = [
-            {
-                'url': q_tag.get('url'),
-                'tbr': int(q_tag.get('bitrate')),
-                'filesize': int(q_tag.get('filesize')),
-                'format_id': quality,
-            },
-        ]
+        formats = [{
+            'url': f['url'],
+        } for f in playlist['video']]
+        self._sort_formats(formats)
 
         return {
             'id': video_id,
-            'title': video.find('Title').text,
+            'title': title,
+            'thumbnail': thumbnail,
+            'duration': duration,
             'formats': formats,
-            'thumbnail': video.find('Poster').get('url'),
-            'duration': int_or_none(video.get('duration')),
-            'webpage_url': video.find('Share').get('videoPageUrl'),
         }
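The rewritten vimple.py replaces the old swf-download-and-decompress dance with a much simpler pattern: pull a JSON object (sprutoData) straight out of the iframe markup and read the playlist from it. A toy demonstration of that regex-plus-JSON idiom against a synthetic snippet shaped like what the extractor's regex expects:

    import json
    import re

    # Synthetic iframe excerpt; real pages embed sprutoData the same way
    webpage = ('var player = { sprutoData : {"playlist": [{"title": "Sunset",'
               ' "video": [{"url": "http://example.com/v.mp4"}]}]},\r\n ... };')

    playlist = json.loads(
        re.search(r'sprutoData\s*:\s*({.+?}),\r\n', webpage).group(1)
    )['playlist'][0]

    print(playlist['title'])            # Sunset
    print(playlist['video'][0]['url'])  # http://example.com/v.mp4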
diff --git a/youtube_dl/postprocessor/atomicparsley.py b/youtube_dl/postprocessor/atomicparsley.py
index 448ccc5f3..a5dfc136a 100644
--- a/youtube_dl/postprocessor/atomicparsley.py
+++ b/youtube_dl/postprocessor/atomicparsley.py
@@ -50,8 +50,13 @@ class AtomicParsleyPP(PostProcessor):
             msg = stderr.decode('utf-8', 'replace').strip()
             raise AtomicParsleyPPError(msg)
 
-        os.remove(encodeFilename(filename))
         os.remove(encodeFilename(temp_thumbnail))
-        os.rename(encodeFilename(temp_filename), encodeFilename(filename))
+        # for formats that don't support thumbnails (like 3gp) AtomicParsley
+        # won't create to the temporary file
+        if b'No changes' in stdout:
+            self._downloader.report_warning('The file format doesn\'t support embedding a thumbnail')
+        else:
+            os.remove(encodeFilename(filename))
+            os.rename(encodeFilename(temp_filename), encodeFilename(filename))
 
         return True, info
diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py
index 8e99a3c2c..4c4a038f9 100644
--- a/youtube_dl/postprocessor/ffmpeg.py
+++ b/youtube_dl/postprocessor/ffmpeg.py
@@ -264,15 +264,14 @@ class FFmpegExtractAudioPP(FFmpegPostProcessor):
         new_path = prefix + sep + extension
 
         # If we download foo.mp3 and convert it to... foo.mp3, then don't delete foo.mp3, silly.
-        if new_path == path:
-            self._nopostoverwrites = True
+        if (new_path == path or
+                (self._nopostoverwrites and os.path.exists(encodeFilename(new_path)))):
+            self._downloader.to_screen('[youtube] Post-process file %s exists, skipping' % new_path)
+            return True, information
 
         try:
-            if self._nopostoverwrites and os.path.exists(encodeFilename(new_path)):
-                self._downloader.to_screen('[youtube] Post-process file %s exists, skipping' % new_path)
-            else:
-                self._downloader.to_screen('[' + self.basename + '] Destination: ' + new_path)
-                self.run_ffmpeg(path, new_path, acodec, more_opts)
+            self._downloader.to_screen('[' + self.basename + '] Destination: ' + new_path)
+            self.run_ffmpeg(path, new_path, acodec, more_opts)
         except AudioConversionError as e:
             raise PostProcessingError(
                 'audio conversion failed: ' + e.msg)
@@ -286,7 +285,7 @@ class FFmpegExtractAudioPP(FFmpegPostProcessor):
             errnote='Cannot update utime of audio file')
 
         information['filepath'] = new_path
-        return self._nopostoverwrites, information
+        return False, information
 
 
 class FFmpegVideoConvertorPP(FFmpegPostProcessor):
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index 52f0dd09a..edeee1853 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -312,17 +312,17 @@ def sanitize_path(s):
     """Sanitizes and normalizes path on Windows"""
     if sys.platform != 'win32':
         return s
-    drive, _ = os.path.splitdrive(s)
-    unc, _ = os.path.splitunc(s)
-    unc_or_drive = unc or drive
-    norm_path = os.path.normpath(remove_start(s, unc_or_drive)).split(os.path.sep)
-    if unc_or_drive:
+    drive_or_unc, _ = os.path.splitdrive(s)
+    if sys.version_info < (2, 7) and not drive_or_unc:
+        drive_or_unc, _ = os.path.splitunc(s)
+    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
+    if drive_or_unc:
         norm_path.pop(0)
     sanitized_path = [
         path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|\.$)', '#', path_part)
         for path_part in norm_path]
-    if unc_or_drive:
-        sanitized_path.insert(0, unc_or_drive + os.path.sep)
+    if drive_or_unc:
+        sanitized_path.insert(0, drive_or_unc + os.path.sep)
     return os.path.join(*sanitized_path)
 
 
@@ -452,6 +452,17 @@ def make_HTTPS_handler(params, **kwargs):
     return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
 
 
+def bug_reports_message():
+    if ytdl_is_updateable():
+        update_cmd = 'type  youtube-dl -U  to update'
+    else:
+        update_cmd = 'see  https://yt-dl.org/update  on how to update'
+    msg = '; please report this issue on https://yt-dl.org/bug .'
+    msg += ' Make sure you are using the latest version; %s.' % update_cmd
+    msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
+    return msg
+
+
 class ExtractorError(Exception):
     """Error during info extraction."""
 
@@ -467,13 +478,7 @@ class ExtractorError(Exception):
         if cause:
             msg += ' (caused by %r)' % cause
         if not expected:
-            if ytdl_is_updateable():
-                update_cmd = 'type  youtube-dl -U  to update'
-            else:
-                update_cmd = 'see  https://yt-dl.org/update  on how to update'
-            msg += '; please report this issue on https://yt-dl.org/bug .'
-            msg += ' Make sure you are using the latest version; %s.' % update_cmd
-            msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
+            msg += bug_reports_message()
         super(ExtractorError, self).__init__(msg)
 
         self.traceback = tb
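Two things happen in utils.py above: the bug-report boilerplate moves out of ExtractorError into a reusable bug_reports_message() (which common.py now appends to its "unable to extract" warning), and sanitize_path restricts the deprecated os.path.splitunc call to Python < 2.7, where os.path.splitdrive does not yet handle UNC paths. The per-component substitution sanitize_path applies on Windows is easy to check in isolation; the helper name sanitize_part is illustrative:

    import re


    def sanitize_part(path_part):
        # Same substitution as sanitize_path: Windows-illegal filename
        # characters and a trailing dot become '#'; '.' and '..' pass through.
        if path_part in ['.', '..']:
            return path_part
        return re.sub('(?:[/<>:"\\|\\\\?\\*]|\.$)', '#', path_part)


    print(sanitize_part('AT&T: a "history"'))  # AT&T# a #history#
    print(sanitize_part('trailing.'))          # trailing#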
diff --git a/youtube_dl/version.py b/youtube_dl/version.py
index 1095fea2f..3fd0e7e56 100644
--- a/youtube_dl/version.py
+++ b/youtube_dl/version.py
@@ -1,3 +1,3 @@
 from __future__ import unicode_literals
 
-__version__ = '2015.04.09'
+__version__ = '2015.04.17'