diff options
Diffstat (limited to 'youtube_dl')
-rw-r--r-- | youtube_dl/YoutubeDL.py | 12 | ||||
-rw-r--r-- | youtube_dl/extractor/appletrailers.py | 23 | ||||
-rw-r--r-- | youtube_dl/extractor/clipsyndicate.py | 10 | ||||
-rw-r--r-- | youtube_dl/extractor/common.py | 5 | ||||
-rw-r--r-- | youtube_dl/extractor/dailymotion.py | 12 | ||||
-rw-r--r-- | youtube_dl/extractor/daum.py | 2 | ||||
-rw-r--r-- | youtube_dl/extractor/metacritic.py | 9 | ||||
-rw-r--r-- | youtube_dl/extractor/mixcloud.py | 12 | ||||
-rw-r--r-- | youtube_dl/extractor/mtv.py | 9 | ||||
-rw-r--r-- | youtube_dl/extractor/naver.py | 2 | ||||
-rw-r--r-- | youtube_dl/extractor/pornhub.py | 2 | ||||
-rw-r--r-- | youtube_dl/extractor/soundcloud.py | 17 | ||||
-rw-r--r-- | youtube_dl/extractor/vimeo.py | 12 | ||||
-rw-r--r-- | youtube_dl/extractor/zdf.py | 4 | ||||
-rw-r--r-- | youtube_dl/utils.py | 5 | ||||
-rw-r--r-- | youtube_dl/version.py | 2 |
16 files changed, 87 insertions, 51 deletions
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 2dd7e4907..c77777ba0 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -3,6 +3,7 @@ from __future__ import absolute_import +import collections import errno import io import json @@ -396,18 +397,17 @@ class YoutubeDL(object): template_dict['playlist_index'] = u'%05d' % template_dict['playlist_index'] sanitize = lambda k, v: sanitize_filename( - u'NA' if v is None else compat_str(v), + compat_str(v), restricted=self.params.get('restrictfilenames'), is_id=(k == u'id')) template_dict = dict((k, sanitize(k, v)) - for k, v in template_dict.items()) + for k, v in template_dict.items() + if v is not None) + template_dict = collections.defaultdict(lambda: u'NA', template_dict) tmpl = os.path.expanduser(self.params['outtmpl']) filename = tmpl % template_dict return filename - except KeyError as err: - self.report_error(u'Erroneous output template') - return None except ValueError as err: self.report_error(u'Error in output template: ' + str(err) + u' (encoding: ' + repr(preferredencoding()) + ')') return None @@ -827,7 +827,7 @@ class YoutubeDL(object): if self.params.get('writethumbnail', False): if info_dict.get('thumbnail') is not None: thumb_format = determine_ext(info_dict['thumbnail'], u'jpg') - thumb_filename = filename.rpartition('.')[0] + u'.' + thumb_format + thumb_filename = os.path.splitext(filename)[0] + u'.' + thumb_format self.to_screen(u'[%s] %s: Downloading thumbnail ...' 
% (info_dict['extractor'], info_dict['id'])) try: diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py index a527f10de..ef5644aa5 100644 --- a/youtube_dl/extractor/appletrailers.py +++ b/youtube_dl/extractor/appletrailers.py @@ -1,5 +1,4 @@ import re -import xml.etree.ElementTree import json from .common import InfoExtractor @@ -65,18 +64,18 @@ class AppleTrailersIE(InfoExtractor): uploader_id = mobj.group('company') playlist_url = compat_urlparse.urljoin(url, u'includes/playlists/itunes.inc') - playlist_snippet = self._download_webpage(playlist_url, movie) - playlist_cleaned = re.sub(r'(?s)<script[^<]*?>.*?</script>', u'', playlist_snippet) - playlist_cleaned = re.sub(r'<img ([^<]*?)>', r'<img \1/>', playlist_cleaned) - # The ' in the onClick attributes are not escaped, it couldn't be parsed - # with xml.etree.ElementTree.fromstring - # like: http://trailers.apple.com/trailers/wb/gravity/ - def _clean_json(m): - return u'iTunes.playURL(%s);' % m.group(1).replace('\'', '&#39;') - playlist_cleaned = re.sub(self._JSON_RE, _clean_json, playlist_cleaned) - playlist_html = u'<html>' + playlist_cleaned + u'</html>' + def fix_html(s): + s = re.sub(r'(?s)<script[^<]*?>.*?</script>', u'', s) + s = re.sub(r'<img ([^<]*?)>', r'<img \1/>', s) + # The ' in the onClick attributes are not escaped, it couldn't be parsed + # like: http://trailers.apple.com/trailers/wb/gravity/ + def _clean_json(m): + return u'iTunes.playURL(%s);' % m.group(1).replace('\'', '&#39;') + s = re.sub(self._JSON_RE, _clean_json, s) + s = u'<html>' + s + u'</html>' + return s + doc = self._download_xml(playlist_url, movie, transform_source=fix_html) - doc = xml.etree.ElementTree.fromstring(playlist_html) playlist = [] for li in doc.findall('./div/ul/li'): on_click = li.find('.//a').attrib['onClick'] diff --git a/youtube_dl/extractor/clipsyndicate.py b/youtube_dl/extractor/clipsyndicate.py index d4fc86973..c60089ad3 100644 --- a/youtube_dl/extractor/clipsyndicate.py +++ 
b/youtube_dl/extractor/clipsyndicate.py @@ -1,9 +1,9 @@ import re -import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( find_xpath_attr, + fix_xml_all_ampersand, ) @@ -30,12 +30,10 @@ class ClipsyndicateIE(InfoExtractor): # it includes a required token flvars = self._search_regex(r'flvars: "(.*?)"', js_player, u'flvars') - playlist_page = self._download_webpage( + pdoc = self._download_xml( 'http://eplayer.clipsyndicate.com/osmf/playlist?%s' % flvars, - video_id, u'Downloading video info') - # Fix broken xml - playlist_page = re.sub('&', '&amp;', playlist_page) - pdoc = xml.etree.ElementTree.fromstring(playlist_page.encode('utf-8')) + video_id, u'Downloading video info', + transform_source=fix_xml_all_ampersand) track_doc = pdoc.find('trackList/track') def find_param(name): diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 534908a2b..69a083b68 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -230,9 +230,12 @@ class InfoExtractor(object): return content def _download_xml(self, url_or_request, video_id, - note=u'Downloading XML', errnote=u'Unable to download XML'): + note=u'Downloading XML', errnote=u'Unable to download XML', + transform_source=None): """Return the xml as an xml.etree.ElementTree.Element""" xml_string = self._download_webpage(url_or_request, video_id, note, errnote) + if transform_source: + xml_string = transform_source(xml_string) return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8')) def to_screen(self, msg): diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 3bd0b862c..aea7e557e 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -101,10 +101,6 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor): self.to_screen(u'Vevo video detected: %s' % vevo_id) return self.url_result(u'vevo:%s' % vevo_id, ie='Vevo') - video_uploader = 
self._search_regex([r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', - # Looking for official user - r'<(?:span|a) .*?rel="author".*?>([^<]+?)</'], - webpage, 'video uploader', fatal=False) age_limit = self._rta_search(webpage) video_upload_date = None @@ -147,13 +143,15 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor): self._list_available_subtitles(video_id, webpage) return - view_count = str_to_int(self._search_regex( - r'video_views_value[^>]+>([\d\.,]+)<', webpage, u'view count')) + view_count = self._search_regex( + r'video_views_count[^>]+>\s+([\d\.,]+)', webpage, u'view count', fatal=False) + if view_count is not None: + view_count = str_to_int(view_count) return { 'id': video_id, 'formats': formats, - 'uploader': video_uploader, + 'uploader': info['owner_screenname'], 'upload_date': video_upload_date, 'title': self._og_search_title(webpage), 'subtitles': video_subtitles, diff --git a/youtube_dl/extractor/daum.py b/youtube_dl/extractor/daum.py index d418ce4a8..4876ecb48 100644 --- a/youtube_dl/extractor/daum.py +++ b/youtube_dl/extractor/daum.py @@ -9,7 +9,7 @@ from ..utils import ( class DaumIE(InfoExtractor): - _VALID_URL = r'https?://tvpot\.daum\.net/.*?clipid=(?P<id>\d+)' + _VALID_URL = r'https?://(?:m\.)?tvpot\.daum\.net/.*?clipid=(?P<id>\d+)' IE_NAME = u'daum.net' _TEST = { diff --git a/youtube_dl/extractor/metacritic.py b/youtube_dl/extractor/metacritic.py index 6b95b4998..e560c1d35 100644 --- a/youtube_dl/extractor/metacritic.py +++ b/youtube_dl/extractor/metacritic.py @@ -1,8 +1,10 @@ import re -import xml.etree.ElementTree import operator from .common import InfoExtractor +from ..utils import ( + fix_xml_all_ampersand, +) class MetacriticIE(InfoExtractor): @@ -23,9 +25,8 @@ class MetacriticIE(InfoExtractor): video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) # The xml is not well formatted, there are raw '&' - info_xml = 
self._download_webpage('http://www.metacritic.com/video_data?video=' + video_id, - video_id, u'Downloading info xml').replace('&', '&amp;') - info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8')) + info = self._download_xml('http://www.metacritic.com/video_data?video=' + video_id, + video_id, u'Downloading info xml', transform_source=fix_xml_all_ampersand) clip = next(c for c in info.findall('playList/clip') if c.find('id').text == video_id) formats = [] diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index 04fa3ac7a..125d81551 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -37,6 +37,9 @@ class MixcloudIE(InfoExtractor): return None + def _get_url(self, template_url): + return self.check_urls(template_url % i for i in range(30)) + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -52,13 +55,18 @@ class MixcloudIE(InfoExtractor): preview_url = self._search_regex(r'data-preview-url="(.+?)"', webpage, u'preview url') song_url = preview_url.replace('/previews/', '/cloudcasts/originals/') template_url = re.sub(r'(stream\d*)', 'stream%d', song_url) - final_song_url = self.check_urls(template_url % i for i in range(30)) + final_song_url = self._get_url(template_url) + if final_song_url is None: + self.to_screen('Trying with m4a extension') + template_url = template_url.replace('.mp3', '.m4a').replace('originals/', 'm4a/64/') + final_song_url = self._get_url(template_url) + if final_song_url is None: + raise ExtractorError(u'Unable to extract track url') return { 'id': track_id, 'title': info['name'], 'url': final_song_url, - 'ext': 'mp3', 'description': info.get('description'), 'thumbnail': info['pictures'].get('extra_large'), 'uploader': info['user']['name'], diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index 6b3feb560..5b2bd9633 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -82,8 +82,13 @@ class 
MTVServicesInfoExtractor(InfoExtractor): def _get_videos_info(self, uri): video_id = self._id_from_uri(uri) data = compat_urllib_parse.urlencode({'uri': uri}) - idoc = self._download_xml(self._FEED_URL +'?' + data, video_id, - u'Downloading info') + + def fix_ampersand(s): + """ Fix unencoded ampersand in XML """ + return s.replace(u'& ', '&amp; ') + idoc = self._download_xml( + self._FEED_URL + '?' + data, video_id, + u'Downloading info', transform_source=fix_ampersand) return [self._get_video_info(item) for item in idoc.findall('.//item')] diff --git a/youtube_dl/extractor/naver.py b/youtube_dl/extractor/naver.py index c012ec0cf..4cab30631 100644 --- a/youtube_dl/extractor/naver.py +++ b/youtube_dl/extractor/naver.py @@ -9,7 +9,7 @@ from ..utils import ( class NaverIE(InfoExtractor): - _VALID_URL = r'https?://tvcast\.naver\.com/v/(?P<id>\d+)' + _VALID_URL = r'https?://(?:m\.)?tvcast\.naver\.com/v/(?P<id>\d+)' _TEST = { u'url': u'http://tvcast.naver.com/v/81652', diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 8b3471919..d9135c6b9 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -12,7 +12,7 @@ from ..aes import ( ) class PornHubIE(InfoExtractor): - _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>pornhub\.com/view_video\.php\?viewkey=(?P<videoid>[0-9]+))' + _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>pornhub\.com/view_video\.php\?viewkey=(?P<videoid>[0-9a-f]+))' _TEST = { u'url': u'http://www.pornhub.com/view_video.php?viewkey=648719015', u'file': u'648719015.mp4', diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 5c026c0b8..cbba4094b 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -73,6 +73,19 @@ class SoundcloudIE(InfoExtractor): u'upload_date': u'20131209', }, }, + # downloadable song + { + u'url': u'https://soundcloud.com/simgretina/just-your-problem-baby-1', + u'md5': u'56a8b69568acaa967b4c49f9d1d52d19', 
+ u'info_dict': { + u'id': u'105614606', + u'ext': u'wav', + u'title': u'Just Your Problem Baby (Acapella)', + u'description': u'Vocals', + u'uploader': u'Sim Gretina', + u'upload_date': u'20130815', + }, + }, ] _CLIENT_ID = 'b45b1aa10f1ac2941910a7f0d10f8e28' @@ -99,7 +112,7 @@ class SoundcloudIE(InfoExtractor): thumbnail = info['artwork_url'] if thumbnail is not None: thumbnail = thumbnail.replace('-large', '-t500x500') - ext = info.get('original_format', u'mp3') + ext = u'mp3' result = { 'id': track_id, 'uploader': info['user']['username'], @@ -115,7 +128,7 @@ class SoundcloudIE(InfoExtractor): track_id, self._CLIENT_ID)) result['formats'] = [{ 'format_id': 'download', - 'ext': ext, + 'ext': info.get('original_format', u'mp3'), 'url': format_url, 'vcodec': 'none', }] diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index fb2bd225a..ea4409528 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -115,7 +115,7 @@ class VimeoIE(InfoExtractor): def _real_initialize(self): self._login() - def _real_extract(self, url, new_video=True): + def _real_extract(self, url): url, data = unsmuggle_url(url) headers = std_headers if data is not None: @@ -151,8 +151,14 @@ class VimeoIE(InfoExtractor): config = json.loads(config_json) except RegexNotFoundError: # For pro videos or player.vimeo.com urls - config = self._search_regex([r' = {config:({.+?}),assets:', r'(?:c|b)=({.+?});'], - webpage, u'info section', flags=re.DOTALL) + # We try to find out to which variable is assigned the config dic + m_variable_name = re.search('(\w)\.video\.id', webpage) + if m_variable_name is not None: + config_re = r'%s=({.+?});' % re.escape(m_variable_name.group(1)) + else: + config_re = [r' = {config:({.+?}),assets:', r'(?:[abc])=({.+?});'] + config = self._search_regex(config_re, webpage, u'info section', + flags=re.DOTALL) config = json.loads(config) except Exception as e: if re.search('The creator of this video has not given you permission 
to embed it on this domain.', webpage): diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py index 689f19735..35ece354a 100644 --- a/youtube_dl/extractor/zdf.py +++ b/youtube_dl/extractor/zdf.py @@ -73,14 +73,14 @@ class ZDFIE(InfoExtractor): try: proto_pref = -PROTO_ORDER.index(format_m.group('proto')) except ValueError: - proto_pref = 999 + proto_pref = -999 quality = fnode.find('./quality').text QUALITY_ORDER = ['veryhigh', '300', 'high', 'med', 'low'] try: quality_pref = -QUALITY_ORDER.index(quality) except ValueError: - quality_pref = 999 + quality_pref = -999 abr = int(fnode.find('./audioBitrate').text) // 1000 vbr = int(fnode.find('./videoBitrate').text) // 1000 diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index d9bf6c24c..4e8a84a56 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1057,3 +1057,8 @@ def month_by_name(name): return ENGLISH_NAMES.index(name) + 1 except ValueError: return None + + +def fix_xml_all_ampersand(xml_str): + """Replace all the '&' by '&amp;' in XML""" + return xml_str.replace(u'&', u'&amp;') diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 8906d6090..b9a52fcfa 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.12.09.4' +__version__ = '2013.12.11.2' |