diff options
25 files changed, 321 insertions, 125 deletions
@@ -56,6 +56,10 @@ which means you can modify it, redistribute it or use it however you like. --date DATE download only videos uploaded in this date --datebefore DATE download only videos uploaded before this date --dateafter DATE download only videos uploaded after this date + --min-views COUNT Do not download any videos with less than COUNT + views + --max-views COUNT Do not download any videos with more than COUNT + views --no-playlist download only the currently playing video --age-limit YEARS download only videos suitable for the given age --download-archive FILE Download only videos not listed in the archive @@ -127,6 +131,7 @@ which means you can modify it, redistribute it or use it however you like. --get-id simulate, quiet but print id --get-thumbnail simulate, quiet but print thumbnail URL --get-description simulate, quiet but print video description + --get-duration simulate, quiet but print video length --get-filename simulate, quiet but print output filename --get-format simulate, quiet but print output format -j, --dump-json simulate, quiet but print JSON information diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 58cf9c313..3100c362a 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -7,6 +7,7 @@ import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from test.helper import FakeYDL +from youtube_dl import YoutubeDL class YDL(FakeYDL): @@ -140,6 +141,20 @@ class TestFormatSelection(unittest.TestCase): self.assertEqual(test_dict['extractor'], 'Foo') self.assertEqual(test_dict['playlist'], 'funny videos') + def test_prepare_filename(self): + info = { + u'id': u'1234', + u'ext': u'mp4', + u'width': None, + } + def fname(templ): + ydl = YoutubeDL({'outtmpl': templ}) + return ydl.prepare_filename(info) + self.assertEqual(fname(u'%(id)s.%(ext)s'), u'1234.mp4') + self.assertEqual(fname(u'%(id)s-%(width)s.%(ext)s'), u'1234-NA.mp4') + # Replace missing fields with 'NA' + 
self.assertEqual(fname(u'%(uploader_date)s-%(id)s.%(ext)s'), u'NA-1234.mp4') + if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 2dd7e4907..2a4ab674d 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -3,6 +3,7 @@ from __future__ import absolute_import +import collections import errno import io import json @@ -33,6 +34,7 @@ from .utils import ( encodeFilename, ExtractorError, format_bytes, + formatSeconds, get_term_width, locked_file, make_HTTPS_handler, @@ -93,6 +95,7 @@ class YoutubeDL(object): forcethumbnail: Force printing thumbnail URL. forcedescription: Force printing description. forcefilename: Force printing final filename. + forceduration: Force printing duration. forcejson: Force printing info_dict as JSON. simulate: Do not download the video files. format: Video format code. @@ -126,7 +129,16 @@ class YoutubeDL(object): noplaylist: Download single video instead of a playlist if in doubt. age_limit: An integer representing the user's age in years. Unsuitable videos for the given age are skipped. - download_archive: File name of a file where all downloads are recorded. + min_views: An integer representing the minimum view count the video + must have in order to not be skipped. + Videos without view count information are always + downloaded. None for no limit. + max_views: An integer representing the maximum view count. + Videos that are more popular than that are not + downloaded. + Videos without view count information are always + downloaded. None for no limit. + download_archive: File name of a file where all downloads are recorded. Videos already present in the file are not downloaded again. cookiefile: File name where cookies should be read from and dumped to. 
@@ -354,22 +366,6 @@ class YoutubeDL(object): error_message = u'%s %s' % (_msg_header, message) self.trouble(error_message, tb) - def report_writedescription(self, descfn): - """ Report that the description file is being written """ - self.to_screen(u'[info] Writing video description to: ' + descfn) - - def report_writesubtitles(self, sub_filename): - """ Report that the subtitles file is being written """ - self.to_screen(u'[info] Writing video subtitles to: ' + sub_filename) - - def report_writeinfojson(self, infofn): - """ Report that the metadata file has been written """ - self.to_screen(u'[info] Video description metadata as JSON to: ' + infofn) - - def report_writeannotations(self, annofn): - """ Report that the annotations file has been written. """ - self.to_screen(u'[info] Writing video annotations to: ' + annofn) - def report_file_already_downloaded(self, file_name): """Report file has already been fully downloaded.""" try: @@ -396,18 +392,17 @@ class YoutubeDL(object): template_dict['playlist_index'] = u'%05d' % template_dict['playlist_index'] sanitize = lambda k, v: sanitize_filename( - u'NA' if v is None else compat_str(v), + compat_str(v), restricted=self.params.get('restrictfilenames'), is_id=(k == u'id')) template_dict = dict((k, sanitize(k, v)) - for k, v in template_dict.items()) + for k, v in template_dict.items() + if v is not None) + template_dict = collections.defaultdict(lambda: u'NA', template_dict) tmpl = os.path.expanduser(self.params['outtmpl']) filename = tmpl % template_dict return filename - except KeyError as err: - self.report_error(u'Erroneous output template') - return None except ValueError as err: self.report_error(u'Error in output template: ' + str(err) + u' (encoding: ' + repr(preferredencoding()) + ')') return None @@ -415,13 +410,14 @@ class YoutubeDL(object): def _match_entry(self, info_dict): """ Returns None iff the file should be downloaded """ + video_title = info_dict.get('title', info_dict.get('id', u'video')) if 
'title' in info_dict: # This can happen when we're just evaluating the playlist title = info_dict['title'] matchtitle = self.params.get('matchtitle', False) if matchtitle: if not re.search(matchtitle, title, re.IGNORECASE): - return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"' + return u'"' + title + '" title did not match pattern "' + matchtitle + '"' rejecttitle = self.params.get('rejecttitle', False) if rejecttitle: if re.search(rejecttitle, title, re.IGNORECASE): @@ -430,14 +426,21 @@ class YoutubeDL(object): if date is not None: dateRange = self.params.get('daterange', DateRange()) if date not in dateRange: - return u'[download] %s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange) + return u'%s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange) + view_count = info_dict.get('view_count', None) + if view_count is not None: + min_views = self.params.get('min_views') + if min_views is not None and view_count < min_views: + return u'Skipping %s, because it has not reached minimum view count (%d/%d)' % (video_title, view_count, min_views) + max_views = self.params.get('max_views') + if max_views is not None and view_count > max_views: + return u'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views) age_limit = self.params.get('age_limit') if age_limit is not None: if age_limit < info_dict.get('age_limit', 0): return u'Skipping "' + title + '" because it is age restricted' if self.in_download_archive(info_dict): - return (u'%s has already been recorded in archive' - % info_dict.get('title', info_dict.get('id', u'video'))) + return u'%s has already been recorded in archive' % video_title return None @staticmethod @@ -748,6 +751,8 @@ class YoutubeDL(object): self.to_stdout(info_dict['description']) if self.params.get('forcefilename', False) and filename is not None: self.to_stdout(filename) + if 
self.params.get('forceduration', False) and info_dict.get('duration') is not None: + self.to_stdout(formatSeconds(info_dict['duration'])) if self.params.get('forceformat', False): self.to_stdout(info_dict['format']) if self.params.get('forcejson', False): @@ -770,28 +775,34 @@ class YoutubeDL(object): return if self.params.get('writedescription', False): - try: - descfn = filename + u'.description' - self.report_writedescription(descfn) - with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile: - descfile.write(info_dict['description']) - except (KeyError, TypeError): - self.report_warning(u'There\'s no description to write.') - except (OSError, IOError): - self.report_error(u'Cannot write description file ' + descfn) - return + descfn = filename + u'.description' + if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(descfn)): + self.to_screen(u'[info] Video description is already present') + else: + try: + self.to_screen(u'[info] Writing video description to: ' + descfn) + with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile: + descfile.write(info_dict['description']) + except (KeyError, TypeError): + self.report_warning(u'There\'s no description to write.') + except (OSError, IOError): + self.report_error(u'Cannot write description file ' + descfn) + return if self.params.get('writeannotations', False): - try: - annofn = filename + u'.annotations.xml' - self.report_writeannotations(annofn) - with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile: - annofile.write(info_dict['annotations']) - except (KeyError, TypeError): - self.report_warning(u'There are no annotations to write.') - except (OSError, IOError): - self.report_error(u'Cannot write annotations file: ' + annofn) - return + annofn = filename + u'.annotations.xml' + if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(annofn)): + self.to_screen(u'[info] Video annotations are already present') + else: + try: 
+ self.to_screen(u'[info] Writing video annotations to: ' + annofn) + with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile: + annofile.write(info_dict['annotations']) + except (KeyError, TypeError): + self.report_warning(u'There are no annotations to write.') + except (OSError, IOError): + self.report_error(u'Cannot write annotations file: ' + annofn) + return subtitles_are_requested = any([self.params.get('writesubtitles', False), self.params.get('writeautomaticsub')]) @@ -807,38 +818,48 @@ class YoutubeDL(object): continue try: sub_filename = subtitles_filename(filename, sub_lang, sub_format) - self.report_writesubtitles(sub_filename) - with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile: - subfile.write(sub) + if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)): + self.to_screen(u'[info] Video subtitle %s.%s is already_present' % (sub_lang, sub_format)) + else: + self.to_screen(u'[info] Writing video subtitles to: ' + sub_filename) + with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile: + subfile.write(sub) except (OSError, IOError): self.report_error(u'Cannot write subtitles file ' + descfn) return if self.params.get('writeinfojson', False): infofn = os.path.splitext(filename)[0] + u'.info.json' - self.report_writeinfojson(infofn) - try: - json_info_dict = dict((k, v) for k, v in info_dict.items() if not k in ['urlhandle']) - write_json_file(json_info_dict, encodeFilename(infofn)) - except (OSError, IOError): - self.report_error(u'Cannot write metadata to JSON file ' + infofn) - return + if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(infofn)): + self.to_screen(u'[info] Video description metadata is already present') + else: + self.to_screen(u'[info] Writing video description metadata as JSON to: ' + infofn) + try: + json_info_dict = dict((k, v) for k, v in info_dict.items() if not k in ['urlhandle']) + 
write_json_file(json_info_dict, encodeFilename(infofn)) + except (OSError, IOError): + self.report_error(u'Cannot write metadata to JSON file ' + infofn) + return if self.params.get('writethumbnail', False): if info_dict.get('thumbnail') is not None: thumb_format = determine_ext(info_dict['thumbnail'], u'jpg') - thumb_filename = filename.rpartition('.')[0] + u'.' + thumb_format - self.to_screen(u'[%s] %s: Downloading thumbnail ...' % - (info_dict['extractor'], info_dict['id'])) - try: - uf = compat_urllib_request.urlopen(info_dict['thumbnail']) - with open(thumb_filename, 'wb') as thumbf: - shutil.copyfileobj(uf, thumbf) - self.to_screen(u'[%s] %s: Writing thumbnail to: %s' % - (info_dict['extractor'], info_dict['id'], thumb_filename)) - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self.report_warning(u'Unable to download thumbnail "%s": %s' % - (info_dict['thumbnail'], compat_str(err))) + thumb_filename = os.path.splitext(filename)[0] + u'.' + thumb_format + if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(infofn)): + self.to_screen(u'[%s] %s: Thumbnail is already present' % + (info_dict['extractor'], info_dict['id'])) + else: + self.to_screen(u'[%s] %s: Downloading thumbnail ...' 
% + (info_dict['extractor'], info_dict['id'])) + try: + uf = compat_urllib_request.urlopen(info_dict['thumbnail']) + with open(thumb_filename, 'wb') as thumbf: + shutil.copyfileobj(uf, thumbf) + self.to_screen(u'[%s] %s: Writing thumbnail to: %s' % + (info_dict['extractor'], info_dict['id'], thumb_filename)) + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + self.report_warning(u'Unable to download thumbnail "%s": %s' % + (info_dict['thumbnail'], compat_str(err))) if not self.params.get('skip_download', False): if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(filename)): diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 3e82cd637..0775b72fd 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -37,6 +37,7 @@ __authors__ = ( 'Anton Larionov', 'Takuya Tsuchida', 'Sergey M.', + 'Michael Orlitzky', ) __license__ = 'Public Domain' @@ -62,6 +63,7 @@ from .utils import ( MaxDownloadsReached, preferredencoding, SameFileError, + setproctitle, std_headers, write_string, ) @@ -210,6 +212,14 @@ def parseOpts(overrideArguments=None): selection.add_option('--date', metavar='DATE', dest='date', help='download only videos uploaded in this date', default=None) selection.add_option('--datebefore', metavar='DATE', dest='datebefore', help='download only videos uploaded before this date', default=None) selection.add_option('--dateafter', metavar='DATE', dest='dateafter', help='download only videos uploaded after this date', default=None) + selection.add_option( + '--min-views', metavar='COUNT', dest='min_views', + default=None, type=int, + help="Do not download any videos with less than COUNT views",) + selection.add_option( + '--max-views', metavar='COUNT', dest='max_views', + default=None, type=int, + help="Do not download any videos with more than COUNT views",) selection.add_option('--no-playlist', action='store_true', dest='noplaylist', help='download only the currently playing 
video', default=False) selection.add_option('--age-limit', metavar='YEARS', dest='age_limit', help='download only videos suitable for the given age', @@ -290,6 +300,9 @@ def parseOpts(overrideArguments=None): verbosity.add_option('--get-description', action='store_true', dest='getdescription', help='simulate, quiet but print video description', default=False) + verbosity.add_option('--get-duration', + action='store_true', dest='getduration', + help='simulate, quiet but print video length', default=False) verbosity.add_option('--get-filename', action='store_true', dest='getfilename', help='simulate, quiet but print output filename', default=False) @@ -460,12 +473,15 @@ def parseOpts(overrideArguments=None): return parser, opts, args + def _real_main(argv=None): # Compatibility fixes for Windows if sys.platform == 'win32': # https://github.com/rg3/youtube-dl/issues/820 codecs.register(lambda name: codecs.lookup('utf-8') if name == 'cp65001' else None) + setproctitle(u'youtube-dl') + parser, opts, args = parseOpts(argv) # Set user agent @@ -604,27 +620,30 @@ def _real_main(argv=None): or (opts.useid and u'%(id)s.%(ext)s') or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s') or u'%(title)s-%(id)s.%(ext)s') - if '%(ext)s' not in outtmpl and opts.extractaudio: + if not os.path.splitext(outtmpl)[1] and opts.extractaudio: parser.error(u'Cannot download a video and extract audio into the same' - u' file! Use "%%(ext)s" instead of %r' % - determine_ext(outtmpl, u'')) + u' file! 
Use "{0}.%(ext)s" instead of "{0}" as the output' + u' template'.format(outtmpl)) + + any_printing = opts.geturl or opts.gettitle or opts.getid or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat or opts.getduration or opts.dumpjson ydl_opts = { 'usenetrc': opts.usenetrc, 'username': opts.username, 'password': opts.password, 'videopassword': opts.videopassword, - 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getid or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat or opts.dumpjson), + 'quiet': (opts.quiet or any_printing), 'forceurl': opts.geturl, 'forcetitle': opts.gettitle, 'forceid': opts.getid, 'forcethumbnail': opts.getthumbnail, 'forcedescription': opts.getdescription, + 'forceduration': opts.getduration, 'forcefilename': opts.getfilename, 'forceformat': opts.getformat, 'forcejson': opts.dumpjson, 'simulate': opts.simulate, - 'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getid or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat or opts.dumpjson), + 'skip_download': (opts.skip_download or opts.simulate or any_printing), 'format': opts.format, 'format_limit': opts.format_limit, 'listformats': opts.listformats, @@ -668,6 +687,8 @@ def _real_main(argv=None): 'keepvideo': opts.keepvideo, 'min_filesize': opts.min_filesize, 'max_filesize': opts.max_filesize, + 'min_views': opts.min_views, + 'max_views': opts.max_views, 'daterange': date, 'cachedir': opts.cachedir, 'youtube_print_sig_code': opts.youtube_print_sig_code, diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index fb206a742..f01fa2cde 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -20,6 +20,7 @@ from .brightcove import BrightcoveIE from .c56 import C56IE from .canalplus import CanalplusIE from .canalc2 import Canalc2IE +from .cbs import CBSIE from .channel9 import Channel9IE from 
.cinemassacre import CinemassacreIE from .clipfish import ClipfishIE @@ -112,6 +113,7 @@ from .orf import ORFIE from .pbs import PBSIE from .photobucket import PhotobucketIE from .podomatic import PodomaticIE +from .pornhd import PornHdIE from .pornhub import PornHubIE from .pornotube import PornotubeIE from .pyvideo import PyvideoIE diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py index a527f10de..ef5644aa5 100644 --- a/youtube_dl/extractor/appletrailers.py +++ b/youtube_dl/extractor/appletrailers.py @@ -1,5 +1,4 @@ import re -import xml.etree.ElementTree import json from .common import InfoExtractor @@ -65,18 +64,18 @@ class AppleTrailersIE(InfoExtractor): uploader_id = mobj.group('company') playlist_url = compat_urlparse.urljoin(url, u'includes/playlists/itunes.inc') - playlist_snippet = self._download_webpage(playlist_url, movie) - playlist_cleaned = re.sub(r'(?s)<script[^<]*?>.*?</script>', u'', playlist_snippet) - playlist_cleaned = re.sub(r'<img ([^<]*?)>', r'<img \1/>', playlist_cleaned) - # The ' in the onClick attributes are not escaped, it couldn't be parsed - # with xml.etree.ElementTree.fromstring - # like: http://trailers.apple.com/trailers/wb/gravity/ - def _clean_json(m): - return u'iTunes.playURL(%s);' % m.group(1).replace('\'', '&#39;') - playlist_cleaned = re.sub(self._JSON_RE, _clean_json, playlist_cleaned) - playlist_html = u'<html>' + playlist_cleaned + u'</html>' + def fix_html(s): + s = re.sub(r'(?s)<script[^<]*?>.*?</script>', u'', s) + s = re.sub(r'<img ([^<]*?)>', r'<img \1/>', s) + # The ' in the onClick attributes are not escaped, it couldn't be parsed + # like: http://trailers.apple.com/trailers/wb/gravity/ + def _clean_json(m): + return u'iTunes.playURL(%s);' % m.group(1).replace('\'', '&#39;') + s = re.sub(self._JSON_RE, _clean_json, s) + s = u'<html>' + s + u'</html>' + return s + doc = self._download_xml(playlist_url, movie, transform_source=fix_html) - doc = 
xml.etree.ElementTree.fromstring(playlist_html) playlist = [] for li in doc.findall('./div/ul/li'): on_click = li.find('.//a').attrib['onClick'] diff --git a/youtube_dl/extractor/cbs.py b/youtube_dl/extractor/cbs.py new file mode 100644 index 000000000..ac0315853 --- /dev/null +++ b/youtube_dl/extractor/cbs.py @@ -0,0 +1,30 @@ +import re + +from .common import InfoExtractor + + +class CBSIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?cbs\.com/shows/[^/]+/video/(?P<id>[^/]+)/.*' + + _TEST = { + u'url': u'http://www.cbs.com/shows/garth-brooks/video/_u7W953k6la293J7EPTd9oHkSPs6Xn6_/connect-chat-feat-garth-brooks/', + u'file': u'4JUVEwq3wUT7.flv', + u'info_dict': { + u'title': u'Connect Chat feat. Garth Brooks', + u'description': u'Connect with country music singer Garth Brooks, as he chats with fans on Wednesday November 27, 2013. Be sure to tune in to Garth Brooks: Live from Las Vegas, Friday November 29, at 9/8c on CBS!', + u'duration': 1495, + }, + u'params': { + # rtmp download + u'skip_download': True, + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + webpage = self._download_webpage(url, video_id) + real_id = self._search_regex( + r"video\.settings\.pid\s*=\s*'([^']+)';", + webpage, u'real video ID') + return self.url_result(u'theplatform:%s' % real_id) diff --git a/youtube_dl/extractor/clipsyndicate.py b/youtube_dl/extractor/clipsyndicate.py index d4fc86973..c60089ad3 100644 --- a/youtube_dl/extractor/clipsyndicate.py +++ b/youtube_dl/extractor/clipsyndicate.py @@ -1,9 +1,9 @@ import re -import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( find_xpath_attr, + fix_xml_all_ampersand, ) @@ -30,12 +30,10 @@ class ClipsyndicateIE(InfoExtractor): # it includes a required token flvars = self._search_regex(r'flvars: "(.*?)"', js_player, u'flvars') - playlist_page = self._download_webpage( + pdoc = self._download_xml( 
'http://eplayer.clipsyndicate.com/osmf/playlist?%s' % flvars, - video_id, u'Downloading video info') - # Fix broken xml - playlist_page = re.sub('&', '&amp;', playlist_page) - pdoc = xml.etree.ElementTree.fromstring(playlist_page.encode('utf-8')) + video_id, u'Downloading video info', + transform_source=fix_xml_all_ampersand) track_doc = pdoc.find('trackList/track') def find_param(name): diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 534908a2b..fe8ce9e6c 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -34,8 +34,8 @@ class InfoExtractor(object): The dictionaries must include the following fields: id: Video identifier. - url: Final video URL. title: Video title, unescaped. + url: Final video URL. ext: Video filename extension. Instead of url and ext, formats can also specified. @@ -54,6 +54,7 @@ class InfoExtractor(object): player_url: SWF Player URL (used for rtmpdump). subtitles: The subtitle file contents as a dictionary in the format {language: subtitles}. + duration: Length of the video in seconds, as an integer. view_count: How many users have watched the video on the platform. 
like_count: Number of positive ratings of the video dislike_count: Number of negative ratings of the video @@ -230,9 +231,12 @@ class InfoExtractor(object): return content def _download_xml(self, url_or_request, video_id, - note=u'Downloading XML', errnote=u'Unable to download XML'): + note=u'Downloading XML', errnote=u'Unable to download XML', + transform_source=None): """Return the xml as an xml.etree.ElementTree.Element""" xml_string = self._download_webpage(url_or_request, video_id, note, errnote) + if transform_source: + xml_string = transform_source(xml_string) return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8')) def to_screen(self, msg): diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 3bd0b862c..6685c94a3 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -28,7 +28,7 @@ class DailymotionBaseInfoExtractor(InfoExtractor): class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor): """Information Extractor for Dailymotion""" - _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/(?:embed/)?video/([^/]+)' + _VALID_URL = r'(?i)(?:https?://)?(?:(www|touch)\.)?dailymotion\.[a-z]{2,3}/(?:(embed|#)/)?video/(?P<id>[^/?_]+)' IE_NAME = u'dailymotion' _FORMATS = [ @@ -81,7 +81,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor): # Extract id and simplified title from URL mobj = re.match(self._VALID_URL, url) - video_id = mobj.group(1).split('_')[0].split('?')[0] + video_id = mobj.group('id') url = 'http://www.dailymotion.com/video/%s' % video_id @@ -101,10 +101,6 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor): self.to_screen(u'Vevo video detected: %s' % vevo_id) return self.url_result(u'vevo:%s' % vevo_id, ie='Vevo') - video_uploader = self._search_regex([r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', - # Looking for official user - r'<(?:span|a) 
.*?rel="author".*?>([^<]+?)</'], - webpage, 'video uploader', fatal=False) age_limit = self._rta_search(webpage) video_upload_date = None @@ -147,13 +143,15 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor): self._list_available_subtitles(video_id, webpage) return - view_count = str_to_int(self._search_regex( - r'video_views_value[^>]+>([\d\.,]+)<', webpage, u'view count')) + view_count = self._search_regex( + r'video_views_count[^>]+>\s+([\d\.,]+)', webpage, u'view count', fatal=False) + if view_count is not None: + view_count = str_to_int(view_count) return { 'id': video_id, 'formats': formats, - 'uploader': video_uploader, + 'uploader': info['owner_screenname'], 'upload_date': video_upload_date, 'title': self._og_search_title(webpage), 'subtitles': video_subtitles, diff --git a/youtube_dl/extractor/daum.py b/youtube_dl/extractor/daum.py index d418ce4a8..4876ecb48 100644 --- a/youtube_dl/extractor/daum.py +++ b/youtube_dl/extractor/daum.py @@ -9,7 +9,7 @@ from ..utils import ( class DaumIE(InfoExtractor): - _VALID_URL = r'https?://tvpot\.daum\.net/.*?clipid=(?P<id>\d+)' + _VALID_URL = r'https?://(?:m\.)?tvpot\.daum\.net/.*?clipid=(?P<id>\d+)' IE_NAME = u'daum.net' _TEST = { diff --git a/youtube_dl/extractor/ign.py b/youtube_dl/extractor/ign.py index 57b79a336..381af91e4 100644 --- a/youtube_dl/extractor/ign.py +++ b/youtube_dl/extractor/ign.py @@ -44,7 +44,7 @@ class IGNIE(InfoExtractor): { u'file': u'638672ee848ae4ff108df2a296418ee2.mp4', u'info_dict': { - u'title': u'GTA 5\'s Twisted Beauty in Super Slow Motion', + u'title': u'26 Twisted Moments from GTA 5 in Slow Motion', u'description': u'The twisted beauty of GTA 5 in stunning slow motion.', }, }, diff --git a/youtube_dl/extractor/metacritic.py b/youtube_dl/extractor/metacritic.py index 6b95b4998..e560c1d35 100644 --- a/youtube_dl/extractor/metacritic.py +++ b/youtube_dl/extractor/metacritic.py @@ -1,8 +1,10 @@ import re -import xml.etree.ElementTree import operator from .common 
import InfoExtractor +from ..utils import ( + fix_xml_all_ampersand, +) class MetacriticIE(InfoExtractor): @@ -23,9 +25,8 @@ class MetacriticIE(InfoExtractor): video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) # The xml is not well formatted, there are raw '&' - info_xml = self._download_webpage('http://www.metacritic.com/video_data?video=' + video_id, - video_id, u'Downloading info xml').replace('&', '&amp;') - info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8')) + info = self._download_xml('http://www.metacritic.com/video_data?video=' + video_id, + video_id, u'Downloading info xml', transform_source=fix_xml_all_ampersand) clip = next(c for c in info.findall('playList/clip') if c.find('id').text == video_id) formats = [] diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index 04fa3ac7a..125d81551 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -37,6 +37,9 @@ class MixcloudIE(InfoExtractor): return None + def _get_url(self, template_url): + return self.check_urls(template_url % i for i in range(30)) + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -52,13 +55,18 @@ class MixcloudIE(InfoExtractor): preview_url = self._search_regex(r'data-preview-url="(.+?)"', webpage, u'preview url') song_url = preview_url.replace('/previews/', '/cloudcasts/originals/') template_url = re.sub(r'(stream\d*)', 'stream%d', song_url) - final_song_url = self.check_urls(template_url % i for i in range(30)) + final_song_url = self._get_url(template_url) + if final_song_url is None: + self.to_screen('Trying with m4a extension') + template_url = template_url.replace('.mp3', '.m4a').replace('originals/', 'm4a/64/') + final_song_url = self._get_url(template_url) + if final_song_url is None: + raise ExtractorError(u'Unable to extract track url') return { 'id': track_id, 'title': info['name'], 'url': final_song_url, - 'ext': 'mp3', 'description': 
info.get('description'), 'thumbnail': info['pictures'].get('extra_large'), 'uploader': info['user']['name'], diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index 6b3feb560..5b2bd9633 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -82,8 +82,13 @@ class MTVServicesInfoExtractor(InfoExtractor): def _get_videos_info(self, uri): video_id = self._id_from_uri(uri) data = compat_urllib_parse.urlencode({'uri': uri}) - idoc = self._download_xml(self._FEED_URL +'?' + data, video_id, - u'Downloading info') + + def fix_ampersand(s): + """ Fix unencoded ampersand in XML """ + return s.replace(u'& ', '&amp; ') + idoc = self._download_xml( + self._FEED_URL + '?' + data, video_id, + u'Downloading info', transform_source=fix_ampersand) return [self._get_video_info(item) for item in idoc.findall('.//item')] diff --git a/youtube_dl/extractor/naver.py b/youtube_dl/extractor/naver.py index c012ec0cf..4cab30631 100644 --- a/youtube_dl/extractor/naver.py +++ b/youtube_dl/extractor/naver.py @@ -9,7 +9,7 @@ from ..utils import ( class NaverIE(InfoExtractor): - _VALID_URL = r'https?://tvcast\.naver\.com/v/(?P<id>\d+)' + _VALID_URL = r'https?://(?:m\.)?tvcast\.naver\.com/v/(?P<id>\d+)' _TEST = { u'url': u'http://tvcast.naver.com/v/81652', diff --git a/youtube_dl/extractor/pornhd.py b/youtube_dl/extractor/pornhd.py new file mode 100644 index 000000000..71abd5013 --- /dev/null +++ b/youtube_dl/extractor/pornhd.py @@ -0,0 +1,38 @@ +import re + +from .common import InfoExtractor +from ..utils import compat_urllib_parse + + +class PornHdIE(InfoExtractor): + _VALID_URL = r'(?:http://)?(?:www\.)?pornhd\.com/videos/(?P<video_id>[0-9]+)/(?P<video_title>.+)' + _TEST = { + u'url': u'http://www.pornhd.com/videos/1962/sierra-day-gets-his-cum-all-over-herself-hd-porn-video', + u'file': u'1962.flv', + u'md5': u'35272469887dca97abd30abecc6cdf75', + u'info_dict': { + u"title": u"sierra-day-gets-his-cum-all-over-herself-hd-porn-video", + u"age_limit": 18, + } 
+ } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + + video_id = mobj.group('video_id') + video_title = mobj.group('video_title') + + webpage = self._download_webpage(url, video_id) + + video_url = self._html_search_regex( + r'&hd=(http.+?)&', webpage, u'video URL') + video_url = compat_urllib_parse.unquote(video_url) + age_limit = 18 + + return { + 'id': video_id, + 'url': video_url, + 'ext': 'flv', + 'title': video_title, + 'age_limit': age_limit, + } diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 8b3471919..d9135c6b9 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -12,7 +12,7 @@ from ..aes import ( ) class PornHubIE(InfoExtractor): - _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>pornhub\.com/view_video\.php\?viewkey=(?P<videoid>[0-9]+))' + _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>pornhub\.com/view_video\.php\?viewkey=(?P<videoid>[0-9a-f]+))' _TEST = { u'url': u'http://www.pornhub.com/view_video.php?viewkey=648719015', u'file': u'648719015.mp4', diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 5c026c0b8..cbba4094b 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -73,6 +73,19 @@ class SoundcloudIE(InfoExtractor): u'upload_date': u'20131209', }, }, + # downloadable song + { + u'url': u'https://soundcloud.com/simgretina/just-your-problem-baby-1', + u'md5': u'56a8b69568acaa967b4c49f9d1d52d19', + u'info_dict': { + u'id': u'105614606', + u'ext': u'wav', + u'title': u'Just Your Problem Baby (Acapella)', + u'description': u'Vocals', + u'uploader': u'Sim Gretina', + u'upload_date': u'20130815', + }, + }, ] _CLIENT_ID = 'b45b1aa10f1ac2941910a7f0d10f8e28' @@ -99,7 +112,7 @@ class SoundcloudIE(InfoExtractor): thumbnail = info['artwork_url'] if thumbnail is not None: thumbnail = thumbnail.replace('-large', '-t500x500') - ext = info.get('original_format', u'mp3') + ext = u'mp3' 
result = { 'id': track_id, 'uploader': info['user']['username'], @@ -115,7 +128,7 @@ class SoundcloudIE(InfoExtractor): track_id, self._CLIENT_ID)) result['formats'] = [{ 'format_id': 'download', - 'ext': ext, + 'ext': info.get('original_format', u'mp3'), 'url': format_url, 'vcodec': 'none', }] diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index 61452e47d..cec65261b 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -3,6 +3,7 @@ import json from .common import InfoExtractor from ..utils import ( + ExtractorError, xpath_with_ns, ) @@ -32,6 +33,17 @@ class ThePlatformIE(InfoExtractor): smil_url = ('http://link.theplatform.com/s/dJ5BDC/{0}/meta.smil?' 'format=smil&mbr=true'.format(video_id)) meta = self._download_xml(smil_url, video_id) + + try: + error_msg = next( + n.attrib['abstract'] + for n in meta.findall(_x('.//smil:ref')) + if n.attrib.get('title') == u'Geographic Restriction') + except StopIteration: + pass + else: + raise ExtractorError(error_msg, expected=True) + info_url = 'http://link.theplatform.com/s/dJ5BDC/{0}?format=preview'.format(video_id) info_json = self._download_webpage(info_url, video_id) info = json.loads(info_json) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index fb2bd225a..ea4409528 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -115,7 +115,7 @@ class VimeoIE(InfoExtractor): def _real_initialize(self): self._login() - def _real_extract(self, url, new_video=True): + def _real_extract(self, url): url, data = unsmuggle_url(url) headers = std_headers if data is not None: @@ -151,8 +151,14 @@ class VimeoIE(InfoExtractor): config = json.loads(config_json) except RegexNotFoundError: # For pro videos or player.vimeo.com urls - config = self._search_regex([r' = {config:({.+?}),assets:', r'(?:c|b)=({.+?});'], - webpage, u'info section', flags=re.DOTALL) + # We try to find out to which variable is 
assigned the config dic + m_variable_name = re.search('(\w)\.video\.id', webpage) + if m_variable_name is not None: + config_re = r'%s=({.+?});' % re.escape(m_variable_name.group(1)) + else: + config_re = [r' = {config:({.+?}),assets:', r'(?:[abc])=({.+?});'] + config = self._search_regex(config_re, webpage, u'info section', + flags=re.DOTALL) config = json.loads(config) except Exception as e: if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage): diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 874429b78..a68a214ca 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1377,9 +1377,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): if 'length_seconds' not in video_info: self._downloader.report_warning(u'unable to extract video duration') - video_duration = '' + video_duration = None else: - video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]) + video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])) # annotations video_annotations = None diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py index 689f19735..35ece354a 100644 --- a/youtube_dl/extractor/zdf.py +++ b/youtube_dl/extractor/zdf.py @@ -73,14 +73,14 @@ class ZDFIE(InfoExtractor): try: proto_pref = -PROTO_ORDER.index(format_m.group('proto')) except ValueError: - proto_pref = 999 + proto_pref = -999 quality = fnode.find('./quality').text QUALITY_ORDER = ['veryhigh', '300', 'high', 'med', 'low'] try: quality_pref = -QUALITY_ORDER.index(quality) except ValueError: - quality_pref = 999 + quality_pref = -999 abr = int(fnode.find('./audioBitrate').text) // 1000 vbr = int(fnode.find('./videoBitrate').text) // 1000 diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 0dab9fcc5..bd46a2da2 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1,6 +1,7 @@ #!/usr/bin/env python # -*- 
coding: utf-8 -*- +import ctypes import datetime import email.utils import errno @@ -1051,9 +1052,28 @@ def month_by_name(name): """ Return the number of a month by (locale-independently) English name """ ENGLISH_NAMES = [ - u'Januar', u'February', u'March', u'April', u'May', u'June', + u'January', u'February', u'March', u'April', u'May', u'June', u'July', u'August', u'September', u'October', u'November', u'December'] try: return ENGLISH_NAMES.index(name) + 1 except ValueError: return None + + +def fix_xml_all_ampersand(xml_str): + """Replace all the '&' by '&amp;' in XML""" + return xml_str.replace(u'&', u'&amp;') + + +def setproctitle(title): + try: + libc = ctypes.cdll.LoadLibrary("libc.so.6") + except OSError: + return + title = title + buf = ctypes.create_string_buffer(len(title) + 1) + buf.value = title + try: + libc.prctl(15, ctypes.byref(buf), 0, 0, 0) + except AttributeError: + return # Strange libc, just skip this diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 8906d6090..5bc7fd774 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.12.09.4' +__version__ = '2013.12.16' |