diff options
Diffstat (limited to 'youtube_dl/extractor')
25 files changed, 549 insertions, 265 deletions
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 5605e917b..4d6aeabdf 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -119,7 +119,10 @@ from .mit import TechTVMITIE, MITIE from .mixcloud import MixcloudIE from .mpora import MporaIE from .mofosex import MofosexIE -from .mtv import MTVIE +from .mtv import ( + MTVIE, + MTVIggyIE, +) from .muzu import MuzuTVIE from .myspace import MySpaceIE from .myspass import MySpassIE @@ -152,6 +155,7 @@ from .rottentomatoes import RottenTomatoesIE from .roxwel import RoxwelIE from .rtlnow import RTLnowIE from .rutube import RutubeIE +from .servingsys import ServingSysIE from .sina import SinaIE from .slashdot import SlashdotIE from .slideshare import SlideshareIE @@ -170,6 +174,7 @@ from .southparkstudios import ( from .space import SpaceIE from .spankwire import SpankwireIE from .spiegel import SpiegelIE +from .spike import SpikeIE from .stanfordoc import StanfordOpenClassroomIE from .statigram import StatigramIE from .steam import SteamIE diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 8ac38f4aa..e1c45d1f0 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -9,9 +9,11 @@ from .common import InfoExtractor from ..utils import ( compat_urllib_parse, find_xpath_attr, + fix_xml_ampersands, compat_urlparse, compat_str, compat_urllib_request, + compat_parse_qs, ExtractorError, unsmuggle_url, @@ -83,17 +85,33 @@ class BrightcoveIE(InfoExtractor): lambda m: m.group(1) + '/>', object_str) # Fix up some stupid XML, see https://github.com/rg3/youtube-dl/issues/1608 object_str = object_str.replace('<--', '<!--') + object_str = fix_xml_ampersands(object_str) object_doc = xml.etree.ElementTree.fromstring(object_str) - assert 'BrightcoveExperience' in object_doc.attrib['class'] - params = { - 'playerID': find_xpath_attr(object_doc, './param', 'name', 'playerID').attrib['value'], - } + + fv_el = find_xpath_attr(object_doc, './param', 'name', 'flashVars') + if fv_el is not None: + flashvars = dict( + (k, v[0]) + for k, v in compat_parse_qs(fv_el.attrib['value']).items()) + else: + flashvars = {} + def find_param(name): + if name in flashvars: + return flashvars[name] node = find_xpath_attr(object_doc, './param', 'name', name) if node is not None: return node.attrib['value'] return None + + params = {} + + playerID = find_param('playerID') + if playerID is None: + raise ExtractorError('Cannot find player ID') + params['playerID'] = playerID + playerKey = find_param('playerKey') # Not all pages define this value if playerKey is not None: @@ -114,8 +132,12 @@ class BrightcoveIE(InfoExtractor): if it can't be found """ m_brightcove = re.search( - r'<object[^>]+?class=([\'"])[^>]*?BrightcoveExperience.*?\1.+?</object>', - webpage, re.DOTALL) + r'''(?sx)<object + (?: + [^>]+?class=([\'"])[^>]*?BrightcoveExperience.*?\1 | + [^>]*?>\s*<param\s+name="movie"\s+value="https?://[^/]*brightcove\.com/ + ).+?</object>''', + webpage) if m_brightcove is not None: return cls._build_brighcove_url(m_brightcove.group()) else: @@ -156,6 +178,7 @@ class BrightcoveIE(InfoExtractor): info = self._search_regex(r'var experienceJSON = ({.*?});', webpage, 'json') info = json.loads(info)['data'] video_info = info['programmedContent']['videoPlayer']['mediaDTO'] + video_info['_youtubedl_adServerURL'] = info.get('adServerURL') return self._extract_video_info(video_info) @@ -193,6 +216,23 @@ class BrightcoveIE(InfoExtractor): info.update({ 'url': video_info['FLVFullLengthURL'], }) - else: + + if self._downloader.params.get('include_ads', False): + adServerURL = video_info.get('_youtubedl_adServerURL') + if adServerURL: + ad_info = { + '_type': 'url', + 'url': adServerURL, + } + if 'url' in info: + return { + '_type': 'playlist', + 'title': info['title'], + 'entries': [ad_info, info], + } + else: + return ad_info + + if 'url' not in info and not info.get('formats'): raise ExtractorError('Unable to extract video url for %s' % info['id']) return info diff --git a/youtube_dl/extractor/clipsyndicate.py b/youtube_dl/extractor/clipsyndicate.py index c60089ad3..9ab6a4ab6 100644 --- a/youtube_dl/extractor/clipsyndicate.py +++ b/youtube_dl/extractor/clipsyndicate.py @@ -3,7 +3,7 @@ import re from .common import InfoExtractor from ..utils import ( find_xpath_attr, - fix_xml_all_ampersand, + fix_xml_ampersands ) @@ -33,7 +33,7 @@ class ClipsyndicateIE(InfoExtractor): pdoc = self._download_xml( 'http://eplayer.clipsyndicate.com/osmf/playlist?%s' % flvars, video_id, u'Downloading video info', - transform_source=fix_xml_all_ampersand) + transform_source=fix_xml_ampersands) track_doc = pdoc.find('trackList/track') def find_param(name): diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py index 27bd8256e..3333d433b 100644 --- a/youtube_dl/extractor/comedycentral.py +++ b/youtube_dl/extractor/comedycentral.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals + import re from .common import InfoExtractor @@ -15,30 +17,22 @@ class ComedyCentralIE(MTVServicesInfoExtractor): _VALID_URL = r'''(?x)https?://(?:www.)?comedycentral.com/ (video-clips|episodes|cc-studios|video-collections) /(?P<title>.*)''' - _FEED_URL = u'http://comedycentral.com/feeds/mrss/' + _FEED_URL = 'http://comedycentral.com/feeds/mrss/' _TEST = { - u'url': u'http://www.comedycentral.com/video-clips/kllhuv/stand-up-greg-fitzsimmons--uncensored---too-good-of-a-mother', - u'md5': u'4167875aae411f903b751a21f357f1ee', - u'info_dict': { - u'id': u'cef0cbb3-e776-4bc9-b62e-8016deccb354', - u'ext': u'mp4', - u'title': u'Uncensored - Greg Fitzsimmons - Too Good of a Mother', - u'description': u'After a certain point, breastfeeding becomes c**kblocking.', + 'url': 'http://www.comedycentral.com/video-clips/kllhuv/stand-up-greg-fitzsimmons--uncensored---too-good-of-a-mother', + 'md5': '4167875aae411f903b751a21f357f1ee', + 'info_dict': { + 'id': 'cef0cbb3-e776-4bc9-b62e-8016deccb354', + 'ext': 'mp4', + 'title': 'CC:Stand-Up|Greg Fitzsimmons: Life on Stage|Uncensored - Too Good of a Mother', + 'description': 'After a certain point, breastfeeding becomes c**kblocking.', }, } - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - title = mobj.group('title') - webpage = self._download_webpage(url, title) - mgid = self._search_regex(r'data-mgid="(?P<mgid>mgid:.*?)"', - webpage, u'mgid') - return self._get_videos_info(mgid) - class ComedyCentralShowsIE(InfoExtractor): - IE_DESC = u'The Daily Show / Colbert Report' + IE_DESC = 'The Daily Show / Colbert Report' # urls can be abbreviations like :thedailyshow or :colbert # urls for episodes like: # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day @@ -55,14 +49,14 @@ class ComedyCentralShowsIE(InfoExtractor): extended-interviews/(?P<interID>[0-9]+)/playlist_tds_extended_(?P<interview_title>.*?)/.*?))) $""" _TEST = { - u'url': u'http://www.thedailyshow.com/watch/thu-december-13-2012/kristen-stewart', - u'file': u'422212.mp4', - u'md5': u'4e2f5cb088a83cd8cdb7756132f9739d', - u'info_dict': { - u"upload_date": u"20121214", - u"description": u"Kristen Stewart", - u"uploader": u"thedailyshow", - u"title": u"thedailyshow-kristen-stewart part 1" + 'url': 'http://www.thedailyshow.com/watch/thu-december-13-2012/kristen-stewart', + 'file': '422212.mp4', + 'md5': '4e2f5cb088a83cd8cdb7756132f9739d', + 'info_dict': { + "upload_date": "20121214", + "description": "Kristen Stewart", + "uploader": "thedailyshow", + "title": "thedailyshow-kristen-stewart part 1" } } @@ -94,20 +88,20 @@ class ComedyCentralShowsIE(InfoExtractor): def _transform_rtmp_url(rtmp_video_url): m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url) if not m: - raise ExtractorError(u'Cannot transform RTMP url') + raise ExtractorError('Cannot transform RTMP url') base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/' return base + m.group('finalid') def _real_extract(self, url): mobj = re.match(self._VALID_URL, url, re.VERBOSE) if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) + raise ExtractorError('Invalid URL: %s' % url) if mobj.group('shortname'): if mobj.group('shortname') in ('tds', 'thedailyshow'): - url = u'http://www.thedailyshow.com/full-episodes/' + url = 'http://www.thedailyshow.com/full-episodes/' else: - url = u'http://www.colbertnation.com/full-episodes/' + url = 'http://www.colbertnation.com/full-episodes/' mobj = re.match(self._VALID_URL, url, re.VERBOSE) assert mobj is not None @@ -133,9 +127,9 @@ class ComedyCentralShowsIE(InfoExtractor): url = htmlHandle.geturl() mobj = re.match(self._VALID_URL, url, re.VERBOSE) if mobj is None: - raise ExtractorError(u'Invalid redirected URL: ' + url) + raise ExtractorError('Invalid redirected URL: ' + url) if mobj.group('episode') == '': - raise ExtractorError(u'Redirected URL is still not specific: ' + url) + raise ExtractorError('Redirected URL is still not specific: ' + url) epTitle = mobj.group('episode') mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage) @@ -147,15 +141,15 @@ class ComedyCentralShowsIE(InfoExtractor): altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage) if len(altMovieParams) == 0: - raise ExtractorError(u'unable to find Flash URL in webpage ' + url) + raise ExtractorError('unable to find Flash URL in webpage ' + url) else: mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])] uri = mMovieParams[0][1] indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri}) idoc = self._download_xml(indexUrl, epTitle, - u'Downloading show index', - u'unable to download episode index') + 'Downloading show index', + 'unable to download episode index') results = [] @@ -170,7 +164,7 @@ class ComedyCentralShowsIE(InfoExtractor): configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' + compat_urllib_parse.urlencode({'uri': mediaId})) cdoc = self._download_xml(configUrl, epTitle, - u'Downloading configuration for %s' % shortMediaId) + 'Downloading configuration for %s' % shortMediaId) turls = [] for rendition in cdoc.findall('.//rendition'): @@ -178,7 +172,7 @@ class ComedyCentralShowsIE(InfoExtractor): turls.append(finfo) if len(turls) == 0: - self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found') + self._downloader.report_error('unable to download ' + mediaId + ': No videos found') continue formats = [] @@ -192,7 +186,7 @@ class ComedyCentralShowsIE(InfoExtractor): 'width': w, }) - effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1) + effTitle = showId + '-' + epTitle + ' part ' + compat_str(partNum+1) results.append({ 'id': shortMediaId, 'formats': formats, diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 692d828da..02a82dc57 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -63,6 +63,7 @@ class InfoExtractor(object): * tbr Average bitrate of audio and video in KBit/s * abr Average audio bitrate in KBit/s * acodec Name of the audio codec in use + * asr Audio sampling rate in Hertz * vbr Average video bitrate in KBit/s * vcodec Name of the video codec in use * filesize The number of bytes, if known in advance @@ -220,6 +221,8 @@ class InfoExtractor(object): webpage_bytes[:1024]) if m: encoding = m.group(1).decode('ascii') + elif webpage_bytes.startswith(b'\xff\xfe'): + encoding = 'utf-16' else: encoding = 'utf-8' if self._downloader.params.get('dump_intermediate_pages', False): @@ -236,7 +239,7 @@ class InfoExtractor(object): except AttributeError: url = url_or_request if len(url) > 200: - h = hashlib.md5(url).hexdigest() + h = u'___' + hashlib.md5(url).hexdigest() url = url[:200 - len(h)] + h raw_filename = ('%s_%s.dump' % (video_id, url)) filename = sanitize_filename(raw_filename, restricted=True) diff --git a/youtube_dl/extractor/cspan.py b/youtube_dl/extractor/cspan.py index a2cbd4d8d..e54009622 100644 --- a/youtube_dl/extractor/cspan.py +++ b/youtube_dl/extractor/cspan.py @@ -10,7 +10,7 @@ from ..utils import ( class CSpanIE(InfoExtractor): - _VALID_URL = r'http://www\.c-spanvideo\.org/program/(.*)' + _VALID_URL = r'http://(?:www\.)?c-spanvideo\.org/program/(?P<name>.*)' IE_DESC = 'C-SPAN' _TEST = { 'url': 'http://www.c-spanvideo.org/program/HolderonV', @@ -20,13 +20,14 @@ class CSpanIE(InfoExtractor): 'title': 'Attorney General Eric Holder on Voting Rights Act Decision', 'description': 'Attorney General Eric Holder spoke to reporters following the Supreme Court decision in [Shelby County v. Holder] in which the court ruled that the preclearance provisions of the Voting Rights Act could not be enforced until Congress established new guidelines for review.', }, + 'skip': 'Regularly fails on travis, for unknown reasons', } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - prog_name = mobj.group(1) + prog_name = mobj.group('name') webpage = self._download_webpage(url, prog_name) - video_id = self._search_regex(r'programid=(.*?)&', webpage, 'video id') + video_id = self._search_regex(r'prog(?:ram)?id=(.*?)&', webpage, 'video id') title = self._html_search_regex( r'<!-- title -->\n\s*<h1[^>]*>(.*?)</h1>', webpage, 'title') diff --git a/youtube_dl/extractor/d8.py b/youtube_dl/extractor/d8.py index a56842b16..6b26ff2e3 100644 --- a/youtube_dl/extractor/d8.py +++ b/youtube_dl/extractor/d8.py @@ -1,22 +1,25 @@ # encoding: utf-8 +from __future__ import unicode_literals + from .canalplus import CanalplusIE class D8IE(CanalplusIE): _VALID_URL = r'https?://www\.d8\.tv/.*?/(?P<path>.*)' _VIDEO_INFO_TEMPLATE = 'http://service.canal-plus.com/video/rest/getVideosLiees/d8/%s' - IE_NAME = u'd8.tv' + IE_NAME = 'd8.tv' _TEST = { - u'url': u'http://www.d8.tv/d8-docs-mags/pid6589-d8-campagne-intime.html', - u'file': u'966289.flv', - u'info_dict': { - u'title': u'Campagne intime - Documentaire exceptionnel', - u'description': u'md5:d2643b799fb190846ae09c61e59a859f', - u'upload_date': u'20131108', + 'url': 'http://www.d8.tv/d8-docs-mags/pid6589-d8-campagne-intime.html', + 'file': '966289.flv', + 'info_dict': { + 'title': 'Campagne intime - Documentaire exceptionnel', + 'description': 'md5:d2643b799fb190846ae09c61e59a859f', + 'upload_date': '20131108', }, - u'params': { + 'params': { # rtmp - u'skip_download': True, + 'skip_download': True, }, + 'skip': 'videos get deleted after a while', } diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 4556079c8..8f9154c0e 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -17,7 +17,12 @@ from ..utils import ( class FacebookIE(InfoExtractor): """Information Extractor for Facebook""" - _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:[^#?]*#!/)?(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)' + _VALID_URL = r'''(?x) + (?:https?://)?(?:\w+\.)?facebook\.com/ + (?:[^#?]*\#!/)? + (?:video/video\.php|photo\.php|video/embed)\?(?:.*?) + (?:v|video_id)=(?P<id>[0-9]+) + (?:.*)''' _LOGIN_URL = 'https://www.facebook.com/login.php?next=http%3A%2F%2Ffacebook.com%2Fhome.php&login_attempt=1' _CHECKPOINT_URL = 'https://www.facebook.com/checkpoint/?next=http%3A%2F%2Ffacebook.com%2Fhome.php&_fb_noscript=1' _NETRC_MACHINE = 'facebook' @@ -90,7 +95,7 @@ class FacebookIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) if mobj is None: raise ExtractorError(u'Invalid URL: %s' % url) - video_id = mobj.group('ID') + video_id = mobj.group('id') url = 'https://www.facebook.com/video/video.php?v=%s' % video_id webpage = self._download_webpage(url, video_id) diff --git a/youtube_dl/extractor/gametrailers.py b/youtube_dl/extractor/gametrailers.py index d82a5d4b2..66b3b50d4 100644 --- a/youtube_dl/extractor/gametrailers.py +++ b/youtube_dl/extractor/gametrailers.py @@ -1,4 +1,4 @@ -import re +from __future__ import unicode_literals from .mtv import MTVServicesInfoExtractor @@ -6,22 +6,13 @@ from .mtv import MTVServicesInfoExtractor class GametrailersIE(MTVServicesInfoExtractor): _VALID_URL = r'http://www\.gametrailers\.com/(?P<type>videos|reviews|full-episodes)/(?P<id>.*?)/(?P<title>.*)' _TEST = { - u'url': u'http://www.gametrailers.com/videos/zbvr8i/mirror-s-edge-2-e3-2013--debut-trailer', - u'file': u'70e9a5d7-cf25-4a10-9104-6f3e7342ae0d.mp4', - u'md5': u'4c8e67681a0ea7ec241e8c09b3ea8cf7', - u'info_dict': { - u'title': u'E3 2013: Debut Trailer', - u'description': u'Faith is back! Check out the World Premiere trailer for Mirror\'s Edge 2 straight from the EA Press Conference at E3 2013!', + 'url': 'http://www.gametrailers.com/videos/zbvr8i/mirror-s-edge-2-e3-2013--debut-trailer', + 'file': '70e9a5d7-cf25-4a10-9104-6f3e7342ae0d.mp4', + 'md5': '4c8e67681a0ea7ec241e8c09b3ea8cf7', + 'info_dict': { + 'title': 'Mirror\'s Edge 2|E3 2013: Debut Trailer', + 'description': 'Faith is back! Check out the World Premiere trailer for Mirror\'s Edge 2 straight from the EA Press Conference at E3 2013!', }, } _FEED_URL = 'http://www.gametrailers.com/feeds/mrss' - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - webpage = self._download_webpage(url, video_id) - mgid = self._search_regex([r'data-video="(?P<mgid>mgid:.*?)"', - r'data-contentId=\'(?P<mgid>mgid:.*?)\''], - webpage, u'mgid') - return self._get_videos_info(mgid) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 839530982..e1933837d 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -92,11 +92,12 @@ class GenericIE(InfoExtractor): # ooyala video { 'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219', + 'file': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ.mp4', 'md5': '5644c6ca5d5782c1d0d350dad9bd840c', 'info_dict': { 'id': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ', 'ext': 'mp4', - 'title': '2cc213299525360.mov', #that's what we get + 'title': '2cc213299525360.mov', # that's what we get }, }, ] @@ -161,8 +162,19 @@ class GenericIE(InfoExtractor): def _real_extract(self, url): parsed_url = compat_urlparse.urlparse(url) if not parsed_url.scheme: - self._downloader.report_warning('The url doesn\'t specify the protocol, trying with http') - return self.url_result('http://' + url) + default_search = self._downloader.params.get('default_search') + if default_search is None: + default_search = 'auto' + + if default_search == 'auto': + if '/' in url: + self._downloader.report_warning('The url doesn\'t specify the protocol, trying with http') + return self.url_result('http://' + url) + else: + return self.url_result('ytsearch:' + url) + else: + assert ':' in default_search + return self.url_result(default_search + url) video_id = os.path.splitext(url.split('/')[-1])[0] self.to_screen('%s: Requesting header' % video_id) @@ -318,6 +330,12 @@ class GenericIE(InfoExtractor): if mobj is not None: return self.url_result(mobj.group('url'), 'Novamov') + # Look for embedded Facebook player + mobj = re.search( + r'<iframe[^>]+?src=(["\'])(?P<url>https://www.facebook.com/video/embed.+?)\1', webpage) + if mobj is not None: + return self.url_result(mobj.group('url'), 'Facebook') + # Start with something easy: JW Player in SWFObject mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage) if mobj is None: diff --git a/youtube_dl/extractor/hotnewhiphop.py b/youtube_dl/extractor/hotnewhiphop.py index 0ee74fb38..a106f81d2 100644 --- a/youtube_dl/extractor/hotnewhiphop.py +++ b/youtube_dl/extractor/hotnewhiphop.py @@ -1,17 +1,25 @@ +from __future__ import unicode_literals + import re import base64 from .common import InfoExtractor +from ..utils import ( + compat_urllib_parse, + compat_urllib_request, + ExtractorError, + HEADRequest, +) class HotNewHipHopIE(InfoExtractor): _VALID_URL = r'http://www\.hotnewhiphop.com/.*\.(?P<id>.*)\.html' _TEST = { - u'url': u"http://www.hotnewhiphop.com/freddie-gibbs-lay-it-down-song.1435540.html", - u'file': u'1435540.mp3', - u'md5': u'2c2cd2f76ef11a9b3b581e8b232f3d96', - u'info_dict': { - u"title": u'Freddie Gibbs "Lay It Down"' + 'url': 'http://www.hotnewhiphop.com/freddie-gibbs-lay-it-down-song.1435540.html', + 'file': '1435540.mp3', + 'md5': '2c2cd2f76ef11a9b3b581e8b232f3d96', + 'info_dict': { + 'title': 'Freddie Gibbs - Lay It Down' } } @@ -21,24 +29,41 @@ class HotNewHipHopIE(InfoExtractor): webpage_src = self._download_webpage(url, video_id) - video_url_base64 = self._search_regex(r'data-path="(.*?)"', - webpage_src, u'video URL', fatal=False) + video_url_base64 = self._search_regex( + r'data-path="(.*?)"', webpage_src, u'video URL', fatal=False) - if video_url_base64 == None: - video_url = self._search_regex(r'"contentUrl" content="(.*?)"', webpage_src, - u'video URL') + if video_url_base64 is None: + video_url = self._search_regex( + r'"contentUrl" content="(.*?)"', webpage_src, u'video URL') return self.url_result(video_url, ie='Youtube') - video_url = base64.b64decode(video_url_base64).decode('utf-8') + reqdata = compat_urllib_parse.urlencode([ + ('mediaType', 's'), + ('mediaId', video_id), + ]) + r = compat_urllib_request.Request( + 'http://www.hotnewhiphop.com/ajax/media/getActions/', data=reqdata) + r.add_header('Content-Type', 'application/x-www-form-urlencoded') + mkd = self._download_json( + r, video_id, note='Requesting media key', + errnote='Could not download media key') + if 'mediaKey' not in mkd: + raise ExtractorError('Did not get a media key') + + redirect_url = base64.b64decode(video_url_base64).decode('utf-8') + redirect_req = HEADRequest(redirect_url) + req = self._request_webpage( + redirect_req, video_id, + note='Resolving final URL', errnote='Could not resolve final URL') + video_url = req.geturl() + if video_url.endswith('.html'): + raise ExtractorError('Redirect failed') - video_title = self._html_search_regex(r"<title>(.*)</title>", - webpage_src, u'title') + video_title = self._og_search_title(webpage_src).strip() - results = [{ - 'id': video_id, - 'url' : video_url, - 'title' : video_title, - 'thumbnail' : self._og_search_thumbnail(webpage_src), - 'ext' : 'mp3', - }] - return results + return { + 'id': video_id, + 'url': video_url, + 'title': video_title, + 'thumbnail': self._og_search_thumbnail(webpage_src), + } diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py index f40769eac..1763af020 100644 --- a/youtube_dl/extractor/imdb.py +++ b/youtube_dl/extractor/imdb.py @@ -67,23 +67,16 @@ class ImdbListIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) list_id = mobj.group('id') - - # RSS XML is sometimes malformed - rss = self._download_webpage('http://rss.imdb.com/list/%s' % list_id, list_id, 'Downloading list RSS') - list_title = self._html_search_regex(r'<title>(.*?)</title>', rss, 'list title') - - # Export is independent of actual author_id, but returns 404 if no author_id is provided. - # However, passing dummy author_id seems to be enough. - csv = self._download_webpage('http://www.imdb.com/list/export?list_id=%s&author_id=ur00000000' % list_id, - list_id, 'Downloading list CSV') - - entries = [] - for item in csv.split('\n')[1:]: - cols = item.split(',') - if len(cols) < 2: - continue - item_id = cols[1][1:-1] - if item_id.startswith('vi'): - entries.append(self.url_result('http://www.imdb.com/video/imdb/%s' % item_id, 'Imdb')) - + + webpage = self._download_webpage(url, list_id) + list_code = self._search_regex( + r'(?s)<div\s+class="list\sdetail">(.*?)class="see-more"', + webpage, 'list code') + entries = [ + self.url_result('http://www.imdb.com' + m, 'Imdb') + for m in re.findall(r'href="(/video/imdb/vi[^"]+)"', webpage)] + + list_title = self._html_search_regex( + r'<h1 class="header">(.*?)</h1>', webpage, 'list title') + return self.playlist_result(entries, list_id, list_title) diff --git a/youtube_dl/extractor/metacritic.py b/youtube_dl/extractor/metacritic.py index f3ff0e8bb..465ac4916 100644 --- a/youtube_dl/extractor/metacritic.py +++ b/youtube_dl/extractor/metacritic.py @@ -4,7 +4,7 @@ import re from .common import InfoExtractor from ..utils import ( - fix_xml_all_ampersand, + fix_xml_ampersands, ) @@ -27,7 +27,7 @@ class MetacriticIE(InfoExtractor): webpage = self._download_webpage(url, video_id) # The xml is not well formatted, there are raw '&' info = self._download_xml('http://www.metacritic.com/video_data?video=' + video_id, - video_id, 'Downloading info xml', transform_source=fix_xml_all_ampersand) + video_id, 'Downloading info xml', transform_source=fix_xml_ampersands) clip = next(c for c in info.findall('playList/clip') if c.find('id').text == video_id) formats = [] diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index f1cf41e2d..f6f31bfdc 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -1,12 +1,18 @@ +from __future__ import unicode_literals + import re -import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( compat_urllib_parse, ExtractorError, + find_xpath_attr, + fix_xml_ampersands, + url_basename, + RegexNotFoundError, ) + def _media_xml_tag(tag): return '{http://search.yahoo.com/mrss/}%s' % tag @@ -33,10 +39,9 @@ class MTVServicesInfoExtractor(InfoExtractor): else: return thumb_node.attrib['url'] - def _extract_video_formats(self, metadataXml): - if '/error_country_block.swf' in metadataXml: - raise ExtractorError(u'This video is not available from your country.', expected=True) - mdoc = xml.etree.ElementTree.fromstring(metadataXml.encode('utf-8')) + def _extract_video_formats(self, mdoc): + if re.match(r'.*/error_country_block\.swf$', mdoc.find('.//src').text) is not None: + raise ExtractorError('This video is not available from your country.', expected=True) formats = [] for rendition in mdoc.findall('.//rendition'): @@ -59,11 +64,12 @@ class MTVServicesInfoExtractor(InfoExtractor): self.report_extraction(video_id) mediagen_url = itemdoc.find('%s/%s' % (_media_xml_tag('group'), _media_xml_tag('content'))).attrib['url'] # Remove the templates, like &device={device} - mediagen_url = re.sub(r'&[^=]*?={.*?}(?=(&|$))', u'', mediagen_url) + mediagen_url = re.sub(r'&[^=]*?={.*?}(?=(&|$))', '', mediagen_url) if 'acceptMethods' not in mediagen_url: mediagen_url += '&acceptMethods=fms' - mediagen_page = self._download_webpage(mediagen_url, video_id, - u'Downloading video urls') + + mediagen_doc = self._download_xml(mediagen_url, video_id, + 'Downloading video urls') description_node = itemdoc.find('description') if description_node is not None: @@ -71,9 +77,23 @@ class MTVServicesInfoExtractor(InfoExtractor): else: description = None + title_el = None + if title_el is None: + title_el = find_xpath_attr( + itemdoc, './/{http://search.yahoo.com/mrss/}category', + 'scheme', 'urn:mtvn:video_title') + if title_el is None: + title_el = itemdoc.find('.//{http://search.yahoo.com/mrss/}title') + if title_el is None: + title_el = itemdoc.find('.//title') + title = title_el.text + if title is None: + raise ExtractorError('Could not find video title') + title = title.strip() + return { - 'title': itemdoc.find('title').text, - 'formats': self._extract_video_formats(mediagen_page), + 'title': title, + 'formats': self._extract_video_formats(mediagen_doc), 'id': video_id, 'thumbnail': self._get_thumbnail_url(uri, itemdoc), 'description': description, @@ -83,14 +103,25 @@ class MTVServicesInfoExtractor(InfoExtractor): video_id = self._id_from_uri(uri) data = compat_urllib_parse.urlencode({'uri': uri}) - def fix_ampersand(s): - """ Fix unencoded ampersand in XML """ - return s.replace(u'& ', '& ') idoc = self._download_xml( self._FEED_URL + '?' + data, video_id, - u'Downloading info', transform_source=fix_ampersand) + 'Downloading info', transform_source=fix_xml_ampersands) return [self._get_video_info(item) for item in idoc.findall('.//item')] + def _real_extract(self, url): + title = url_basename(url) + webpage = self._download_webpage(url, title) + try: + # the url can be http://media.mtvnservices.com/fb/{mgid}.swf + # or http://media.mtvnservices.com/{mgid} + og_url = self._og_search_video_url(webpage) + mgid = url_basename(og_url) + if mgid.endswith('.swf'): + mgid = mgid[:-4] + except RegexNotFoundError: + mgid = self._search_regex(r'data-mgid="(.*?)"', webpage, u'mgid') + return self._get_videos_info(mgid) + class MTVIE(MTVServicesInfoExtractor): _VALID_URL = r'''(?x)^https?:// @@ -101,25 +132,25 @@ class MTVIE(MTVServicesInfoExtractor): _TESTS = [ { - u'url': u'http://www.mtv.com/videos/misc/853555/ours-vh1-storytellers.jhtml', - u'file': u'853555.mp4', - u'md5': u'850f3f143316b1e71fa56a4edfd6e0f8', - u'info_dict': { - u'title': u'Taylor Swift - "Ours (VH1 Storytellers)"', - u'description': u'Album: Taylor Swift performs "Ours" for VH1 Storytellers at Harvey Mudd College.', + 'url': 'http://www.mtv.com/videos/misc/853555/ours-vh1-storytellers.jhtml', + 'file': '853555.mp4', + 'md5': '850f3f143316b1e71fa56a4edfd6e0f8', + 'info_dict': { + 'title': 'Taylor Swift - "Ours (VH1 Storytellers)"', + 'description': 'Album: Taylor Swift performs "Ours" for VH1 Storytellers at Harvey Mudd College.', }, }, { - u'add_ie': ['Vevo'], - u'url': u'http://www.mtv.com/videos/taylor-swift/916187/everything-has-changed-ft-ed-sheeran.jhtml', - u'file': u'USCJY1331283.mp4', - u'md5': u'73b4e7fcadd88929292fe52c3ced8caf', - u'info_dict': { - u'title': u'Everything Has Changed', - u'upload_date': u'20130606', - u'uploader': u'Taylor Swift', + 'add_ie': ['Vevo'], + 'url': 'http://www.mtv.com/videos/taylor-swift/916187/everything-has-changed-ft-ed-sheeran.jhtml', + 'file': 'USCJY1331283.mp4', + 'md5': '73b4e7fcadd88929292fe52c3ced8caf', + 'info_dict': { + 'title': 'Everything Has Changed', + 'upload_date': '20130606', + 'uploader': 'Taylor Swift', }, - u'skip': u'VEVO is only available in some countries', + 'skip': 'VEVO is only available in some countries', }, ] @@ -138,8 +169,22 @@ class MTVIE(MTVServicesInfoExtractor): webpage, re.DOTALL) if m_vevo: vevo_id = m_vevo.group(1); - self.to_screen(u'Vevo video detected: %s' % vevo_id) + self.to_screen('Vevo video detected: %s' % vevo_id) return self.url_result('vevo:%s' % vevo_id, ie='Vevo') - uri = self._html_search_regex(r'/uri/(.*?)\?', webpage, u'uri') + uri = self._html_search_regex(r'/uri/(.*?)\?', webpage, 'uri') return self._get_videos_info(uri) + + +class MTVIggyIE(MTVServicesInfoExtractor): + IE_NAME = 'mtviggy.com' + _VALID_URL = r'https?://www\.mtviggy\.com/videos/.+' + _TEST = { + 'url': 'http://www.mtviggy.com/videos/arcade-fire-behind-the-scenes-at-the-biggest-music-experiment-yet/', + 'info_dict': { + 'id': '984696', + 'ext': 'mp4', + 'title': 'Arcade Fire: Behind the Scenes at the Biggest Music Experiment Yet', + } + } + _FEED_URL = 'http://all.mtvworldverticals.com/feed-xml/' diff --git a/youtube_dl/extractor/novamov.py b/youtube_dl/extractor/novamov.py index 48ee00da3..6af8d934c 100644 --- a/youtube_dl/extractor/novamov.py +++ b/youtube_dl/extractor/novamov.py @@ -19,7 +19,8 @@ class NovamovIE(InfoExtractor): 'info_dict': { 'title': 'search engine optimization', 'description': 'search engine optimization is used to rank the web page in the google search engine' - } + }, + 'skip': '"Invalid token" errors abound (in web interface as well as youtube-dl, there is nothing we can do about it.)' } def _real_extract(self, url): diff --git a/youtube_dl/extractor/redtube.py b/youtube_dl/extractor/redtube.py index 5c4cd2068..4295cf93a 100644 --- a/youtube_dl/extractor/redtube.py +++ b/youtube_dl/extractor/redtube.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals + import re from .common import InfoExtractor @@ -6,14 +8,14 @@ from .common import InfoExtractor class RedTubeIE(InfoExtractor): _VALID_URL = r'http://(?:www\.)?redtube\.com/(?P<id>[0-9]+)' _TEST = { - u'url': u'http://www.redtube.com/66418', - u'file': u'66418.mp4', + 'url': 'http://www.redtube.com/66418', + 'file': '66418.mp4', # md5 varies from time to time, as in # https://travis-ci.org/rg3/youtube-dl/jobs/14052463#L295 - #u'md5': u'7b8c22b5e7098a3e1c09709df1126d2d', - u'info_dict': { - u"title": u"Sucked on a toilet", - u"age_limit": 18, + #'md5': u'7b8c22b5e7098a3e1c09709df1126d2d', + 'info_dict': { + "title": "Sucked on a toilet", + "age_limit": 18, } } @@ -33,14 +35,19 @@ class RedTubeIE(InfoExtractor): r'<h1 class="videoTitle[^"]*">(.+?)</h1>', webpage, u'title') + video_thumbnail = self._html_search_regex( + r'playerInnerHTML.+?<img\s+src="(.+?)"', + webpage, u'thumbnail', fatal=False) + # No self-labeling, but they describe themselves as # "Home of Videos Porno" age_limit = 18 return { - 'id': video_id, - 'url': video_url, - 'ext': video_extension, - 'title': video_title, + 'id': video_id, + 'url': video_url, + 'ext': video_extension, + 'title': video_title, + 'thumbnail': video_thumbnail, 'age_limit': age_limit, } diff --git a/youtube_dl/extractor/ringtv.py b/youtube_dl/extractor/ringtv.py index 1b08c3167..9fbdb9fcb 100644 --- a/youtube_dl/extractor/ringtv.py +++ b/youtube_dl/extractor/ringtv.py @@ -1,37 +1,44 @@ +from __future__ import unicode_literals + import re from .common import InfoExtractor class RingTVIE(InfoExtractor): - _VALID_URL = r'(?:http://)?(?:www\.)?ringtv\.craveonline\.com/videos/video/([^/]+)' + _VALID_URL = r'(?:http://)?(?:www\.)?ringtv\.craveonline\.com/(?P<type>news|videos/video)/(?P<id>[^/?#]+)' _TEST = { - u"url": u"http://ringtv.craveonline.com/videos/video/746619-canelo-alvarez-talks-about-mayweather-showdown", - u"file": u"746619.mp4", - u"md5": u"7c46b4057d22de32e0a539f017e64ad3", - u"info_dict": { - u"title": u"Canelo Alvarez talks about Mayweather showdown", - u"description": u"Saul \\\"Canelo\\\" Alvarez spoke to the media about his Sept. 14 showdown with Floyd Mayweather after their kick-off presser in NYC. Canelo is motivated and confident that he will have the speed and gameplan to beat the pound-for-pound king." + "url": "http://ringtv.craveonline.com/news/310833-luis-collazo-says-victor-ortiz-better-not-quit-on-jan-30", + "file": "857645.mp4", + "md5": "d25945f5df41cdca2d2587165ac28720", + "info_dict": { + "title": 'Video: Luis Collazo says Victor Ortiz "better not quit on Jan. 30" - Ring TV', + "description": 'Luis Collazo is excited about his Jan. 30 showdown with fellow former welterweight titleholder Victor Ortiz at Barclays Center in his hometown of Brooklyn. The SuperBowl week fight headlines a Golden Boy Live! card on Fox Sports 1.', } } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - video_id = mobj.group(1).split('-')[0] + video_id = mobj.group('id').split('-')[0] webpage = self._download_webpage(url, video_id) - title = self._search_regex(r'<title>(.+?)</title>', - webpage, 'video title').replace(' | RingTV','') - description = self._search_regex(r'<div class="blurb">(.+?)</div>', - webpage, 'Description') - final_url = "http://ringtv.craveonline.springboardplatform.com/storage/ringtv.craveonline.com/conversion/%s.mp4" %(str(video_id)) - thumbnail_url = "http://ringtv.craveonline.springboardplatform.com/storage/ringtv.craveonline.com/snapshots/%s.jpg" %(str(video_id)) - ext = final_url.split('.')[-1] - return [{ - 'id' : video_id, - 'url' : final_url, - 'ext' : ext, - 'title' : title, - 'thumbnail' : thumbnail_url, - 'description' : description, - }] + + if mobj.group('type') == 'news': + video_id = self._search_regex( + r'''(?x)<iframe[^>]+src="http://cms\.springboardplatform\.com/ + embed_iframe/[0-9]+/video/([0-9]+)/''', + webpage, 'real video ID') + title = self._og_search_title(webpage) + description = self._html_search_regex( + r'addthis:description="([^"]+)"', + webpage, 'description', fatal=False) + final_url = "http://ringtv.craveonline.springboardplatform.com/storage/ringtv.craveonline.com/conversion/%s.mp4" % video_id + thumbnail_url = "http://ringtv.craveonline.springboardplatform.com/storage/ringtv.craveonline.com/snapshots/%s.jpg" % video_id + + return { + 'id': video_id, + 'url': final_url, + 'title': title, + 'thumbnail': thumbnail_url, + 'description': description, + } diff --git a/youtube_dl/extractor/servingsys.py b/youtube_dl/extractor/servingsys.py new file mode 100644 index 000000000..1dc551d5c --- /dev/null +++ b/youtube_dl/extractor/servingsys.py @@ -0,0 +1,71 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + int_or_none, +) + + +class ServingSysIE(InfoExtractor): + _VALID_URL = r'https?://(?:[^.]+\.)?serving-sys\.com/BurstingPipe/adServer\.bs\?.*?&pli=(?P<id>[0-9]+)' + + _TEST = { + 'url': 'http://bs.serving-sys.com/BurstingPipe/adServer.bs?cn=is&c=23&pl=VAST&pli=5349193&PluID=0&pos=7135&ord=[timestamp]&cim=1?', + 'playlist': [{ + 'file': '29955898.flv', + 'md5': 'baed851342df6846eb8677a60a011a0f', + 'info_dict': { + 'title': 'AdAPPter_Hyundai_demo (1)', + 'duration': 74, + 'tbr': 1378, + 'width': 640, + 'height': 400, + }, + }, { + 'file': '29907998.flv', + 'md5': '979b4da2655c4bc2d81aeb915a8c5014', + 'info_dict': { + 'title': 'AdAPPter_Hyundai_demo (2)', + 'duration': 34, + 'width': 854, + 'height': 480, + 'tbr': 516, + }, + }], + 'params': { + 'playlistend': 2, + }, + 'skip': 'Blocked in the US [sic]', + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + pl_id = mobj.group('id') + + vast_doc = self._download_xml(url, pl_id) + title = vast_doc.find('.//AdTitle').text + media = vast_doc.find('.//MediaFile').text + info_url = self._search_regex(r'&adData=([^&]+)&', media, 'info URL') + + doc = self._download_xml(info_url, pl_id, 'Downloading video info') + entries = [{ + '_type': 'video', + 'id': a.attrib['id'], + 'title': '%s (%s)' % (title, a.attrib['assetID']), + 'url': a.attrib['URL'], + 'duration': int_or_none(a.attrib.get('length')), + 'tbr': int_or_none(a.attrib.get('bitrate')), + 'height': int_or_none(a.attrib.get('height')), + 'width': int_or_none(a.attrib.get('width')), + } for a in doc.findall('.//AdditionalAssets/asset')] + + return { + '_type': 'playlist', + 'id': pl_id, + 'title': title, + 'entries': entries, + } + +
\ No newline at end of file diff --git a/youtube_dl/extractor/southparkstudios.py b/youtube_dl/extractor/southparkstudios.py index fd90cc5dd..9f8d3a5fa 100644 --- a/youtube_dl/extractor/southparkstudios.py +++ b/youtube_dl/extractor/southparkstudios.py @@ -18,14 +18,6 @@ class SouthParkStudiosIE(MTVServicesInfoExtractor): }, }] - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - url = u'http://www.' + mobj.group(u'url') - video_id = mobj.group('id') - webpage = self._download_webpage(url, video_id) - mgid = self._search_regex(r'swfobject.embedSWF\(".*?(mgid:.*?)"', - webpage, u'mgid') - return self._get_videos_info(mgid) class SouthparkDeIE(SouthParkStudiosIE): IE_NAME = u'southpark.de' diff --git a/youtube_dl/extractor/space.py b/youtube_dl/extractor/space.py index 11455e0fa..4a3e52ad8 100644 --- a/youtube_dl/extractor/space.py +++ b/youtube_dl/extractor/space.py @@ -6,7 +6,7 @@ from ..utils import RegexNotFoundError, ExtractorError class SpaceIE(InfoExtractor): - _VALID_URL = r'https?://www\.space\.com/\d+-(?P<title>[^/\.\?]*?)-video\.html' + _VALID_URL = r'https?://(?:(?:www|m)\.)?space\.com/\d+-(?P<title>[^/\.\?]*?)-video\.html' _TEST = { u'add_ie': ['Brightcove'], u'url': u'http://www.space.com/23373-huge-martian-landforms-detail-revealed-by-european-probe-video.html', diff --git a/youtube_dl/extractor/spike.py b/youtube_dl/extractor/spike.py new file mode 100644 index 000000000..56682ac45 --- /dev/null +++ b/youtube_dl/extractor/spike.py @@ -0,0 +1,19 @@ +from __future__ import unicode_literals + +from .mtv import MTVServicesInfoExtractor + + +class SpikeIE(MTVServicesInfoExtractor): + _VALID_URL = r'https?://www\.spike\.com/(video-clips|episodes)/.+' + _TEST = { + 'url': 'http://www.spike.com/video-clips/lhtu8m/auction-hunters-can-allen-ride-a-hundred-year-old-motorcycle', + 'md5': '1a9265f32b0c375793d6c4ce45255256', + 'info_dict': { + 'id': 'b9c8221a-4e50-479a-b86d-3333323e38ba', + 'ext': 'mp4', + 'title': 'Auction Hunters|Can Allen Ride A Hundred Year-Old Motorcycle?', + 'description': 'md5:fbed7e82ed5fad493615b3094a9499cb', + }, + } + + _FEED_URL = 'http://www.spike.com/feeds/mrss/' diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index 90d8a6d07..f13ba1c8e 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -1,4 +1,6 @@ # encoding: utf-8 +from __future__ import unicode_literals + import re import json @@ -10,19 +12,27 @@ from ..utils import ( class VKIE(InfoExtractor): - IE_NAME = u'vk.com' + IE_NAME = 'vk.com' _VALID_URL = r'https?://vk\.com/(?:videos.*?\?.*?z=)?video(?P<id>.*?)(?:\?|%2F|$)' - _TEST = { - u'url': u'http://vk.com/videos-77521?z=video-77521_162222515%2Fclub77521', - u'md5': u'0deae91935c54e00003c2a00646315f0', - u'info_dict': { - u'id': u'162222515', - u'ext': u'flv', - u'title': u'ProtivoGunz - Хуёвая песня', - u'uploader': u'Noize MC', + _TESTS = [{ + 'url': 'http://vk.com/videos-77521?z=video-77521_162222515%2Fclub77521', + 'file': '162222515.flv', + 'md5': '0deae91935c54e00003c2a00646315f0', + 'info_dict': { + 'title': 'ProtivoGunz - Хуёвая песня', + 'uploader': 'Noize MC', }, - } + }, + { + 'url': 'http://vk.com/video4643923_163339118', + 'file': '163339118.mp4', + 'md5': 'f79bccb5cd182b1f43502ca5685b2b36', + 'info_dict': { + 'uploader': 'Elvira Dzhonik', + 'title': 'Dream Theater - Hollow Years Live at Budokan 720*', + } + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -33,13 +43,21 @@ class VKIE(InfoExtractor): if m_yt is not None: self.to_screen(u'Youtube video detected') return self.url_result(m_yt.group(1), 'Youtube') - vars_json = self._search_regex(r'var vars = ({.*?});', info_page, u'vars') - vars = json.loads(vars_json) + data_json = self._search_regex(r'var vars = ({.*?});', info_page, 'vars') + data = json.loads(data_json) + + formats = [{ + 'format_id': k, + 'url': v, + 'width': int(k[len('url'):]), + } for k, v in data.items() + if k.startswith('url')] + self._sort_formats(formats) return { - 'id': compat_str(vars['vid']), - 'url': vars['url240'], - 'title': unescapeHTML(vars['md_title']), - 'thumbnail': vars['jpg'], - 'uploader': vars['md_author'], + 'id': compat_str(data['vid']), + 'formats': formats, + 'title': unescapeHTML(data['md_title']), + 'thumbnail': data.get('jpg'), + 'uploader': data.get('md_author'), } diff --git a/youtube_dl/extractor/wimp.py b/youtube_dl/extractor/wimp.py index 82a626e0e..9a6bb0c76 100644 --- a/youtube_dl/extractor/wimp.py +++ b/youtube_dl/extractor/wimp.py @@ -1,5 +1,6 @@ +from __future__ import unicode_literals + import re -import base64 from .common import InfoExtractor @@ -7,12 +8,12 @@ from .common import InfoExtractor class WimpIE(InfoExtractor): _VALID_URL = r'(?:http://)?(?:www\.)?wimp\.com/([^/]+)/' _TEST = { - u'url': u'http://www.wimp.com/deerfence/', - u'file': u'deerfence.flv', - u'md5': u'8b215e2e0168c6081a1cf84b2846a2b5', - u'info_dict': { - u"title": u"Watch Till End: Herd of deer jump over a fence.", - u"description": u"These deer look as fluid as running water when they jump over this fence as a herd. This video is one that needs to be watched until the very end for the true majesty to be witnessed, but once it comes, it's sure to take your breath away.", + 'url': 'http://www.wimp.com/deerfence/', + 'file': 'deerfence.flv', + 'md5': '8b215e2e0168c6081a1cf84b2846a2b5', + 'info_dict': { + "title": "Watch Till End: Herd of deer jump over a fence.", + "description": "These deer look as fluid as running water when they jump over this fence as a herd. This video is one that needs to be watched until the very end for the true majesty to be witnessed, but once it comes, it's sure to take your breath away.", } } @@ -20,13 +21,12 @@ class WimpIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) video_id = mobj.group(1) webpage = self._download_webpage(url, video_id) - googleString = self._search_regex("googleCode = '(.*?)'", webpage, 'file url') - googleString = base64.b64decode(googleString).decode('ascii') - final_url = self._search_regex('","(.*?)"', googleString, u'final video url') + video_url = self._search_regex( + r's1\.addVariable\("file",\s*"([^"]+)"\);', webpage, 'video URL') return { 'id': video_id, - 'url': final_url, + 'url': video_url, 'title': self._og_search_title(webpage), 'thumbnail': self._og_search_thumbnail(webpage), 'description': self._og_search_description(webpage), diff --git a/youtube_dl/extractor/xvideos.py b/youtube_dl/extractor/xvideos.py index 90138d7e5..85e99e1b0 100644 --- a/youtube_dl/extractor/xvideos.py +++ b/youtube_dl/extractor/xvideos.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals + import re from .common import InfoExtractor @@ -9,12 +11,12 @@ from ..utils import ( class XVideosIE(InfoExtractor): _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)' _TEST = { - u'url': u'http://www.xvideos.com/video939581/funny_porns_by_s_-1', - u'file': u'939581.flv', - u'md5': u'1d0c835822f0a71a7bf011855db929d0', - u'info_dict': { - u"title": u"Funny Porns By >>>>S<<<<<< -1", - u"age_limit": 18, + 'url': 'http://www.xvideos.com/video939581/funny_porns_by_s_-1', + 'file': '939581.flv', + 'md5': '1d0c835822f0a71a7bf011855db929d0', + 'info_dict': { + "title": "Funny Porns By >>>>S<<<<<< -1", + "age_limit": 18, } } @@ -27,18 +29,18 @@ class XVideosIE(InfoExtractor): self.report_extraction(video_id) # Extract video URL - video_url = compat_urllib_parse.unquote(self._search_regex(r'flv_url=(.+?)&', - webpage, u'video URL')) + video_url = compat_urllib_parse.unquote( + self._search_regex(r'flv_url=(.+?)&', webpage, 'video URL')) # Extract title - video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID', - webpage, u'title') + video_title = self._html_search_regex( + r'<title>(.*?)\s+-\s+XVID', webpage, 'title') # Extract video thumbnail - video_thumbnail = self._search_regex(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', - webpage, u'thumbnail', fatal=False) + video_thumbnail = self._search_regex( + r'url_bigthumb=(.+?)&', webpage, 'thumbnail', fatal=False) - info = { + return { 'id': video_id, 'url': video_url, 'uploader': None, @@ -49,5 +51,3 @@ class XVideosIE(InfoExtractor): 'description': None, 'age_limit': 18, } - - return [info] diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index dd1a58f3f..57b8fdff7 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -27,6 +27,7 @@ from ..utils import ( get_element_by_id, get_element_by_attribute, ExtractorError, + int_or_none, PagedList, RegexNotFoundError, unescapeHTML, @@ -271,6 +272,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): u"uploader_id": u"setindia" } }, + { + u"url": u"http://www.youtube.com/watch?v=a9LDPn-MO4I", + u"file": u"a9LDPn-MO4I.m4a", + u"note": u"256k DASH audio (format 141) via DASH manifest", + u"params": { + u"format": "141" + }, + u"info_dict": { + u"upload_date": "20121002", + u"uploader_id": "8KVIDEO", + u"description": "No description available.", + u"uploader": "8KVIDEO", + u"title": "UHDTV TEST 8K VIDEO.mp4" + } + }, ] @@ -1068,18 +1084,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): video_id = mobj.group(2) return video_id - def _get_video_url_list(self, url_map): - """ - Transform a dictionary in the format {itag:url} to a list of (itag, url) - with the requested formats. - """ - existing_formats = [x for x in self._formats if x in url_map] - if len(existing_formats) == 0: - raise ExtractorError(u'no known formats available for video') - video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats - video_url_list.reverse() # order worst to best - return video_url_list - def _extract_from_m3u8(self, manifest_url, video_id): url_map = {} def _get_urls(_manifest): @@ -1253,7 +1257,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): video_annotations = self._extract_annotations(video_id) # Decide which formats to download - try: mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage) if not mobj: @@ -1278,9 +1281,26 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): except ValueError: pass + def _map_to_format_list(urlmap): + formats = [] + for itag, video_real_url in urlmap.items(): + dct = { + 'format_id': itag, + 'url': video_real_url, + 'player_url': player_url, + } + dct.update(self._formats[itag]) + formats.append(dct) + return formats + if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'): self.report_rtmp_download() - video_url_list = [('_rtmp', video_info['conn'][0])] + formats = [{ + 'format_id': '_rtmp', + 'protocol': 'rtmp', + 'url': video_info['conn'][0], + 'player_url': player_url, + }] elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1: encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts',[''])[0] if 'rtmpe%3Dyes' in encoded_url_map: @@ -1325,23 +1345,49 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): if 'ratebypass' not in url: url += '&ratebypass=yes' url_map[url_data['itag'][0]] = url - video_url_list = self._get_video_url_list(url_map) + formats = _map_to_format_list(url_map) elif video_info.get('hlsvp'): manifest_url = video_info['hlsvp'][0] url_map = self._extract_from_m3u8(manifest_url, video_id) - video_url_list = self._get_video_url_list(url_map) + formats = _map_to_format_list(url_map) else: raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info') - formats = [] - for itag, video_real_url in video_url_list: - dct = { - 'format_id': itag, - 'url': video_real_url, - 'player_url': player_url, - } - dct.update(self._formats[itag]) - formats.append(dct) + # Look for the DASH manifest + dash_manifest_url_lst = video_info.get('dashmpd') + if dash_manifest_url_lst and dash_manifest_url_lst[0]: + try: + dash_doc = self._download_xml( + dash_manifest_url_lst[0], video_id, + note=u'Downloading DASH manifest', + errnote=u'Could not download DASH manifest') + for r in dash_doc.findall(u'.//{urn:mpeg:DASH:schema:MPD:2011}Representation'): + url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL') + if url_el is None: + continue + format_id = r.attrib['id'] + video_url = url_el.text + filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength')) + f = { + 'format_id': format_id, + 'url': video_url, + 'width': int_or_none(r.attrib.get('width')), + 'tbr': int_or_none(r.attrib.get('bandwidth'), 1000), + 'asr': int_or_none(r.attrib.get('audioSamplingRate')), + 'filesize': filesize, + } + try: + existing_format = next( + fo for fo in formats + if fo['format_id'] == format_id) + except StopIteration: + f.update(self._formats.get(format_id, {})) + formats.append(f) + else: + existing_format.update(f) + + except (ExtractorError, KeyError) as e: + self.report_warning(u'Skipping DASH manifest: %s' % e, video_id) self._sort_formats(formats) |