diff options
24 files changed, 299 insertions, 209 deletions
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 7b177e343..7a2a09ab0 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -6,6 +6,7 @@ from .bandcamp import BandcampIE from .bliptv import BlipTVIE, BlipTVUserIE from .breakcom import BreakIE from .brightcove import BrightcoveIE +from .canalplus import CanalplusIE from .collegehumor import CollegeHumorIE from .comedycentral import ComedyCentralIE from .criterion import CriterionIE @@ -37,6 +38,7 @@ from .jukebox import JukeboxIE from .justintv import JustinTVIE from .keek import KeekIE from .liveleak import LiveLeakIE +from .livestream import LivestreamIE from .metacafe import MetacafeIE from .mixcloud import MixcloudIE from .mtv import MTVIE diff --git a/youtube_dl/extractor/canalplus.py b/youtube_dl/extractor/canalplus.py new file mode 100644 index 000000000..3b1c88876 --- /dev/null +++ b/youtube_dl/extractor/canalplus.py @@ -0,0 +1,46 @@ +import re +import xml.etree.ElementTree + +from .common import InfoExtractor +from ..utils import unified_strdate + +class CanalplusIE(InfoExtractor): + _VALID_URL = r'https?://www\.canalplus\.fr/.*?\?vid=(?P<id>\d+)' + _VIDEO_INFO_TEMPLATE = 'http://service.canal-plus.com/video/rest/getVideosLiees/cplus/%s' + IE_NAME = u'canalplus.fr' + + _TEST = { + u'url': u'http://www.canalplus.fr/c-divertissement/pid3351-c-le-petit-journal.html?vid=889861', + u'file': u'889861.flv', + u'md5': u'590a888158b5f0d6832f84001fbf3e99', + u'info_dict': { + u'title': u'Le Petit Journal 20/06/13 - La guerre des drone', + u'upload_date': u'20130620', + }, + u'skip': u'Requires rtmpdump' + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + info_url = self._VIDEO_INFO_TEMPLATE % video_id + info_page = self._download_webpage(info_url,video_id, + u'Downloading video info') + + self.report_extraction(video_id) + doc = xml.etree.ElementTree.fromstring(info_page.encode('utf-8')) + video_info = [video for video in doc if video.find('ID').text == video_id][0] + infos = video_info.find('INFOS') + media = video_info.find('MEDIA') + formats = [media.find('VIDEOS/%s' % format) + for format in ['BAS_DEBIT', 'HAUT_DEBIT', 'HD']] + video_url = [format.text for format in formats if format is not None][-1] + + return {'id': video_id, + 'title': u'%s - %s' % (infos.find('TITRAGE/TITRE').text, + infos.find('TITRAGE/SOUS_TITRE').text), + 'url': video_url, + 'ext': 'flv', + 'upload_date': unified_strdate(infos.find('PUBLICATION/DATE').text), + 'thumbnail': media.find('IMAGES/GRAND').text, + } diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 1bd5538ca..ec988fc90 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -125,6 +125,11 @@ class InfoExtractor(object): def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None): """ Returns a tuple (page content as string, URL handle) """ + + # Strip hashes from the URL (#1038) + if isinstance(url_or_request, (compat_str, str)): + url_or_request = url_or_request.partition('#')[0] + urlh = self._request_webpage(url_or_request, video_id, note, errnote) content_type = urlh.headers.get('Content-Type', '') m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type) @@ -257,6 +262,30 @@ class InfoExtractor(object): return (username, password) + # Helper functions for extracting OpenGraph info + @staticmethod + def _og_regex(prop): + return r'<meta.+?property=[\'"]og:%s[\'"].+?content=(?:"(.+?)"|\'(.+?)\')' % re.escape(prop) + + def _og_search_property(self, prop, html, name=None, **kargs): + if name is None: + name = 'OpenGraph %s' % prop + return self._html_search_regex(self._og_regex(prop), html, name, flags=re.DOTALL, **kargs) + + def _og_search_thumbnail(self, html, **kargs): + return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs) + + def _og_search_description(self, html, **kargs): + return self._og_search_property('description', html, fatal=False, **kargs) + + def _og_search_title(self, html, **kargs): + return self._og_search_property('title', html, **kargs) + + def _og_search_video_url(self, html, name='video url', **kargs): + return self._html_search_regex([self._og_regex('video:secure_url'), + self._og_regex('video')], + html, name, **kargs) + class SearchInfoExtractor(InfoExtractor): """ Base class for paged search queries extractors. diff --git a/youtube_dl/extractor/criterion.py b/youtube_dl/extractor/criterion.py index a149d2900..31fe3d57b 100644 --- a/youtube_dl/extractor/criterion.py +++ b/youtube_dl/extractor/criterion.py @@ -3,38 +3,38 @@ import re from .common import InfoExtractor +from ..utils import determine_ext class CriterionIE(InfoExtractor): - _VALID_URL = r'http://www.criterion.com/films/(.*)' + _VALID_URL = r'https?://www\.criterion\.com/films/(\d*)-.+' _TEST = { u'url': u'http://www.criterion.com/films/184-le-samourai', u'file': u'184.mp4', u'md5': u'bc51beba55685509883a9a7830919ec3', u'info_dict': { u"title": u"Le Samouraï", - u"description" : u"In a career-defining performance, Alain Delon plays a contract killer with samurai instincts. A razor-sharp cocktail of 1940s American gangster cinema and 1960s French pop culture, maverick director Jean-Pierre Melville's masterpiece _Le Samouraï_ defines cool. " + u"description" : u'md5:a2b4b116326558149bef81f76dcbb93f', } } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - video_id = mobj.group(1).split('-')[0] + video_id = mobj.group(1) webpage = self._download_webpage(url, video_id) final_url = self._search_regex(r'so.addVariable\("videoURL", "(.+?)"\)\;', webpage, 'video url') - title = self._search_regex(r'<meta content="(.+?)" property="og:title" />', + title = self._html_search_regex(r'<meta content="(.+?)" property="og:title" />', webpage, 'video title') - description = self._search_regex(r'<meta name="description" content="(.+?)" />', + description = self._html_search_regex(r'<meta name="description" content="(.+?)" />', webpage, 'video description') thumbnail = self._search_regex(r'so.addVariable\("thumbnailURL", "(.+?)"\)\;', webpage, 'thumbnail url') - ext = final_url.split('.')[-1] return {'id': video_id, 'url' : final_url, 'title': title, - 'ext': ext, + 'ext': determine_ext(final_url), 'description': description, 'thumbnail': thumbnail, } diff --git a/youtube_dl/extractor/cspan.py b/youtube_dl/extractor/cspan.py index a4853279b..7bf03c584 100644 --- a/youtube_dl/extractor/cspan.py +++ b/youtube_dl/extractor/cspan.py @@ -34,8 +34,6 @@ class CSpanIE(InfoExtractor): description = self._html_search_regex(r'<meta (?:property="og:|name=")description" content="(.*?)"', webpage, 'description', flags=re.MULTILINE|re.DOTALL) - thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.*?)"', - webpage, 'thumbnail') url = self._search_regex(r'<string name="URL">(.*?)</string>', video_info, 'video url') @@ -49,5 +47,5 @@ class CSpanIE(InfoExtractor): 'url': url, 'play_path': path, 'description': description, - 'thumbnail': thumbnail, + 'thumbnail': self._og_search_thumbnail(webpage), } diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 5fd2221a7..9bf7a28ca 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -39,9 +39,6 @@ class DailymotionIE(InfoExtractor): # Extract URL, uploader and title from webpage self.report_extraction(video_id) - video_title = self._html_search_regex(r'<meta property="og:title" content="(.*?)" />', - webpage, 'title') - video_uploader = self._search_regex([r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', # Looking for official user r'<(?:span|a) .*?rel="author".*?>([^<]+?)</'], @@ -76,7 +73,7 @@ class DailymotionIE(InfoExtractor): 'url': video_url, 'uploader': video_uploader, 'upload_date': video_upload_date, - 'title': video_title, + 'title': self._og_search_title(webpage), 'ext': video_extension, 'thumbnail': info['thumbnail_url'] }] diff --git a/youtube_dl/extractor/ehow.py b/youtube_dl/extractor/ehow.py index 1f0b3888e..2bb77aec6 100644 --- a/youtube_dl/extractor/ehow.py +++ b/youtube_dl/extractor/ehow.py @@ -28,14 +28,9 @@ class EHowIE(InfoExtractor): video_url = self._search_regex(r'(?:file|source)=(http[^\'"&]*)', webpage, u'video URL') final_url = compat_urllib_parse.unquote(video_url) - thumbnail_url = self._search_regex(r'<meta property="og:image" content="(.+?)" />', - webpage, u'thumbnail URL') uploader = self._search_regex(r'<meta name="uploader" content="(.+?)" />', webpage, u'uploader') - title = self._search_regex(r'<meta property="og:title" content="(.+?)" />', - webpage, u'Video title').replace(' | eHow', '') - description = self._search_regex(r'<meta property="og:description" content="(.+?)" />', - webpage, u'video description') + title = self._og_search_title(webpage).replace(' | eHow', '') ext = determine_ext(final_url) return { @@ -44,8 +39,8 @@ class EHowIE(InfoExtractor): 'url': final_url, 'ext': ext, 'title': title, - 'thumbnail': thumbnail_url, - 'description': description, + 'thumbnail': self._og_search_thumbnail(webpage), + 'description': self._og_search_description(webpage), 'uploader': uploader, } diff --git a/youtube_dl/extractor/escapist.py b/youtube_dl/extractor/escapist.py index 794460e84..3aa2da52c 100644 --- a/youtube_dl/extractor/escapist.py +++ b/youtube_dl/extractor/escapist.py @@ -36,11 +36,7 @@ class EscapistIE(InfoExtractor): videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"', webpage, u'description', fatal=False) - imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"', - webpage, u'thumbnail', fatal=False) - - playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"', - webpage, u'player url') + playerUrl = self._og_search_video_url(webpage, name='player url') title = self._html_search_regex('<meta name="title" content="([^"]*)"', webpage, u'player url').split(' : ')[-1] @@ -70,7 +66,7 @@ class EscapistIE(InfoExtractor): 'upload_date': None, 'title': title, 'ext': 'mp4', - 'thumbnail': imgUrl, + 'thumbnail': self._og_search_thumbnail(webpage), 'description': videoDesc, 'player_url': playerUrl, } diff --git a/youtube_dl/extractor/flickr.py b/youtube_dl/extractor/flickr.py index bd97bff9a..80d96baf7 100644 --- a/youtube_dl/extractor/flickr.py +++ b/youtube_dl/extractor/flickr.py @@ -47,21 +47,12 @@ class FlickrIE(InfoExtractor): raise ExtractorError(u'Unable to extract video url') video_url = mobj.group(1) + unescapeHTML(mobj.group(2)) - video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')', - webpage, u'video title') - - video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')', - webpage, u'description', fatal=False) - - thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')', - webpage, u'thumbnail', fatal=False) - return [{ 'id': video_id, 'url': video_url, 'ext': 'mp4', - 'title': video_title, - 'description': video_description, - 'thumbnail': thumbnail, + 'title': self._og_search_title(webpage), + 'description': self._og_search_description(webpage), + 'thumbnail': self._og_search_thumbnail(webpage), 'uploader_id': video_uploader_id, }] diff --git a/youtube_dl/extractor/funnyordie.py b/youtube_dl/extractor/funnyordie.py index 388aacf2f..67a7e5f76 100644 --- a/youtube_dl/extractor/funnyordie.py +++ b/youtube_dl/extractor/funnyordie.py @@ -27,14 +27,11 @@ class FunnyOrDieIE(InfoExtractor): title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL) - video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"', - webpage, u'description', fatal=False, flags=re.DOTALL) - info = { 'id': video_id, 'url': video_url, 'ext': 'mp4', 'title': title, - 'description': video_description, + 'description': self._og_search_description(webpage), } return [info] diff --git a/youtube_dl/extractor/gametrailers.py b/youtube_dl/extractor/gametrailers.py index cd438bd2f..3cc02d97e 100644 --- a/youtube_dl/extractor/gametrailers.py +++ b/youtube_dl/extractor/gametrailers.py @@ -1,63 +1,36 @@ import re -import xml.etree.ElementTree -from .common import InfoExtractor -from ..utils import ( - compat_urllib_parse, +from .mtv import MTVIE, _media_xml_tag - ExtractorError, -) - -class GametrailersIE(InfoExtractor): +class GametrailersIE(MTVIE): + """ + Gametrailers use the same videos system as MTVIE, it just changes the feed + url, where the uri is and the method to get the thumbnails. + """ _VALID_URL = r'http://www.gametrailers.com/(?P<type>videos|reviews|full-episodes)/(?P<id>.*?)/(?P<title>.*)' _TEST = { u'url': u'http://www.gametrailers.com/videos/zbvr8i/mirror-s-edge-2-e3-2013--debut-trailer', - u'file': u'70e9a5d7-cf25-4a10-9104-6f3e7342ae0d.flv', - u'md5': u'c3edbc995ab4081976e16779bd96a878', + u'file': u'70e9a5d7-cf25-4a10-9104-6f3e7342ae0d.mp4', + u'md5': u'4c8e67681a0ea7ec241e8c09b3ea8cf7', u'info_dict': { - u"title": u"E3 2013: Debut Trailer" + u'title': u'E3 2013: Debut Trailer', + u'description': u'Faith is back! Check out the World Premiere trailer for Mirror\'s Edge 2 straight from the EA Press Conference at E3 2013!', }, - u'skip': u'Requires rtmpdump' } + # Overwrite MTVIE properties we don't want + _TESTS = [] + + _FEED_URL = 'http://www.gametrailers.com/feeds/mrss' + + def _get_thumbnail_url(self, uri, itemdoc): + search_path = '%s/%s' % (_media_xml_tag('group'), _media_xml_tag('thumbnail')) + return itemdoc.find(search_path).attrib['url'] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) mgid = self._search_regex([r'data-video="(?P<mgid>mgid:.*?)"', r'data-contentId=\'(?P<mgid>mgid:.*?)\''], webpage, u'mgid') - - data = compat_urllib_parse.urlencode({'uri': mgid, 'acceptMethods': 'fms'}) - info_page = self._download_webpage('http://www.gametrailers.com/feeds/mrss?' + data, - video_id, u'Downloading video info') - doc = xml.etree.ElementTree.fromstring(info_page.encode('utf-8')) - default_thumb = doc.find('./channel/image/url').text - - media_namespace = {'media': 'http://search.yahoo.com/mrss/'} - parts = [{ - 'title': video_doc.find('title').text, - 'ext': 'flv', - 'id': video_doc.find('guid').text.rpartition(':')[2], - # Videos are actually flv not mp4 - 'url': self._get_video_url(video_doc.find('media:group/media:content', media_namespace).attrib['url'], video_id), - # The thumbnail may not be defined, it would be '' - 'thumbnail': video_doc.find('media:group/media:thumbnail', media_namespace).attrib['url'] or default_thumb, - 'description': video_doc.find('description').text, - } for video_doc in doc.findall('./channel/item')] - return parts - - def _get_video_url(self, mediagen_url, video_id): - if 'acceptMethods' not in mediagen_url: - mediagen_url += '&acceptMethods=fms' - links_webpage = self._download_webpage(mediagen_url, - video_id, u'Downloading video urls info') - doc = xml.etree.ElementTree.fromstring(links_webpage) - urls = list(doc.iter('src')) - if len(urls) == 0: - raise ExtractorError(u'Unable to extract video url') - # They are sorted from worst to best quality - return urls[-1].text - + return self._get_videos_info(mgid) diff --git a/youtube_dl/extractor/hotnewhiphop.py b/youtube_dl/extractor/hotnewhiphop.py index ca3abb7d7..ccca1d7e0 100644 --- a/youtube_dl/extractor/hotnewhiphop.py +++ b/youtube_dl/extractor/hotnewhiphop.py @@ -33,16 +33,12 @@ class HotNewHipHopIE(InfoExtractor): video_title = self._html_search_regex(r"<title>(.*)</title>", webpage_src, u'title') - - # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video. - thumbnail = self._html_search_regex(r'"og:image" content="(.*)"', - webpage_src, u'thumbnail', fatal=False) results = [{ 'id': video_id, 'url' : video_url, 'title' : video_title, - 'thumbnail' : thumbnail, + 'thumbnail' : self._og_search_thumbnail(webpage_src), 'ext' : 'mp3', }] - return results
\ No newline at end of file + return results diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index 6ae704efd..f9ac8d5b4 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -5,7 +5,7 @@ from .common import InfoExtractor class InstagramIE(InfoExtractor): _VALID_URL = r'(?:http://)?instagram.com/p/(.*?)/' _TEST = { - u'url': u'http://instagram.com/p/aye83DjauH/#', + u'url': u'http://instagram.com/p/aye83DjauH/?foo=bar#abc', u'file': u'aye83DjauH.mp4', u'md5': u'0d2da106a9d2631273e192b372806516', u'info_dict': { @@ -18,25 +18,20 @@ class InstagramIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) video_id = mobj.group(1) webpage = self._download_webpage(url, video_id) - video_url = self._html_search_regex( - r'<meta property="og:video" content="(.+?)"', - webpage, u'video URL') - thumbnail_url = self._html_search_regex( - r'<meta property="og:image" content="(.+?)" />', - webpage, u'thumbnail URL', fatal=False) html_title = self._html_search_regex( r'<title>(.+?)</title>', webpage, u'title', flags=re.DOTALL) title = re.sub(u'(?: *\(Videos?\))? \u2022 Instagram$', '', html_title).strip() - uploader_id = self._html_search_regex(r'content="(.*?)\'s video on Instagram', - webpage, u'uploader name', fatal=False) + uploader_id = self._html_search_regex( + r'<div class="media-user" id="media_user">.*?<h2><a href="[^"]*">([^<]*)</a></h2>', + webpage, u'uploader id', fatal=False, flags=re.DOTALL) ext = 'mp4' return [{ 'id': video_id, - 'url': video_url, + 'url': self._og_search_video_url(webpage), 'ext': ext, 'title': title, - 'thumbnail': thumbnail_url, + 'thumbnail': self._og_search_thumbnail(webpage), 'uploader_id' : uploader_id }] diff --git a/youtube_dl/extractor/keek.py b/youtube_dl/extractor/keek.py index 72ad6a3d0..dda78743d 100644 --- a/youtube_dl/extractor/keek.py +++ b/youtube_dl/extractor/keek.py @@ -24,8 +24,7 @@ class KeekIE(InfoExtractor): thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id webpage = self._download_webpage(url, video_id) - video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"', - webpage, u'title') + video_title = self._og_search_title(webpage) uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage, u'uploader', fatal=False) diff --git a/youtube_dl/extractor/liveleak.py b/youtube_dl/extractor/liveleak.py index cf8a2c931..dd062a14e 100644 --- a/youtube_dl/extractor/liveleak.py +++ b/youtube_dl/extractor/liveleak.py @@ -33,11 +33,9 @@ class LiveLeakIE(InfoExtractor): video_url = self._search_regex(r'file: "(.*?)",', webpage, u'video URL') - video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"', - webpage, u'title').replace('LiveLeak.com -', '').strip() + video_title = self._og_search_title(webpage).replace('LiveLeak.com -', '').strip() - video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"', - webpage, u'description', fatal=False) + video_description = self._og_search_description(webpage) video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>', webpage, u'uploader', fatal=False) diff --git a/youtube_dl/extractor/livestream.py b/youtube_dl/extractor/livestream.py new file mode 100644 index 000000000..309921078 --- /dev/null +++ b/youtube_dl/extractor/livestream.py @@ -0,0 +1,52 @@ +import re +import json + +from .common import InfoExtractor +from ..utils import compat_urllib_parse_urlparse, compat_urlparse + + +class LivestreamIE(InfoExtractor): + _VALID_URL = r'http://new.livestream.com/.*?/(?P<event_name>.*?)(/videos/(?P<id>\d+))?/?$' + _TEST = { + u'url': u'http://new.livestream.com/CoheedandCambria/WebsterHall/videos/4719370', + u'file': u'4719370.mp4', + u'md5': u'0d2186e3187d185a04b3cdd02b828836', + u'info_dict': { + u'title': u'Live from Webster Hall NYC', + u'upload_date': u'20121012', + } + } + + def _extract_video_info(self, video_data): + video_url = video_data.get('progressive_url_hd') or video_data.get('progressive_url') + return {'id': video_data['id'], + 'url': video_url, + 'ext': 'mp4', + 'title': video_data['caption'], + 'thumbnail': video_data['thumbnail_url'], + 'upload_date': video_data['updated_at'].replace('-','')[:8], + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + event_name = mobj.group('event_name') + webpage = self._download_webpage(url, video_id or event_name) + + if video_id is None: + # This is an event page: + api_url = self._search_regex(r'event_design_eventId: \'(.+?)\'', + webpage, 'api url') + info = json.loads(self._download_webpage(api_url, event_name, + u'Downloading event info')) + videos = [self._extract_video_info(video_data['data']) + for video_data in info['feed']['data'] if video_data['type'] == u'video'] + return self.playlist_result(videos, info['id'], info['full_name']) + else: + og_video = self._og_search_video_url(webpage, name=u'player url') + query_str = compat_urllib_parse_urlparse(og_video).query + query = compat_urlparse.parse_qs(query_str) + api_url = query['play_url'][0].replace('.smil', '') + info = json.loads(self._download_webpage(api_url, video_id, + u'Downloading video info')) + return self._extract_video_info(info) diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index 969db7113..8f956571d 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -1,28 +1,110 @@ import re -import socket import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( - compat_http_client, - compat_str, - compat_urllib_error, - compat_urllib_request, - + compat_urllib_parse, ExtractorError, ) +def _media_xml_tag(tag): + return '{http://search.yahoo.com/mrss/}%s' % tag class MTVIE(InfoExtractor): - _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$' - _WORKING = False + _VALID_URL = r'^https?://(?:www\.)?mtv\.com/videos/.+?/(?P<videoid>[0-9]+)/[^/]+$' + + _FEED_URL = 'http://www.mtv.com/player/embed/AS3/rss/' + + _TESTS = [ + { + u'url': u'http://www.mtv.com/videos/misc/853555/ours-vh1-storytellers.jhtml', + u'file': u'853555.mp4', + u'md5': u'850f3f143316b1e71fa56a4edfd6e0f8', + u'info_dict': { + u'title': u'Taylor Swift - "Ours (VH1 Storytellers)"', + u'description': u'Album: Taylor Swift performs "Ours" for VH1 Storytellers at Harvey Mudd College.', + }, + }, + { + u'url': u'http://www.mtv.com/videos/taylor-swift/916187/everything-has-changed-ft-ed-sheeran.jhtml', + u'file': u'USCJY1331283.mp4', + u'md5': u'73b4e7fcadd88929292fe52c3ced8caf', + u'info_dict': { + u'title': u'Everything Has Changed', + u'upload_date': u'20130606', + u'uploader': u'Taylor Swift', + }, + u'skip': u'VEVO is only available in some countries', + }, + ] + + @staticmethod + def _id_from_uri(uri): + return uri.split(':')[-1] + + # This was originally implemented for ComedyCentral, but it also works here + @staticmethod + def _transform_rtmp_url(rtmp_video_url): + m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp\..+?/.*)$', rtmp_video_url) + if not m: + raise ExtractorError(u'Cannot transform RTMP url') + base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/' + return base + m.group('finalid') + + def _get_thumbnail_url(self, uri, itemdoc): + return 'http://mtv.mtvnimages.com/uri/' + uri + + def _extract_video_url(self, metadataXml): + if '/error_country_block.swf' in metadataXml: + raise ExtractorError(u'This video is not available from your country.', expected=True) + mdoc = xml.etree.ElementTree.fromstring(metadataXml.encode('utf-8')) + renditions = mdoc.findall('.//rendition') + + # For now, always pick the highest quality. + rendition = renditions[-1] + + try: + _,_,ext = rendition.attrib['type'].partition('/') + format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate'] + rtmp_video_url = rendition.find('./src').text + except KeyError: + raise ExtractorError('Invalid rendition field.') + video_url = self._transform_rtmp_url(rtmp_video_url) + return {'ext': ext, 'url': video_url, 'format': format} + + def _get_video_info(self, itemdoc): + uri = itemdoc.find('guid').text + video_id = self._id_from_uri(uri) + self.report_extraction(video_id) + mediagen_url = itemdoc.find('%s/%s' % (_media_xml_tag('group'), _media_xml_tag('content'))).attrib['url'] + if 'acceptMethods' not in mediagen_url: + mediagen_url += '&acceptMethods=fms' + mediagen_page = self._download_webpage(mediagen_url, video_id, + u'Downloading video urls') + video_info = self._extract_video_url(mediagen_page) + + description_node = itemdoc.find('description') + if description_node is not None: + description = description_node.text + else: + description = None + video_info.update({'title': itemdoc.find('title').text, + 'id': video_id, + 'thumbnail': self._get_thumbnail_url(uri, itemdoc), + 'description': description, + }) + return video_info + + def _get_videos_info(self, uri): + video_id = self._id_from_uri(uri) + data = compat_urllib_parse.urlencode({'uri': uri}) + infoXml = self._download_webpage(self._FEED_URL +'?' + data, video_id, + u'Downloading info') + idoc = xml.etree.ElementTree.fromstring(infoXml.encode('utf-8')) + return [self._get_video_info(item) for item in idoc.findall('.//item')] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) - if not mobj.group('proto'): - url = 'http://' + url video_id = mobj.group('videoid') webpage = self._download_webpage(url, video_id) @@ -35,46 +117,5 @@ class MTVIE(InfoExtractor): self.to_screen(u'Vevo video detected: %s' % vevo_id) return self.url_result('vevo:%s' % vevo_id, ie='Vevo') - #song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>', - # webpage, u'song name', fatal=False) - - video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>', - webpage, u'title') - - mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>', - webpage, u'mtvn_uri', fatal=False) - - content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', - webpage, u'content id', fatal=False) - - videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri - self.report_extraction(video_id) - request = compat_urllib_request.Request(videogen_url) - try: - metadataXml = compat_urllib_request.urlopen(request).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err)) - - mdoc = xml.etree.ElementTree.fromstring(metadataXml) - renditions = mdoc.findall('.//rendition') - - # For now, always pick the highest quality. - rendition = renditions[-1] - - try: - _,_,ext = rendition.attrib['type'].partition('/') - format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate'] - video_url = rendition.find('./src').text - except KeyError: - raise ExtractorError('Invalid rendition field.') - - info = { - 'id': video_id, - 'url': video_url, - 'upload_date': None, - 'title': video_title, - 'ext': ext, - 'format': format, - } - - return [info] + uri = self._html_search_regex(r'/uri/(.*?)\?', webpage, u'uri') + return self._get_videos_info(uri) diff --git a/youtube_dl/extractor/nba.py b/youtube_dl/extractor/nba.py index 122b7dd26..0f178905b 100644 --- a/youtube_dl/extractor/nba.py +++ b/youtube_dl/extractor/nba.py @@ -30,8 +30,7 @@ class NBAIE(InfoExtractor): video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4' shortened_video_id = video_id.rpartition('/')[2] - title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"', - webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '') + title = self._og_search_title(webpage, default=shortened_video_id).replace('NBA.com: ', '') # It isn't there in the HTML it returns to us # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False) diff --git a/youtube_dl/extractor/statigram.py b/youtube_dl/extractor/statigram.py index ae9a63e8b..b8e6b3bf9 100644 --- a/youtube_dl/extractor/statigram.py +++ b/youtube_dl/extractor/statigram.py @@ -18,12 +18,6 @@ class StatigramIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) video_id = mobj.group(1) webpage = self._download_webpage(url, video_id) - video_url = self._html_search_regex( - r'<meta property="og:video:secure_url" content="(.+?)">', - webpage, u'video URL') - thumbnail_url = self._html_search_regex( - r'<meta property="og:image" content="(.+?)" />', - webpage, u'thumbnail URL', fatal=False) html_title = self._html_search_regex( r'<title>(.+?)</title>', webpage, u'title') @@ -34,9 +28,9 @@ class StatigramIE(InfoExtractor): return [{ 'id': video_id, - 'url': video_url, + 'url': self._og_search_video_url(webpage), 'ext': ext, 'title': title, - 'thumbnail': thumbnail_url, + 'thumbnail': self._og_search_thumbnail(webpage), 'uploader_id' : uploader_id }] diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py index 1dd5e1b68..ec92e589a 100644 --- a/youtube_dl/extractor/teamcoco.py +++ b/youtube_dl/extractor/teamcoco.py @@ -30,15 +30,6 @@ class TeamcocoIE(InfoExtractor): self.report_extraction(video_id) - video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"', - webpage, u'title') - - thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"', - webpage, u'thumbnail', fatal=False) - - video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"', - webpage, u'description', fatal=False) - data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id data = self._download_webpage(data_url, video_id, 'Downloading data webpage') @@ -49,7 +40,7 @@ class TeamcocoIE(InfoExtractor): 'id': video_id, 'url': video_url, 'ext': 'mp4', - 'title': video_title, - 'thumbnail': thumbnail, - 'description': video_description, + 'title': self._og_search_title(webpage), + 'thumbnail': self._og_search_thumbnail(webpage), + 'description': self._og_search_description(webpage), }] diff --git a/youtube_dl/extractor/traileraddict.py b/youtube_dl/extractor/traileraddict.py index 9dd26c163..324bb6231 100644 --- a/youtube_dl/extractor/traileraddict.py +++ b/youtube_dl/extractor/traileraddict.py @@ -24,11 +24,8 @@ class TrailerAddictIE(InfoExtractor): webpage, 'video title').replace(' - Trailer Addict','') view_count = self._search_regex(r'Views: (.+?)<br />', webpage, 'Views Count') - description = self._search_regex(r'<meta property="og:description" content="(.+?)" />', - webpage, 'video description') - video_id = self._search_regex(r'<meta property="og:video" content="(.+?)" />', - webpage, 'Video id').split('=')[1] - + video_id = self._og_search_property('video', webpage, 'Video id').split('=')[1] + info_url = "http://www.traileraddict.com/fvar.php?tid=%s" %(str(video_id)) info_webpage = self._download_webpage(info_url, video_id , "Downloading the info webpage") @@ -44,6 +41,6 @@ class TrailerAddictIE(InfoExtractor): 'ext' : ext, 'title' : title, 'thumbnail' : thumbnail_url, - 'description' : description, + 'description' : self._og_search_description(webpage), 'view_count' : view_count, }] diff --git a/youtube_dl/extractor/tutv.py b/youtube_dl/extractor/tutv.py index fcaa6ac01..4e404fbf5 100644 --- a/youtube_dl/extractor/tutv.py +++ b/youtube_dl/extractor/tutv.py @@ -22,8 +22,6 @@ class TutvIE(InfoExtractor): video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) - title = self._html_search_regex( - r'<meta property="og:title" content="(.*?)">', webpage, u'title') internal_id = self._search_regex(r'codVideo=([0-9]+)', webpage, u'internal video ID') data_url = u'http://tu.tv/flvurl.php?codVideo=' + str(internal_id) @@ -36,6 +34,6 @@ class TutvIE(InfoExtractor): 'id': internal_id, 'url': video_url, 'ext': ext, - 'title': title, + 'title': self._og_search_title(webpage), } return [info] diff --git a/youtube_dl/extractor/vine.py b/youtube_dl/extractor/vine.py index bdd3522eb..c4ec1f06f 100644 --- a/youtube_dl/extractor/vine.py +++ b/youtube_dl/extractor/vine.py @@ -27,12 +27,6 @@ class VineIE(InfoExtractor): video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"', webpage, u'video URL') - video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"', - webpage, u'title') - - thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"', - webpage, u'thumbnail', fatal=False) - uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>', webpage, u'uploader', fatal=False, flags=re.DOTALL) @@ -40,7 +34,7 @@ class VineIE(InfoExtractor): 'id': video_id, 'url': video_url, 'ext': 'mp4', - 'title': video_title, - 'thumbnail': thumbnail, + 'title': self._og_search_title(webpage), + 'thumbnail': self._og_search_thumbnail(webpage), 'uploader': uploader, }] diff --git a/youtube_dl/extractor/youjizz.py b/youtube_dl/extractor/youjizz.py index 6f022670c..1265639e8 100644 --- a/youtube_dl/extractor/youjizz.py +++ b/youtube_dl/extractor/youjizz.py @@ -40,8 +40,20 @@ class YouJizzIE(InfoExtractor): webpage = self._download_webpage(embed_page_url, video_id) # Get the video URL - video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', - webpage, u'video URL') + m_playlist = re.search(r'so.addVariable\("playlist", ?"(?P<playlist>.+?)"\);', webpage) + if m_playlist is not None: + playlist_url = m_playlist.group('playlist') + playlist_page = self._download_webpage(playlist_url, video_id, + u'Downloading playlist page') + m_levels = list(re.finditer(r'<level bitrate="(\d+?)" file="(.*?)"', playlist_page)) + if len(m_levels) == 0: + raise ExtractorError(u'Unable to extract video url') + videos = [(int(m.group(1)), m.group(2)) for m in m_levels] + (_, video_url) = sorted(videos)[0] + video_url = video_url.replace('%252F', '%2F') + else: + video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', + webpage, u'video URL') info = {'id': video_id, 'url': video_url, |