diff options
-rw-r--r-- | test/test_download.py | 4 | ||||
-rw-r--r-- | youtube_dl/__init__.py | 2 | ||||
-rw-r--r-- | youtube_dl/extractor/__init__.py | 7 | ||||
-rw-r--r-- | youtube_dl/extractor/mit.py | 134 | ||||
-rw-r--r-- | youtube_dl/extractor/mixcloud.py | 2 | ||||
-rw-r--r-- | youtube_dl/extractor/nbc.py | 24 | ||||
-rw-r--r-- | youtube_dl/extractor/theplatform.py | 14 |
7 files changed, 147 insertions, 40 deletions
diff --git a/test/test_download.py b/test/test_download.py
index ff571c48f..bbbb6b78a 100644
--- a/test/test_download.py
+++ b/test/test_download.py
@@ -73,9 +73,7 @@ def generator(test_case):
         if 'playlist' not in test_case:
             info_dict = test_case.get('info_dict', {})
             if not test_case.get('file') and not (info_dict.get('id') and info_dict.get('ext')):
-                print_skipping('The output file cannot be know, the "file" '
-                    'key is missing or the info_dict is incomplete')
-                return
+                raise Exception('Test definition incorrect. The output file cannot be known. Are both \'id\' and \'ext\' keys present?')
         if 'skip' in test_case:
             print_skipping(test_case['skip'])
             return
diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py
index 2aaafd37a..910f3f4b9 100644
--- a/youtube_dl/__init__.py
+++ b/youtube_dl/__init__.py
@@ -47,12 +47,14 @@ __authors__ = (
     'Michael Kaiser',
     'Niklas Laxström',
     'David Triendl',
+    'Anthony Weems',
 )

 __license__ = 'Public Domain'

 import codecs
 import getpass
+import io
 import locale
 import optparse
 import os
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index 8eff3df41..e16bbd969 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -137,7 +137,7 @@ from .malemotion import MalemotionIE
 from .mdr import MDRIE
 from .metacafe import MetacafeIE
 from .metacritic import MetacriticIE
-from .mit import TechTVMITIE, MITIE
+from .mit import TechTVMITIE, MITIE, OCWMITIE
 from .mixcloud import MixcloudIE
 from .mpora import MporaIE
 from .mofosex import MofosexIE
@@ -152,7 +152,10 @@ from .myspass import MySpassIE
 from .myvideo import MyVideoIE
 from .naver import NaverIE
 from .nba import NBAIE
-from .nbc import NBCNewsIE
+from .nbc import (
+    NBCIE,
+    NBCNewsIE,
+)
 from .ndr import NDRIE
 from .ndtv import NDTVIE
 from .newgrounds import NewgroundsIE
diff --git a/youtube_dl/extractor/mit.py b/youtube_dl/extractor/mit.py
index 76b717fe5..06d331bbc 100644
--- a/youtube_dl/extractor/mit.py
+++ b/youtube_dl/extractor/mit.py
@@ -1,24 +1,30 @@
+from __future__ import unicode_literals
+
 import re
 import json

 from .common import InfoExtractor
+from .youtube import YoutubeIE
 from ..utils import (
+    compat_urlparse,
+    ExtractorError,
     clean_html,
     get_element_by_id,
 )


 class TechTVMITIE(InfoExtractor):
-    IE_NAME = u'techtv.mit.edu'
+    IE_NAME = 'techtv.mit.edu'
     _VALID_URL = r'https?://techtv\.mit\.edu/(videos|embeds)/(?P<id>\d+)'

     _TEST = {
-        u'url': u'http://techtv.mit.edu/videos/25418-mit-dna-learning-center-set',
-        u'file': u'25418.mp4',
-        u'md5': u'1f8cb3e170d41fd74add04d3c9330e5f',
-        u'info_dict': {
-            u'title': u'MIT DNA Learning Center Set',
-            u'description': u'md5:82313335e8a8a3f243351ba55bc1b474',
+        'url': 'http://techtv.mit.edu/videos/25418-mit-dna-learning-center-set',
+        'md5': '1f8cb3e170d41fd74add04d3c9330e5f',
+        'info_dict': {
+            'id': '25418',
+            'ext': 'mp4',
+            'title': 'MIT DNA Learning Center Set',
+            'description': 'md5:82313335e8a8a3f243351ba55bc1b474',
         },
     }

@@ -27,12 +33,12 @@ class TechTVMITIE(InfoExtractor):
         video_id = mobj.group('id')
         raw_page = self._download_webpage(
             'http://techtv.mit.edu/videos/%s' % video_id, video_id)
-        clean_page = re.compile(u'<!--.*?-->', re.S).sub(u'', raw_page)
+        clean_page = re.compile(r'<!--.*?-->', re.S).sub('', raw_page)

-        base_url = self._search_regex(r'ipadUrl: \'(.+?cloudfront.net/)',
-            raw_page, u'base url')
-        formats_json = self._search_regex(r'bitrates: (\[.+?\])', raw_page,
-            u'video formats')
+        base_url = self._search_regex(
+            r'ipadUrl: \'(.+?cloudfront.net/)', raw_page, 'base url')
+        formats_json = self._search_regex(
+            r'bitrates: (\[.+?\])', raw_page, 'video formats')
         formats_mit = json.loads(formats_json)
         formats = [
             {
@@ -48,28 +54,31 @@ class TechTVMITIE(InfoExtractor):

         title = get_element_by_id('edit-title', clean_page)
         description = clean_html(get_element_by_id('edit-description', clean_page))
-        thumbnail = self._search_regex(r'playlist:.*?url: \'(.+?)\'',
-            raw_page, u'thumbnail', flags=re.DOTALL)
+        thumbnail = self._search_regex(
+            r'playlist:.*?url: \'(.+?)\'',
+            raw_page, 'thumbnail', flags=re.DOTALL)

-        return {'id': video_id,
-                'title': title,
-                'formats': formats,
-                'description': description,
-                'thumbnail': thumbnail,
-                }
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+            'description': description,
+            'thumbnail': thumbnail,
+        }


 class MITIE(TechTVMITIE):
-    IE_NAME = u'video.mit.edu'
+    IE_NAME = 'video.mit.edu'
     _VALID_URL = r'https?://video\.mit\.edu/watch/(?P<title>[^/]+)'

     _TEST = {
-        u'url': u'http://video.mit.edu/watch/the-government-is-profiling-you-13222/',
-        u'file': u'21783.mp4',
-        u'md5': u'7db01d5ccc1895fc5010e9c9e13648da',
-        u'info_dict': {
-            u'title': u'The Government is Profiling You',
-            u'description': u'md5:ad5795fe1e1623b73620dbfd47df9afd',
+        'url': 'http://video.mit.edu/watch/the-government-is-profiling-you-13222/',
+        'md5': '7db01d5ccc1895fc5010e9c9e13648da',
+        'info_dict': {
+            'id': '21783',
+            'ext': 'mp4',
+            'title': 'The Government is Profiling You',
+            'description': 'md5:ad5795fe1e1623b73620dbfd47df9afd',
         },
     }

@@ -77,7 +86,72 @@ class MITIE(TechTVMITIE):
         mobj = re.match(self._VALID_URL, url)
         page_title = mobj.group('title')
         webpage = self._download_webpage(url, page_title)
-        self.to_screen('%s: Extracting %s url' % (page_title, TechTVMITIE.IE_NAME))
-        embed_url = self._search_regex(r'<iframe .*?src="(.+?)"', webpage,
-            u'embed url')
+        embed_url = self._search_regex(
+            r'<iframe .*?src="(.+?)"', webpage, 'embed url')
         return self.url_result(embed_url, ie='TechTVMIT')
+
+
+class OCWMITIE(InfoExtractor):
+    IE_NAME = 'ocw.mit.edu'
+    _VALID_URL = r'^http://ocw\.mit\.edu/courses/(?P<topic>[a-z0-9\-]+)'
+    _BASE_URL = 'http://ocw.mit.edu/'
+
+    _TESTS = [
+        {
+            'url': 'http://ocw.mit.edu/courses/electrical-engineering-and-computer-science/6-041-probabilistic-systems-analysis-and-applied-probability-fall-2010/video-lectures/lecture-7-multiple-variables-expectations-independence/',
+            'info_dict': {
+                'id': 'EObHWIEKGjA',
+                'ext': 'mp4',
+                'title': 'Lecture 7: Multiple Discrete Random Variables: Expectations, Conditioning, Independence',
+                'description': 'In this lecture, the professor discussed multiple random variables, expectations, and binomial distribution.',
+                #'subtitles': 'http://ocw.mit.edu/courses/electrical-engineering-and-computer-science/6-041-probabilistic-systems-analysis-and-applied-probability-fall-2010/video-lectures/lecture-7-multiple-variables-expectations-independence/MIT6_041F11_lec07_300k.mp4.srt'
+            }
+        },
+        {
+            'url': 'http://ocw.mit.edu/courses/mathematics/18-01sc-single-variable-calculus-fall-2010/1.-differentiation/part-a-definition-and-basic-rules/session-1-introduction-to-derivatives/',
+            'info_dict': {
+                'id': '7K1sB05pE0A',
+                'ext': 'mp4',
+                'title': 'Session 1: Introduction to Derivatives',
+                'description': 'This section contains lecture video excerpts, lecture notes, an interactive mathlet with supporting documents, and problem solving videos.',
+                #'subtitles': 'http://ocw.mit.edu//courses/mathematics/18-01sc-single-variable-calculus-fall-2010/ocw-18.01-f07-lec01_300k.SRT'
+            }
+        }
+    ]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        topic = mobj.group('topic')
+
+        webpage = self._download_webpage(url, topic)
+        title = self._html_search_meta('WT.cg_s', webpage)
+        description = self._html_search_meta('Description', webpage)
+
+        # search for call to ocw_embed_chapter_media(container_id, media_url, provider, page_url, image_url, start, stop, captions_file)
+        embed_chapter_media = re.search(r'ocw_embed_chapter_media\((.+?)\)', webpage)
+        if embed_chapter_media:
+            metadata = re.sub(r'[\'"]', '', embed_chapter_media.group(1))
+            metadata = re.split(r', ?', metadata)
+            yt = metadata[1]
+            subs = compat_urlparse.urljoin(self._BASE_URL, metadata[7])
+        else:
+            # search for call to ocw_embed_media(container_id, media_url, provider, page_url, image_url, captions_file)
+            embed_media = re.search(r'ocw_embed_media\((.+?)\)', webpage)
+            if embed_media:
+                metadata = re.sub(r'[\'"]', '', embed_media.group(1))
+                metadata = re.split(r', ?', metadata)
+                yt = metadata[1]
+                subs = compat_urlparse.urljoin(self._BASE_URL, metadata[5])
+            else:
+                raise ExtractorError('Unable to find embedded YouTube video.')
+        video_id = YoutubeIE.extract_id(yt)
+
+        return {
+            '_type': 'url_transparent',
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'url': yt,
+            'subtitles': subs,
+            'ie_key': 'Youtube',
+        }
diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py
index f3356db50..b8d363eb6 100644
--- a/youtube_dl/extractor/mixcloud.py
+++ b/youtube_dl/extractor/mixcloud.py
@@ -10,7 +10,7 @@ from ..utils import (


 class MixcloudIE(InfoExtractor):
-    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
+    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([^/]+)/([^/]+)'
     IE_NAME = 'mixcloud'

     _TEST = {
diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py
index ff750de3f..1a63ab56a 100644
--- a/youtube_dl/extractor/nbc.py
+++ b/youtube_dl/extractor/nbc.py
@@ -6,6 +6,30 @@ from .common import InfoExtractor
 from ..utils import find_xpath_attr, compat_str


+class NBCIE(InfoExtractor):
+    _VALID_URL = r'http://www\.nbc\.com/[^/]+/video/[^/]+/(?P<id>n?\d+)'
+
+    _TEST = {
+        'url': 'http://www.nbc.com/chicago-fire/video/i-am-a-firefighter/2734188',
+        'md5': '54d0fbc33e0b853a65d7b4de5c06d64e',
+        'info_dict': {
+            'id': 'u1RInQZRN7QJ',
+            'ext': 'flv',
+            'title': 'I Am a Firefighter',
+            'description': 'An emergency puts Dawson\'sf irefighter skills to the ultimate test in this four-part digital series.',
+        },
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        webpage = self._download_webpage(url, video_id)
+        theplatform_url = self._search_regex('class="video-player video-player-full" data-mpx-url="(.*?)"', webpage, 'theplatform url')
+        if theplatform_url.startswith('//'):
+            theplatform_url = 'http:' + theplatform_url
+        return self.url_result(theplatform_url)
+
+
 class NBCNewsIE(InfoExtractor):
     _VALID_URL = r'https?://www\.nbcnews\.com/video/.+?/(?P<id>\d+)'

diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py
index d60702325..91f2453eb 100644
--- a/youtube_dl/extractor/theplatform.py
+++ b/youtube_dl/extractor/theplatform.py
@@ -13,7 +13,7 @@ _x = lambda p: xpath_with_ns(p, {'smil': 'http://www.w3.org/2005/SMIL21/Language
 class ThePlatformIE(InfoExtractor):
     _VALID_URL = r'''(?x)
         (?:https?://(?:link|player)\.theplatform\.com/[sp]/[^/]+/
-           (?P<config>[^/\?]+/(?:swf|config)/select/)?
+           (?P<config>(?:[^/\?]+/(?:swf|config)|onsite)/select/)?
          |theplatform:)(?P<id>[^/\?&]+)'''

     _TEST = {
@@ -54,10 +54,15 @@ class ThePlatformIE(InfoExtractor):

         f4m_node = body.find(_x('smil:seq/smil:video'))
         if f4m_node is not None:
+            f4m_url = f4m_node.attrib['src']
+            if 'manifest.f4m?' not in f4m_url:
+                f4m_url += '?'
+            # the parameters are from syfy.com, other sites may use others,
+            # they also work for nbc.com
+            f4m_url += '&g=UXWGVKRWHFSP&hdcore=3.0.3'
             formats = [{
                 'ext': 'flv',
-                # the parameters are from syfy.com, other sites may use others
-                'url': f4m_node.attrib['src'] + '?g=UXWGVKRWHFSP&hdcore=3.0.3',
+                'url': f4m_url,
             }]
         else:
             base_url = head.find(_x('smil:meta')).attrib['base']
@@ -95,9 +100,10 @@ class ThePlatformIE(InfoExtractor):
         if mobj.group('config'):
             config_url = url+ '&form=json'
             config_url = config_url.replace('swf/', 'config/')
+            config_url = config_url.replace('onsite/', 'onsite/config/')
             config_json = self._download_webpage(config_url, video_id, u'Downloading config')
             config = json.loads(config_json)
-            smil_url = config['releaseUrl'] + '&format=SMIL&formats=MPEG4'
+            smil_url = config['releaseUrl'] + '&format=SMIL&formats=MPEG4&manifest=f4m'
         else:
             smil_url = ('http://link.theplatform.com/s/dJ5BDC/{0}/meta.smil?'
                 'format=smil&mbr=true'.format(video_id))