diff options
Diffstat (limited to 'youtube_dl/extractor')
-rw-r--r-- | youtube_dl/extractor/__init__.py | 5 | ||||
-rw-r--r-- | youtube_dl/extractor/crunchyroll.py | 6 | ||||
-rw-r--r-- | youtube_dl/extractor/mit.py | 68 | ||||
-rw-r--r-- | youtube_dl/extractor/mixcloud.py | 2 | ||||
-rw-r--r-- | youtube_dl/extractor/nbc.py | 24 | ||||
-rw-r--r-- | youtube_dl/extractor/theplatform.py | 14 |
6 files changed, 78 insertions, 41 deletions
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index ee081b1f3..e16bbd969 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -152,7 +152,10 @@ from .myspass import MySpassIE from .myvideo import MyVideoIE from .naver import NaverIE from .nba import NBAIE -from .nbc import NBCNewsIE +from .nbc import ( + NBCIE, + NBCNewsIE, +) from .ndr import NDRIE from .ndtv import NDTVIE from .newgrounds import NewgroundsIE diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 5587ade12..026a9177e 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -86,9 +86,8 @@ class CrunchyrollIE(InfoExtractor): return zlib.decompress(decrypted_data) def _convert_subtitles_to_srt(self, subtitles): - i = 1 output = '' - for start, end, text in re.findall(r'<event [^>]*?start="([^"]+)" [^>]*?end="([^"]+)" [^>]*?text="([^"]+)"[^>]*?>', subtitles): + for i, (start, end, text) in enumerate(re.findall(r'<event [^>]*?start="([^"]+)" [^>]*?end="([^"]+)" [^>]*?text="([^"]+)"[^>]*?>', subtitles), 1): start = start.replace('.', ',') end = end.replace('.', ',') text = clean_html(text) @@ -96,7 +95,6 @@ class CrunchyrollIE(InfoExtractor): if not text: continue output += '%d\n%s --> %s\n%s\n\n' % (i, start, end, text) - i += 1 return output def _real_extract(self,url): @@ -187,4 +185,4 @@ class CrunchyrollIE(InfoExtractor): 'upload_date': video_upload_date, 'subtitles': subtitles, 'formats': formats, - }
\ No newline at end of file + } diff --git a/youtube_dl/extractor/mit.py b/youtube_dl/extractor/mit.py index 7c40cb8bd..f1db3744f 100644 --- a/youtube_dl/extractor/mit.py +++ b/youtube_dl/extractor/mit.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals + import re import json @@ -10,16 +12,17 @@ from ..utils import ( class TechTVMITIE(InfoExtractor): - IE_NAME = u'techtv.mit.edu' + IE_NAME = 'techtv.mit.edu' _VALID_URL = r'https?://techtv\.mit\.edu/(videos|embeds)/(?P<id>\d+)' _TEST = { - u'url': u'http://techtv.mit.edu/videos/25418-mit-dna-learning-center-set', - u'file': u'25418.mp4', - u'md5': u'1f8cb3e170d41fd74add04d3c9330e5f', - u'info_dict': { - u'title': u'MIT DNA Learning Center Set', - u'description': u'md5:82313335e8a8a3f243351ba55bc1b474', + 'url': 'http://techtv.mit.edu/videos/25418-mit-dna-learning-center-set', + 'md5': '1f8cb3e170d41fd74add04d3c9330e5f', + 'info_dict': { + 'id': '25418', + 'ext': 'mp4', + 'title': 'MIT DNA Learning Center Set', + 'description': 'md5:82313335e8a8a3f243351ba55bc1b474', }, } @@ -28,12 +31,12 @@ class TechTVMITIE(InfoExtractor): video_id = mobj.group('id') raw_page = self._download_webpage( 'http://techtv.mit.edu/videos/%s' % video_id, video_id) - clean_page = re.compile(u'<!--.*?-->', re.S).sub(u'', raw_page) + clean_page = re.compile(r'<!--.*?-->', re.S).sub('', raw_page) - base_url = self._search_regex(r'ipadUrl: \'(.+?cloudfront.net/)', - raw_page, u'base url') - formats_json = self._search_regex(r'bitrates: (\[.+?\])', raw_page, - u'video formats') + base_url = self._search_regex( + r'ipadUrl: \'(.+?cloudfront.net/)', raw_page, 'base url') + formats_json = self._search_regex( + r'bitrates: (\[.+?\])', raw_page, 'video formats') formats_mit = json.loads(formats_json) formats = [ { @@ -49,28 +52,32 @@ class TechTVMITIE(InfoExtractor): title = get_element_by_id('edit-title', clean_page) description = clean_html(get_element_by_id('edit-description', clean_page)) - thumbnail = self._search_regex(r'playlist:.*?url: \'(.+?)\'', - raw_page, u'thumbnail', flags=re.DOTALL) - - return {'id': video_id, - 'title': title, - 'formats': formats, - 'description': description, - 'thumbnail': thumbnail, - } + thumbnail = self._search_regex( + r'playlist:.*?url: \'(.+?)\'', + raw_page, 'thumbnail', flags=re.DOTALL) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'description': description, + 'thumbnail': thumbnail, + } class MITIE(TechTVMITIE): - IE_NAME = u'video.mit.edu' + IE_NAME = 'video.mit.edu' _VALID_URL = r'https?://video\.mit\.edu/watch/(?P<title>[^/]+)' _TEST = { - u'url': u'http://video.mit.edu/watch/the-government-is-profiling-you-13222/', - u'file': u'21783.mp4', - u'md5': u'7db01d5ccc1895fc5010e9c9e13648da', - u'info_dict': { - u'title': u'The Government is Profiling You', - u'description': u'md5:ad5795fe1e1623b73620dbfd47df9afd', + 'url': 'http://video.mit.edu/watch/the-government-is-profiling-you-13222/', + 'file': '.mp4', + 'md5': '7db01d5ccc1895fc5010e9c9e13648da', + 'info_dict': { + 'id': '21783', + 'ext': 'mp4', + 'title': 'The Government is Profiling You', + 'description': 'md5:ad5795fe1e1623b73620dbfd47df9afd', }, } @@ -78,9 +85,8 @@ class MITIE(TechTVMITIE): mobj = re.match(self._VALID_URL, url) page_title = mobj.group('title') webpage = self._download_webpage(url, page_title) - self.to_screen('%s: Extracting %s url' % (page_title, TechTVMITIE.IE_NAME)) - embed_url = self._search_regex(r'<iframe .*?src="(.+?)"', webpage, - u'embed url') + embed_url = self._search_regex( + r'<iframe .*?src="(.+?)"', webpage, 'embed url') return self.url_result(embed_url, ie='TechTVMIT') class OCWMITIE(InfoExtractor): diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index f3356db50..b8d363eb6 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -10,7 +10,7 @@ from ..utils import ( class MixcloudIE(InfoExtractor): - _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)' + _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([^/]+)/([^/]+)' IE_NAME = 'mixcloud' _TEST = { diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index ff750de3f..1a63ab56a 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -6,6 +6,30 @@ from .common import InfoExtractor from ..utils import find_xpath_attr, compat_str +class NBCIE(InfoExtractor): + _VALID_URL = r'http://www\.nbc\.com/[^/]+/video/[^/]+/(?P<id>n?\d+)' + + _TEST = { + 'url': 'http://www.nbc.com/chicago-fire/video/i-am-a-firefighter/2734188', + 'md5': '54d0fbc33e0b853a65d7b4de5c06d64e', + 'info_dict': { + 'id': 'u1RInQZRN7QJ', + 'ext': 'flv', + 'title': 'I Am a Firefighter', + 'description': 'An emergency puts Dawson\'sf irefighter skills to the ultimate test in this four-part digital series.', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + webpage = self._download_webpage(url, video_id) + theplatform_url = self._search_regex('class="video-player video-player-full" data-mpx-url="(.*?)"', webpage, 'theplatform url') + if theplatform_url.startswith('//'): + theplatform_url = 'http:' + theplatform_url + return self.url_result(theplatform_url) + + class NBCNewsIE(InfoExtractor): _VALID_URL = r'https?://www\.nbcnews\.com/video/.+?/(?P<id>\d+)' diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index d60702325..91f2453eb 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -13,7 +13,7 @@ _x = lambda p: xpath_with_ns(p, {'smil': 'http://www.w3.org/2005/SMIL21/Language class ThePlatformIE(InfoExtractor): _VALID_URL = r'''(?x) (?:https?://(?:link|player)\.theplatform\.com/[sp]/[^/]+/ - (?P<config>[^/\?]+/(?:swf|config)/select/)? + (?P<config>(?:[^/\?]+/(?:swf|config)|onsite)/select/)? |theplatform:)(?P<id>[^/\?&]+)''' _TEST = { @@ -54,10 +54,15 @@ class ThePlatformIE(InfoExtractor): f4m_node = body.find(_x('smil:seq/smil:video')) if f4m_node is not None: + f4m_url = f4m_node.attrib['src'] + if 'manifest.f4m?' not in f4m_url: + f4m_url += '?' + # the parameters are from syfy.com, other sites may use others, + # they also work for nbc.com + f4m_url += '&g=UXWGVKRWHFSP&hdcore=3.0.3' formats = [{ 'ext': 'flv', - # the parameters are from syfy.com, other sites may use others - 'url': f4m_node.attrib['src'] + '?g=UXWGVKRWHFSP&hdcore=3.0.3', + 'url': f4m_url, }] else: base_url = head.find(_x('smil:meta')).attrib['base'] @@ -95,9 +100,10 @@ class ThePlatformIE(InfoExtractor): if mobj.group('config'): config_url = url+ '&form=json' config_url = config_url.replace('swf/', 'config/') + config_url = config_url.replace('onsite/', 'onsite/config/') config_json = self._download_webpage(config_url, video_id, u'Downloading config') config = json.loads(config_json) - smil_url = config['releaseUrl'] + '&format=SMIL&formats=MPEG4' + smil_url = config['releaseUrl'] + '&format=SMIL&formats=MPEG4&manifest=f4m' else: smil_url = ('http://link.theplatform.com/s/dJ5BDC/{0}/meta.smil?' 'format=smil&mbr=true'.format(video_id)) |