From 67b22dd03686d9e360d87a7751de74b321d3f231 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Wed, 28 Aug 2013 12:51:22 +0200 Subject: Add extractors for video.mit.edu and techtv.mit.edu (closes #1327) video.mit.edu just embeds the videos from techtv.mit.edu --- youtube_dl/extractor/mit.py | 76 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100644 youtube_dl/extractor/mit.py (limited to 'youtube_dl/extractor/mit.py') diff --git a/youtube_dl/extractor/mit.py b/youtube_dl/extractor/mit.py new file mode 100644 index 000000000..d09d03e36 --- /dev/null +++ b/youtube_dl/extractor/mit.py @@ -0,0 +1,76 @@ +import re +import json + +from .common import InfoExtractor +from ..utils import ( + clean_html, + get_element_by_id, +) + + +class TechTVMITIE(InfoExtractor): + IE_NAME = u'techtv.mit.edu' + _VALID_URL = r'https?://techtv\.mit\.edu/(videos|embeds)/(?P\d+)' + + _TEST = { + u'url': u'http://techtv.mit.edu/videos/25418-mit-dna-learning-center-set', + u'file': u'25418.mp4', + u'md5': u'1f8cb3e170d41fd74add04d3c9330e5f', + u'info_dict': { + u'title': u'MIT DNA Learning Center Set', + u'description': u'md5:82313335e8a8a3f243351ba55bc1b474', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + webpage = self._download_webpage( + 'http://techtv.mit.edu/videos/%s' % video_id, video_id) + embed_page = self._download_webpage( + 'http://techtv.mit.edu/embeds/%s/' % video_id, video_id, + note=u'Downloading embed page') + + base_url = self._search_regex(r'ipadUrl: \'(.+?cloudfront.net/)', + embed_page, u'base url') + formats_json = self._search_regex(r'bitrates: (\[.+?\])', embed_page, + u'video formats') + formats = json.loads(formats_json) + formats = sorted(formats, key=lambda f: f['bitrate']) + + title = get_element_by_id('edit-title', webpage) + description = clean_html(get_element_by_id('edit-description', webpage)) + thumbnail = self._search_regex(r'playlist:.*?url: \'(.+?)\'', + embed_page, u'thumbnail', flags=re.DOTALL) + + return {'id': video_id, + 'title': title, + 'url': base_url + formats[-1]['url'].replace('mp4:', ''), + 'ext': 'mp4', + 'description': description, + 'thumbnail': thumbnail, + } + + +class MITIE(TechTVMITIE): + IE_NAME = u'video.mit.edu' + _VALID_URL = r'https?://video\.mit\.edu/watch/(?P[^/]+)' + + _TEST = { + u'url': u'http://video.mit.edu/watch/the-government-is-profiling-you-13222/', + u'file': u'21783.mp4', + u'md5': u'7db01d5ccc1895fc5010e9c9e13648da', + u'info_dict': { + u'title': u'The Government is Profiling You', + u'description': u'md5:ad5795fe1e1623b73620dbfd47df9afd', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + page_title = mobj.group('title') + webpage = self._download_webpage(url, page_title) + self.to_screen('%s: Extracting %s url' % (page_title, TechTVMITIE.IE_NAME)) + embed_url = self._search_regex(r'<iframe .*?src="(.+?)"', webpage, + u'embed url') + return self.url_result(embed_url, ie='TechTVMIT') -- cgit v1.2.3 From b5ba7b9dcfed5ded96c841a0ebbbf12132de838f Mon Sep 17 00:00:00 2001 From: Jeff Smith <whydoubt@yahoo.com> Date: Wed, 28 Aug 2013 14:00:59 -0500 Subject: Fix MIT extractor for Python 2.6 The HTML for the MIT page does not parse cleanly for Python 2.6 due to script tags within an actual script element. The offending piece is inside a comment block, so removing all such comment blocks fixes the parsing. --- youtube_dl/extractor/mit.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'youtube_dl/extractor/mit.py') diff --git a/youtube_dl/extractor/mit.py b/youtube_dl/extractor/mit.py index d09d03e36..52be9232f 100644 --- a/youtube_dl/extractor/mit.py +++ b/youtube_dl/extractor/mit.py @@ -25,23 +25,21 @@ class TechTVMITIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') - webpage = self._download_webpage( + raw_page = self._download_webpage( 'http://techtv.mit.edu/videos/%s' % video_id, video_id) - embed_page = self._download_webpage( - 'http://techtv.mit.edu/embeds/%s/' % video_id, video_id, - note=u'Downloading embed page') + clean_page = re.compile(u'<!--.*?-->', re.S).sub(u'', raw_page) base_url = self._search_regex(r'ipadUrl: \'(.+?cloudfront.net/)', - embed_page, u'base url') - formats_json = self._search_regex(r'bitrates: (\[.+?\])', embed_page, + raw_page, u'base url') + formats_json = self._search_regex(r'bitrates: (\[.+?\])', raw_page, u'video formats') formats = json.loads(formats_json) formats = sorted(formats, key=lambda f: f['bitrate']) - title = get_element_by_id('edit-title', webpage) - description = clean_html(get_element_by_id('edit-description', webpage)) + title = get_element_by_id('edit-title', clean_page) + description = clean_html(get_element_by_id('edit-description', clean_page)) thumbnail = self._search_regex(r'playlist:.*?url: \'(.+?)\'', - embed_page, u'thumbnail', flags=re.DOTALL) + raw_page, u'thumbnail', flags=re.DOTALL) return {'id': video_id, 'title': title, -- cgit v1.2.3