diff options
Diffstat (limited to 'youtube_dl/__init__.py')
-rwxr-xr-x | youtube_dl/__init__.py | 127 |
1 files changed, 127 insertions, 0 deletions
diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 042b85267..fe0fe987e 100755 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -282,6 +282,14 @@ def _simplify_title(title): expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE) return expr.sub(u'_', title).strip(u'_') +def _orderedSet(iterable): + """ Remove all duplicates from the input iterable """ + res = [] + for el in iterable: + if el not in res: + res.append(el) + return res + class DownloadError(Exception): """Download Error exception. @@ -3744,6 +3752,124 @@ class MixcloudIE(InfoExtractor): except UnavailableVideoError, err: self._downloader.trouble(u'ERROR: unable to download file') +class StanfordOpenClassroomIE(InfoExtractor): + """Information extractor for Stanford's Open ClassRoom""" + + _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$' + IE_NAME = u'stanfordoc' + + def report_download_webpage(self, objid): + """Report information extraction.""" + self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid)) + + def report_extraction(self, video_id): + """Report information extraction.""" + self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)) + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + if mobj is None: + self._downloader.trouble(u'ERROR: invalid URL: %s' % url) + return + + if mobj.group('course') and mobj.group('video'): # A specific video + course = mobj.group('course') + video = mobj.group('video') + info = { + 'id': _simplify_title(course + '_' + video), + } + + self.report_extraction(info['id']) + baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/' + xmlUrl = baseUrl + video + '.xml' + try: + metaXml = urllib2.urlopen(xmlUrl).read() + except (urllib2.URLError, httplib.HTTPException, socket.error), err: + self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err)) + return + mdoc = xml.etree.ElementTree.fromstring(metaXml) + try: + info['title'] = mdoc.findall('./title')[0].text + info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text + except IndexError: + self._downloader.trouble(u'\nERROR: Invalid metadata XML file') + return + info['stitle'] = _simplify_title(info['title']) + info['ext'] = info['url'].rpartition('.')[2] + info['format'] = info['ext'] + self._downloader.increment_downloads() + try: + self._downloader.process_info(info) + except UnavailableVideoError, err: + self._downloader.trouble(u'\nERROR: unable to download video') + elif mobj.group('course'): # A course page + unescapeHTML = HTMLParser.HTMLParser().unescape + + course = mobj.group('course') + info = { + 'id': _simplify_title(course), + 'type': 'playlist', + } + + self.report_download_webpage(info['id']) + try: + coursepage = urllib2.urlopen(url).read() + except (urllib2.URLError, httplib.HTTPException, socket.error), err: + self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err)) + return + + m = re.search('<h1>([^<]+)</h1>', coursepage) + if m: + info['title'] = unescapeHTML(m.group(1)) + else: + info['title'] = info['id'] + info['stitle'] = _simplify_title(info['title']) + + m = re.search('<description>([^<]+)</description>', coursepage) + if m: + info['description'] = unescapeHTML(m.group(1)) + + links = _orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage)) + info['list'] = [ + { + 'type': 'reference', + 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage), + } + for vpage in links] + + for entry in info['list']: + assert entry['type'] == 'reference' + self.extract(entry['url']) + else: # Root page + unescapeHTML = HTMLParser.HTMLParser().unescape + + info = { + 'id': 'Stanford OpenClassroom', + 'type': 'playlist', + } + + self.report_download_webpage(info['id']) + rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php' + try: + rootpage = urllib2.urlopen(rootURL).read() + except (urllib2.URLError, httplib.HTTPException, socket.error), err: + self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err)) + return + + info['title'] = info['id'] + info['stitle'] = _simplify_title(info['title']) + + links = _orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage)) + info['list'] = [ + { + 'type': 'reference', + 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage), + } + for cpage in links] + + for entry in info['list']: + assert entry['type'] == 'reference' + self.extract(entry['url']) class PostProcessor(object): @@ -4166,6 +4292,7 @@ def gen_extractors(): SoundcloudIE(), InfoQIE(), MixcloudIE(), + StanfordOpenClassroomIE(), GenericIE() ] |