diff options
-rwxr-xr-x | youtube_dl/__init__.py | 102 |
1 files changed, 78 insertions, 24 deletions
diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 96e2f0f89..fe0fe987e 100755 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -282,6 +282,14 @@ def _simplify_title(title): expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE) return expr.sub(u'_', title).strip(u'_') +def _orderedSet(iterable): + """ Remove all duplicates from the input iterable """ + res = [] + for el in iterable: + if el not in res: + res.append(el) + return res + class DownloadError(Exception): """Download Error exception. @@ -711,25 +719,6 @@ class FileDownloader(object): return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"' return None - def process_dict(self, info_dict): - """ Download and handle the extracted information. - For details on the specification of the various types of content, refer to the _process_* functions. """ - if info_dict['type'] == 'playlist': - self._process_playlist(info_dict) - elif info_dict['type'] == 'legacy-video': - self.process_info(info_dict) - else: - raise ValueError('Invalid item type') - - def _process_playlist(self, info_dict): - assert info_dict['type'] == 'playlist' - assert 'title' in info_dict - assert 'stitle' in info_dict - entries = info_dict['list'] - - for e in entries: - self.process_dict(e) - def process_info(self, info_dict): """Process a single dictionary returned by an InfoExtractor.""" @@ -3766,9 +3755,13 @@ class MixcloudIE(InfoExtractor): class StanfordOpenClassroomIE(InfoExtractor): """Information extractor for Stanford's Open ClassRoom""" - _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$' + _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$' IE_NAME = u'stanfordoc' + def report_download_webpage(self, objid): + """Report information extraction.""" + self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid)) + def report_extraction(self, video_id): """Report information extraction.""" self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)) @@ -3792,7 +3785,7 @@ class StanfordOpenClassroomIE(InfoExtractor): try: metaXml = urllib2.urlopen(xmlUrl).read() except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err)) + self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err)) return mdoc = xml.etree.ElementTree.fromstring(metaXml) try: @@ -3809,13 +3802,74 @@ class StanfordOpenClassroomIE(InfoExtractor): self._downloader.process_info(info) except UnavailableVideoError, err: self._downloader.trouble(u'\nERROR: unable to download video') - else: - print('TODO: Not yet implemented') - 1/0 + elif mobj.group('course'): # A course page + unescapeHTML = HTMLParser.HTMLParser().unescape + course = mobj.group('course') + info = { + 'id': _simplify_title(course), + 'type': 'playlist', + } + self.report_download_webpage(info['id']) + try: + coursepage = urllib2.urlopen(url).read() + except (urllib2.URLError, httplib.HTTPException, socket.error), err: + self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err)) + return + m = re.search('<h1>([^<]+)</h1>', coursepage) + if m: + info['title'] = unescapeHTML(m.group(1)) + else: + info['title'] = info['id'] + info['stitle'] = _simplify_title(info['title']) + + m = re.search('<description>([^<]+)</description>', coursepage) + if m: + info['description'] = unescapeHTML(m.group(1)) + + links = _orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage)) + info['list'] = [ + { + 'type': 'reference', + 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage), + } + for vpage in links] + + for entry in info['list']: + assert entry['type'] == 'reference' + self.extract(entry['url']) + else: # Root page + unescapeHTML = HTMLParser.HTMLParser().unescape + + info = { + 'id': 'Stanford OpenClassroom', + 'type': 'playlist', + } + + self.report_download_webpage(info['id']) + rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php' + try: + rootpage = urllib2.urlopen(rootURL).read() + except (urllib2.URLError, httplib.HTTPException, socket.error), err: + self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err)) + return + + info['title'] = info['id'] + info['stitle'] = _simplify_title(info['title']) + + links = _orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage)) + info['list'] = [ + { + 'type': 'reference', + 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage), + } + for cpage in links] + for entry in info['list']: + assert entry['type'] == 'reference' + self.extract(entry['url']) class PostProcessor(object): |