diff options
| -rwxr-xr-x | youtube_dl/__init__.py | 127 | 
1 files changed, 127 insertions, 0 deletions
| diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 042b85267..fe0fe987e 100755 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -282,6 +282,14 @@ def _simplify_title(title):  	expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE)  	return expr.sub(u'_', title).strip(u'_') +def _orderedSet(iterable): +	""" Remove all duplicates from the input iterable """ +	res = [] +	for el in iterable: +		if el not in res: +			res.append(el) +	return res +  class DownloadError(Exception):  	"""Download Error exception. @@ -3744,6 +3752,124 @@ class MixcloudIE(InfoExtractor):  		except UnavailableVideoError, err:  			self._downloader.trouble(u'ERROR: unable to download file') +class StanfordOpenClassroomIE(InfoExtractor): +	"""Information extractor for Stanford's Open ClassRoom""" + +	_VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$' +	IE_NAME = u'stanfordoc' + +	def report_download_webpage(self, objid): +		"""Report information extraction.""" +		self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid)) + +	def report_extraction(self, video_id): +		"""Report information extraction.""" +		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)) + +	def _real_extract(self, url): +		mobj = re.match(self._VALID_URL, url) +		if mobj is None: +			self._downloader.trouble(u'ERROR: invalid URL: %s' % url) +			return + +		if mobj.group('course') and mobj.group('video'): # A specific video +			course = mobj.group('course') +			video = mobj.group('video') +			info = { +				'id': _simplify_title(course + '_' + video), +			} +	 +			self.report_extraction(info['id']) +			baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/' +			xmlUrl = baseUrl + video + '.xml' +			try: +				metaXml = urllib2.urlopen(xmlUrl).read() +			except (urllib2.URLError, httplib.HTTPException, socket.error), err: +				self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err)) +				return +			mdoc = xml.etree.ElementTree.fromstring(metaXml) +			try: +				info['title'] = mdoc.findall('./title')[0].text +				info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text +			except IndexError: +				self._downloader.trouble(u'\nERROR: Invalid metadata XML file') +				return +			info['stitle'] = _simplify_title(info['title']) +			info['ext'] = info['url'].rpartition('.')[2] +			info['format'] = info['ext'] +			self._downloader.increment_downloads() +			try: +				self._downloader.process_info(info) +			except UnavailableVideoError, err: +				self._downloader.trouble(u'\nERROR: unable to download video') +		elif mobj.group('course'): # A course page +			unescapeHTML = HTMLParser.HTMLParser().unescape + +			course = mobj.group('course') +			info = { +				'id': _simplify_title(course), +				'type': 'playlist', +			} + +			self.report_download_webpage(info['id']) +			try: +				coursepage = urllib2.urlopen(url).read() +			except (urllib2.URLError, httplib.HTTPException, socket.error), err: +				self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err)) +				return + +			m = re.search('<h1>([^<]+)</h1>', coursepage) +			if m: +				info['title'] = unescapeHTML(m.group(1)) +			else: +				info['title'] = info['id'] +			info['stitle'] = _simplify_title(info['title']) + +			m = re.search('<description>([^<]+)</description>', coursepage) +			if m: +				info['description'] = unescapeHTML(m.group(1)) + +			links = _orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage)) +			info['list'] = [ +				{ +					'type': 'reference', +					'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage), +				} +					for vpage in links] + +			for entry in info['list']: +				assert entry['type'] == 'reference' +				self.extract(entry['url']) +		else: # Root page +			unescapeHTML = HTMLParser.HTMLParser().unescape + +			info = { +				'id': 'Stanford OpenClassroom', +				'type': 'playlist', +			} + +			self.report_download_webpage(info['id']) +			rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php' +			try: +				rootpage = urllib2.urlopen(rootURL).read() +			except (urllib2.URLError, httplib.HTTPException, socket.error), err: +				self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err)) +				return + +			info['title'] = info['id'] +			info['stitle'] = _simplify_title(info['title']) + +			links = _orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage)) +			info['list'] = [ +				{ +					'type': 'reference', +					'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage), +				} +					for cpage in links] + +			for entry in info['list']: +				assert entry['type'] == 'reference' +				self.extract(entry['url'])  class PostProcessor(object): @@ -4166,6 +4292,7 @@ def gen_extractors():  		SoundcloudIE(),  		InfoQIE(),  		MixcloudIE(), +		StanfordOpenClassroomIE(),  		GenericIE()  	] | 
