diff options
| -rwxr-xr-x | youtube-dl | 131 | ||||
| -rwxr-xr-x | youtube_dl/__init__.py | 100 | 
2 files changed, 155 insertions, 76 deletions
diff --git a/youtube-dl b/youtube-dl index 5224611d2..752d762eb 100755 --- a/youtube-dl +++ b/youtube-dl @@ -15,6 +15,7 @@ __authors__  = (  	'Kevin Ngo',  	'Ori Avtalion',  	'shizeeg', +	'Filippo Valsorda',  	)  __license__ = 'Public Domain' @@ -67,11 +68,6 @@ except ImportError:  	from cgi import parse_qs  try: -	import lxml.etree -except ImportError: -	pass # Handled below - -try:  	import xml.etree.ElementTree  except ImportError: # Python<2.5: Not officially supported, but let it slip  	warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.') @@ -197,6 +193,69 @@ except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/tr  				raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')  			return res + +class IDParser(HTMLParser.HTMLParser): +	"""Modified HTMLParser that isolates a tag with the specified id""" +	def __init__(self, id): +		self.id = id +		self.result = None +		self.started = False +		self.depth = {} +		self.html = None +		self.watch_startpos = False +		HTMLParser.HTMLParser.__init__(self) + +	def loads(self, html): +		self.html = html +		self.feed(html) +		self.close() + +	def handle_starttag(self, tag, attrs): +		attrs = dict(attrs) +		if self.started: +			self.find_startpos(None) +		if 'id' in attrs and attrs['id'] == self.id: +			self.result = [tag] +			self.started = True +			self.watch_startpos = True +		if self.started: +			if not tag in self.depth: self.depth[tag] = 0 +			self.depth[tag] += 1 + +	def handle_endtag(self, tag): +		if self.started: +			if tag in self.depth: self.depth[tag] -= 1 +			if self.depth[self.result[0]] == 0: +				self.started = False +				self.result.append(self.getpos()) + +	def find_startpos(self, x): +		"""Needed to put the start position of the result (self.result[1]) +		after the opening tag with the requested id""" +		if self.watch_startpos: +			self.watch_startpos = False +			self.result.append(self.getpos()) +	handle_entityref = handle_charref = handle_data = handle_comment = \ +	handle_decl = handle_pi = unknown_decl = find_startpos + +	def get_result(self): +		if self.result == None: return None +		if len(self.result) != 3: return None +		lines = self.html.split('\n') +		lines = lines[self.result[1][0]-1:self.result[2][0]] +		lines[0] = lines[0][self.result[1][1]:] +		if len(lines) == 1: +			lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]] +		lines[-1] = lines[-1][:self.result[2][1]] +		return '\n'.join(lines).strip() + +def get_element_by_id(id, html): +	"""Return the content of the tag with the specified id in the passed HTML document""" +	parser = IDParser(id) +	parser.loads(html) +	return parser.get_result() + +  def preferredencoding():  	"""Get preferred encoding. @@ -241,6 +300,18 @@ def htmlentity_transform(matchobj):  	return (u'&%s;' % entity) +def clean_html(html): +	"""Clean an HTML snippet into a readable string""" +	# Newline vs <br /> +	html = html.replace('\n', ' ') +	html = re.sub('\s*<\s*br\s*/?\s*>\s*', '\n', html) +	# Strip html tags +	html = re.sub('<.*?>', '', html) +	# Replace html entities +	html = re.sub(ur'(?u)&(.+?);', htmlentity_transform, html) +	return html + +  def sanitize_title(utitle):  	"""Sanitizes a video title so it could be used as part of a filename."""  	utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle) @@ -1419,18 +1490,9 @@ class YoutubeIE(InfoExtractor):  					pass  		# description -		try: -			lxml.etree -		except NameError: -			video_description = u'No description available.' -			mobj = re.search(r'<meta name="description" content="(.*?)">', video_webpage) -			if mobj is not None: -				video_description = mobj.group(1).decode('utf-8') -		else: -			html_parser = lxml.etree.HTMLParser(encoding='utf-8') -			vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser) -			video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()')) -			# TODO use another parser +		video_description = get_element_by_id("eow-description", video_webpage) +		if video_description: video_description = clean_html(video_description.decode('utf8')) +		else: video_description = ''  		# closed captions  		video_subtitles = None @@ -2164,18 +2226,9 @@ class VimeoIE(InfoExtractor):  		video_thumbnail = config["video"]["thumbnail"]  		# Extract video description -		try: -			lxml.etree -		except NameError: -			video_description = u'No description available.' -			mobj = re.search(r'<meta name="description" content="(.*?)" />', webpage, re.MULTILINE) -			if mobj is not None: -				video_description = mobj.group(1) -		else: -			html_parser = lxml.etree.HTMLParser() -			vwebpage_doc = lxml.etree.parse(StringIO.StringIO(webpage), html_parser) -			video_description = u''.join(vwebpage_doc.xpath('id("description")//text()')).strip() -			# TODO use another parser +		video_description = get_element_by_id("description", webpage) +		if video_description: video_description = clean_html(video_description.decode('utf8')) +		else: video_description = ''  		# Extract upload date  		video_upload_date = u'NA' @@ -3342,8 +3395,6 @@ class EscapistIE(InfoExtractor):  		self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)  	def _real_extract(self, url): -		htmlParser = HTMLParser.HTMLParser() -  		mobj = re.match(self._VALID_URL, url)  		if mobj is None:  			self._downloader.trouble(u'ERROR: invalid URL: %s' % url) @@ -3359,11 +3410,11 @@ class EscapistIE(InfoExtractor):  			return  		descMatch = re.search('<meta name="description" content="([^"]*)"', webPage) -		description = htmlParser.unescape(descMatch.group(1)) +		description = unescapeHTML(descMatch.group(1))  		imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage) -		imgUrl = htmlParser.unescape(imgMatch.group(1)) +		imgUrl = unescapeHTML(imgMatch.group(1))  		playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage) -		playerUrl = htmlParser.unescape(playerUrlMatch.group(1)) +		playerUrl = unescapeHTML(playerUrlMatch.group(1))  		configUrlMatch = re.search('config=(.*)$', playerUrl)  		configUrl = urllib2.unquote(configUrlMatch.group(1)) @@ -3422,8 +3473,6 @@ class CollegeHumorIE(InfoExtractor):  		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))  	def _real_extract(self, url): -		htmlParser = HTMLParser.HTMLParser() -  		mobj = re.match(self._VALID_URL, url)  		if mobj is None:  			self._downloader.trouble(u'ERROR: invalid URL: %s' % url) @@ -3494,8 +3543,6 @@ class XVideosIE(InfoExtractor):  		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))  	def _real_extract(self, url): -		htmlParser = HTMLParser.HTMLParser() -  		mobj = re.match(self._VALID_URL, url)  		if mobj is None:  			self._downloader.trouble(u'ERROR: invalid URL: %s' % url) @@ -3584,8 +3631,6 @@ class SoundcloudIE(InfoExtractor):  		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))  	def _real_extract(self, url): -		htmlParser = HTMLParser.HTMLParser() -  		mobj = re.match(self._VALID_URL, url)  		if mobj is None:  			self._downloader.trouble(u'ERROR: invalid URL: %s' % url) @@ -3673,8 +3718,6 @@ class InfoQIE(InfoExtractor):  		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))  	def _real_extract(self, url): -		htmlParser = HTMLParser.HTMLParser() -  		mobj = re.match(self._VALID_URL, url)  		if mobj is None:  			self._downloader.trouble(u'ERROR: invalid URL: %s' % url) @@ -3908,8 +3951,6 @@ class StanfordOpenClassroomIE(InfoExtractor):  			except UnavailableVideoError, err:  				self._downloader.trouble(u'\nERROR: unable to download video')  		elif mobj.group('course'): # A course page -			unescapeHTML = HTMLParser.HTMLParser().unescape -  			course = mobj.group('course')  			info = {  				'id': _simplify_title(course), @@ -3946,8 +3987,6 @@ class StanfordOpenClassroomIE(InfoExtractor):  				assert entry['type'] == 'reference'  				self.extract(entry['url'])  		else: # Root page -			unescapeHTML = HTMLParser.HTMLParser().unescape -  			info = {  				'id': 'Stanford OpenClassroom',  				'type': 'playlist', diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 3fd5cadfd..752d762eb 100755 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -68,11 +68,6 @@ except ImportError:  	from cgi import parse_qs  try: -	import lxml.etree -except ImportError: -	pass # Handled below - -try:  	import xml.etree.ElementTree  except ImportError: # Python<2.5: Not officially supported, but let it slip  	warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.') @@ -198,6 +193,69 @@ except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/tr  				raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')  			return res + +class IDParser(HTMLParser.HTMLParser): +	"""Modified HTMLParser that isolates a tag with the specified id""" +	def __init__(self, id): +		self.id = id +		self.result = None +		self.started = False +		self.depth = {} +		self.html = None +		self.watch_startpos = False +		HTMLParser.HTMLParser.__init__(self) + +	def loads(self, html): +		self.html = html +		self.feed(html) +		self.close() + +	def handle_starttag(self, tag, attrs): +		attrs = dict(attrs) +		if self.started: +			self.find_startpos(None) +		if 'id' in attrs and attrs['id'] == self.id: +			self.result = [tag] +			self.started = True +			self.watch_startpos = True +		if self.started: +			if not tag in self.depth: self.depth[tag] = 0 +			self.depth[tag] += 1 + +	def handle_endtag(self, tag): +		if self.started: +			if tag in self.depth: self.depth[tag] -= 1 +			if self.depth[self.result[0]] == 0: +				self.started = False +				self.result.append(self.getpos()) + +	def find_startpos(self, x): +		"""Needed to put the start position of the result (self.result[1]) +		after the opening tag with the requested id""" +		if self.watch_startpos: +			self.watch_startpos = False +			self.result.append(self.getpos()) +	handle_entityref = handle_charref = handle_data = handle_comment = \ +	handle_decl = handle_pi = unknown_decl = find_startpos + +	def get_result(self): +		if self.result == None: return None +		if len(self.result) != 3: return None +		lines = self.html.split('\n') +		lines = lines[self.result[1][0]-1:self.result[2][0]] +		lines[0] = lines[0][self.result[1][1]:] +		if len(lines) == 1: +			lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]] +		lines[-1] = lines[-1][:self.result[2][1]] +		return '\n'.join(lines).strip() + +def get_element_by_id(id, html): +	"""Return the content of the tag with the specified id in the passed HTML document""" +	parser = IDParser(id) +	parser.loads(html) +	return parser.get_result() + +  def preferredencoding():  	"""Get preferred encoding. @@ -246,7 +304,7 @@ def clean_html(html):  	"""Clean an HTML snippet into a readable string"""  	# Newline vs <br />  	html = html.replace('\n', ' ') -	html = re.sub('<\s*br\s*/?\s*>', '\n', html) +	html = re.sub('\s*<\s*br\s*/?\s*>\s*', '\n', html)  	# Strip html tags  	html = re.sub('<.*?>', '', html)  	# Replace html entities @@ -1432,18 +1490,9 @@ class YoutubeIE(InfoExtractor):  					pass  		# description -		try: -			lxml.etree -		except NameError: -			video_description = u'No description available.' -			mobj = re.search(r'<meta name="description" content="(.*?)">', video_webpage) -			if mobj is not None: -				video_description = mobj.group(1).decode('utf-8') -		else: -			html_parser = lxml.etree.HTMLParser(encoding='utf-8') -			vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser) -			video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()')) -			# TODO use another parser +		video_description = get_element_by_id("eow-description", video_webpage) +		if video_description: video_description = clean_html(video_description.decode('utf8')) +		else: video_description = ''  		# closed captions  		video_subtitles = None @@ -2177,18 +2226,9 @@ class VimeoIE(InfoExtractor):  		video_thumbnail = config["video"]["thumbnail"]  		# Extract video description -		try: -			lxml.etree -		except NameError: -			video_description = u'No description available.' -			mobj = re.search(r'<meta name="description" content="(.*?)" />', webpage, re.MULTILINE) -			if mobj is not None: -				video_description = mobj.group(1) -		else: -			html_parser = lxml.etree.HTMLParser() -			vwebpage_doc = lxml.etree.parse(StringIO.StringIO(webpage), html_parser) -			video_description = u''.join(vwebpage_doc.xpath('id("description")//text()')).strip() -			# TODO use another parser +		video_description = get_element_by_id("description", webpage) +		if video_description: video_description = clean_html(video_description.decode('utf8')) +		else: video_description = ''  		# Extract upload date  		video_upload_date = u'NA'  | 
