diff options
author | Filippo Valsorda - Campagna <filosottile.wiki@gmail.com> | 2012-04-10 18:21:00 +0200 |
---|---|---|
committer | Filippo Valsorda - Campagna <filosottile.wiki@gmail.com> | 2012-04-10 18:21:00 +0200 |
commit | c6f45d431482503e86f72a6f4f63d1090a312b28 (patch) | |
tree | a4e272101eb3e9a6256b92e4e233190ab9fbbad5 /youtube_dl | |
parent | d6a96153471ae7e93693cb4dee46cbec1492af7b (diff) |
removed dependency from lxml: added IDParser
Diffstat (limited to 'youtube_dl')
-rwxr-xr-x | youtube_dl/__init__.py | 100 |
1 files changed, 70 insertions, 30 deletions
diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 3fd5cadfd..752d762eb 100755 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -68,11 +68,6 @@ except ImportError: from cgi import parse_qs try: - import lxml.etree -except ImportError: - pass # Handled below - -try: import xml.etree.ElementTree except ImportError: # Python<2.5: Not officially supported, but let it slip warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.') @@ -198,6 +193,69 @@ except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/tr raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')') return res + +class IDParser(HTMLParser.HTMLParser): + """Modified HTMLParser that isolates a tag with the specified id""" + def __init__(self, id): + self.id = id + self.result = None + self.started = False + self.depth = {} + self.html = None + self.watch_startpos = False + HTMLParser.HTMLParser.__init__(self) + + def loads(self, html): + self.html = html + self.feed(html) + self.close() + + def handle_starttag(self, tag, attrs): + attrs = dict(attrs) + if self.started: + self.find_startpos(None) + if 'id' in attrs and attrs['id'] == self.id: + self.result = [tag] + self.started = True + self.watch_startpos = True + if self.started: + if not tag in self.depth: self.depth[tag] = 0 + self.depth[tag] += 1 + + def handle_endtag(self, tag): + if self.started: + if tag in self.depth: self.depth[tag] -= 1 + if self.depth[self.result[0]] == 0: + self.started = False + self.result.append(self.getpos()) + + def find_startpos(self, x): + """Needed to put the start position of the result (self.result[1]) + after the opening tag with the requested id""" + if self.watch_startpos: + self.watch_startpos = False + self.result.append(self.getpos()) + handle_entityref = handle_charref = handle_data = handle_comment = \ + handle_decl = handle_pi = unknown_decl = find_startpos + + def get_result(self): + if self.result == None: return None + if len(self.result) != 3: return None + lines = self.html.split('\n') + lines = lines[self.result[1][0]-1:self.result[2][0]] + lines[0] = lines[0][self.result[1][1]:] + if len(lines) == 1: + lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]] + lines[-1] = lines[-1][:self.result[2][1]] + return '\n'.join(lines).strip() + +def get_element_by_id(id, html): + """Return the content of the tag with the specified id in the passed HTML document""" + parser = IDParser(id) + parser.loads(html) + return parser.get_result() + + def preferredencoding(): """Get preferred encoding. @@ -246,7 +304,7 @@ def clean_html(html): """Clean an HTML snippet into a readable string""" # Newline vs <br /> html = html.replace('\n', ' ') - html = re.sub('<\s*br\s*/?\s*>', '\n', html) + html = re.sub('\s*<\s*br\s*/?\s*>\s*', '\n', html) # Strip html tags html = re.sub('<.*?>', '', html) # Replace html entities @@ -1432,18 +1490,9 @@ class YoutubeIE(InfoExtractor): pass # description - try: - lxml.etree - except NameError: - video_description = u'No description available.' - mobj = re.search(r'<meta name="description" content="(.*?)">', video_webpage) - if mobj is not None: - video_description = mobj.group(1).decode('utf-8') - else: - html_parser = lxml.etree.HTMLParser(encoding='utf-8') - vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser) - video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()')) - # TODO use another parser + video_description = get_element_by_id("eow-description", video_webpage) + if video_description: video_description = clean_html(video_description.decode('utf8')) + else: video_description = '' # closed captions video_subtitles = None @@ -2177,18 +2226,9 @@ class VimeoIE(InfoExtractor): video_thumbnail = config["video"]["thumbnail"] # Extract video description - try: - lxml.etree - except NameError: - video_description = u'No description available.' - mobj = re.search(r'<meta name="description" content="(.*?)" />', webpage, re.MULTILINE) - if mobj is not None: - video_description = mobj.group(1) - else: - html_parser = lxml.etree.HTMLParser() - vwebpage_doc = lxml.etree.parse(StringIO.StringIO(webpage), html_parser) - video_description = u''.join(vwebpage_doc.xpath('id("description")//text()')).strip() - # TODO use another parser + video_description = get_element_by_id("description", webpage) + if video_description: video_description = clean_html(video_description.decode('utf8')) + else: video_description = '' # Extract upload date video_upload_date = u'NA' |