aboutsummaryrefslogtreecommitdiff
path: root/youtube_dl/utils.py
diff options
context:
space:
mode:
author: Filippo Valsorda <filippo.valsorda@gmail.com> 2012-04-11 00:22:51 +0200
committer: Filippo Valsorda <filippo.valsorda@gmail.com> 2012-04-11 00:22:51 +0200
commit: 9e6dd238761e6628d7bd0b6bdb7a997604de3757 (patch)
tree: fdc8e4a1aaa0624f42b9f8325715e9957b88ae6e /youtube_dl/utils.py
parent: d11d05d07acdd11a93b02d750852dea4ae32be3b (diff)
parent: 7a8501e307ec1283aeacb03b471b5509b8c92854 (diff)
merged unescapeHTML branch; removed lxml dependency
Diffstat (limited to 'youtube_dl/utils.py')
-rw-r--r-- youtube_dl/utils.py 83
1 files changed, 80 insertions, 3 deletions
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index 737cca8e1..a19656000 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -74,9 +74,86 @@ def htmlentity_transform(matchobj):
return (u'&%s;' % entity)
class IDParser(HTMLParser.HTMLParser):
    """Modified HTMLParser that isolates a tag with the specified id.

    Feed a document with loads(), then call get_result() to obtain the
    markup found between the opening and the closing tag of the element
    whose ``id`` attribute equals the requested one.
    """

    def __init__(self, id):
        self.id = id
        # Grows to [tag, start_pos, end_pos]; both positions are the
        # (line, column) pairs returned by getpos().
        self.result = None
        # True while the parser is inside the requested element.
        self.started = False
        # Open-tag counter per tag name, maintained only while started.
        self.depth = {}
        self.html = None
        # True between the opening tag and the first event that follows it.
        self.watch_startpos = False
        HTMLParser.HTMLParser.__init__(self)

    def loads(self, html):
        """Parse the whole document *html* and keep it for get_result()."""
        self.html = html
        self.feed(html)
        self.close()

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            self.find_startpos(None)
        if 'id' in attrs and attrs['id'] == self.id:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            if tag not in self.depth:
                self.depth[tag] = 0
            self.depth[tag] += 1

    def handle_endtag(self, tag):
        if not self.started:
            return
        # The closing tag can be the very first event after the opening tag
        # (an element without content). Record the start position here too,
        # otherwise result never reaches three items and the element is
        # reported as missing.
        self.find_startpos(None)
        if tag in self.depth:
            self.depth[tag] -= 1
        # The element ends when every opened tag of its own name is closed.
        if self.depth[self.result[0]] == 0:
            self.started = False
            self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())

    # Every other parser event may be the first one after the opening tag.
    handle_entityref = handle_charref = handle_data = handle_comment = \
        handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the stripped content of the matched element.

        Returns None when no element with the requested id was seen or when
        its closing tag was never reached.
        """
        if self.result is None or len(self.result) != 3:
            return None
        (start_line, start_col), (end_line, end_col) = self.result[1:]
        # getpos() line numbers are 1-based, columns are 0-based.
        lines = self.html.split('\n')[start_line - 1:end_line]
        if len(lines) == 1:
            return lines[0][start_col:end_col].strip()
        lines[0] = lines[0][start_col:]
        lines[-1] = lines[-1][:end_col]
        return '\n'.join(lines).strip()
+
def get_element_by_id(id, html):
    """Return the content of the tag with the specified id in the passed HTML document.

    The result is whatever IDParser.get_result() yields after parsing;
    a parse error stops the parsing but does not propagate.
    """
    id_parser = IDParser(id)
    try:
        id_parser.loads(html)
    except HTMLParser.HTMLParseError:
        # Best effort: use whatever was collected before the error.
        pass
    content = id_parser.get_result()
    return content
+
+
def clean_html(html):
    """Clean an HTML snippet into a readable string.

    Literal newlines become spaces, every <br> (together with the
    whitespace around it) becomes a newline, all remaining tags are
    removed and the text is finally passed through unescapeHTML().
    """
    # Newline vs <br />: only <br /> marks a line break in the output.
    html = html.replace('\n', ' ')
    # Raw strings keep the regex escapes (\s) away from the string parser.
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    # Strip html tags
    html = re.sub(r'<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html
+
+
def sanitize_title(utitle):
    """Sanitizes a video title so it could be used as part of a filename.

    The title is passed through unescapeHTML() and every occurrence of the
    OS path separator is replaced by '%'.
    """
    unescaped = unescapeHTML(utitle)
    separator = unicode(os.sep)
    return unescaped.replace(separator, u'%')
@@ -133,8 +210,8 @@ def unescapeHTML(s):
"""
assert type(s) == type(u'')
- htmlParser = HTMLParser.HTMLParser()
- return htmlParser.unescape(s)
+ result = re.sub(ur'(?u)&(.+?);', htmlentity_transform, s)
+ return result
def encodeFilename(s):
"""