about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Filippo Valsorda - Campagna <filosottile.wiki@gmail.com>, 2012-04-10 16:31:46 +0200
committer: Filippo Valsorda - Campagna <filosottile.wiki@gmail.com>, 2012-04-10 16:31:46 +0200
commit: d6a96153471ae7e93693cb4dee46cbec1492af7b (patch)
tree: c505b750b65355f4903ad425b2a4ae46ab4903b1
parent: ceba827e9aab563ae7c7190fc236ec1aa358ee59 (diff)
download: youtube-dl-d6a96153471ae7e93693cb4dee46cbec1492af7b.tar.xz
standardized the use of unescapeHTML; added clean_html()
-rwxr-xr-x youtube_dl/__init__.py 32
1 files changed, 15 insertions, 17 deletions
diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py
index 5f874b72f..3fd5cadfd 100755
--- a/youtube_dl/__init__.py
+++ b/youtube_dl/__init__.py
@@ -242,6 +242,18 @@ def htmlentity_transform(matchobj):
return (u'&%s;' % entity)
+def clean_html(html):
+ """Turn an HTML snippet into readable text: <br> becomes a newline, other tags are removed, &...; sequences go through htmlentity_transform"""
+ # Literal newlines in the markup are flattened to spaces; only <br> / <br /> yields a line break
+ html = html.replace('\n', ' ')
+ html = re.sub('<\s*br\s*/?\s*>', '\n', html)
+ # Remove every remaining tag (non-greedy, so each <...> is matched on its own)
+ html = re.sub('<.*?>', '', html)
+ # Hand each &...; sequence to htmlentity_transform for replacement
+ html = re.sub(ur'(?u)&(.+?);', htmlentity_transform, html)
+ return html
+
+
def sanitize_title(utitle):
"""Sanitizes a video title so it could be used as part of a filename."""
utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
@@ -3343,8 +3355,6 @@ class EscapistIE(InfoExtractor):
self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
def _real_extract(self, url):
- htmlParser = HTMLParser.HTMLParser()
-
mobj = re.match(self._VALID_URL, url)
if mobj is None:
self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
@@ -3360,11 +3370,11 @@ class EscapistIE(InfoExtractor):
return
descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
- description = htmlParser.unescape(descMatch.group(1))
+ description = unescapeHTML(descMatch.group(1))
imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
- imgUrl = htmlParser.unescape(imgMatch.group(1))
+ imgUrl = unescapeHTML(imgMatch.group(1))
playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
- playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
+ playerUrl = unescapeHTML(playerUrlMatch.group(1))
configUrlMatch = re.search('config=(.*)$', playerUrl)
configUrl = urllib2.unquote(configUrlMatch.group(1))
@@ -3423,8 +3433,6 @@ class CollegeHumorIE(InfoExtractor):
self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
def _real_extract(self, url):
- htmlParser = HTMLParser.HTMLParser()
-
mobj = re.match(self._VALID_URL, url)
if mobj is None:
self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
@@ -3495,8 +3503,6 @@ class XVideosIE(InfoExtractor):
self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
def _real_extract(self, url):
- htmlParser = HTMLParser.HTMLParser()
-
mobj = re.match(self._VALID_URL, url)
if mobj is None:
self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
@@ -3585,8 +3591,6 @@ class SoundcloudIE(InfoExtractor):
self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
def _real_extract(self, url):
- htmlParser = HTMLParser.HTMLParser()
-
mobj = re.match(self._VALID_URL, url)
if mobj is None:
self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
@@ -3674,8 +3678,6 @@ class InfoQIE(InfoExtractor):
self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
def _real_extract(self, url):
- htmlParser = HTMLParser.HTMLParser()
-
mobj = re.match(self._VALID_URL, url)
if mobj is None:
self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
@@ -3909,8 +3911,6 @@ class StanfordOpenClassroomIE(InfoExtractor):
except UnavailableVideoError, err:
self._downloader.trouble(u'\nERROR: unable to download video')
elif mobj.group('course'): # A course page
- unescapeHTML = HTMLParser.HTMLParser().unescape
-
course = mobj.group('course')
info = {
'id': _simplify_title(course),
@@ -3947,8 +3947,6 @@ class StanfordOpenClassroomIE(InfoExtractor):
assert entry['type'] == 'reference'
self.extract(entry['url'])
else: # Root page
- unescapeHTML = HTMLParser.HTMLParser().unescape
-
info = {
'id': 'Stanford OpenClassroom',
'type': 'playlist',