about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Filippo Valsorda <filippo.valsorda@gmail.com> 2013-06-06 14:35:08 +0200
committer Filippo Valsorda <filippo.valsorda@gmail.com> 2013-06-06 14:35:08 +0200
commit 468e2e926b8d1f55d6ce67fee67e33a7fa6d8371 (patch)
tree 0a295849b44b3eb871f2c66ca1aa30126cb9f1f4
parent ac3e9394e76c0e8baeff1bc77eb67fa184ceb81c (diff)
download youtube-dl-468e2e926b8d1f55d6ce67fee67e33a7fa6d8371.tar.xz
implement fallbacks and defaults in _search_regex
-rwxr-xr-x youtube_dl/InfoExtractors.py 84
-rw-r--r-- youtube_dl/utils.py 3
2 files changed, 47 insertions, 40 deletions
diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py
index 4d13c17e4..fbf40f3ca 100755
--- a/youtube_dl/InfoExtractors.py
+++ b/youtube_dl/InfoExtractors.py
@@ -191,19 +191,37 @@ class InfoExtractor(object):
video_info['title'] = playlist_title
return video_info
- def _search_regex(self, pattern, text, name, fatal=True, flags=0):
- """Extract a field from some text based on regex"""
- mobj = re.search(pattern, text, flags)
- if mobj is None and fatal:
+ def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
+ """
+ Perform a regex search on the given string, using a single or a list of
+ patterns returning the first matching group.
+ In case of failure return a default value or raise a WARNING or a
+ ExtractorError, depending on fatal, specifying the field name.
+ """
+ if isinstance(pattern, (str, compat_str, compiled_regex_type)):
+ mobj = re.search(pattern, string, flags)
+ else:
+ for p in pattern:
+ mobj = re.search(p, string, flags)
+ if mobj: break
+
+ if sys.stderr.isatty() and os.name != 'nt':
+ _name = u'\033[0;34m%s\033[0m' % name
+ else:
+ _name = name
+
+ if mobj:
+ # return the first matching group
+ return next(g for g in mobj.groups() if g is not None)
+ elif default is not None:
+ return default
+ elif fatal:
raise ExtractorError(u'Unable to extract %s; '
- u'please report this issue on GitHub.' % name)
- elif mobj is None:
+ u'please report this issue on GitHub.' % _name)
+ else:
self._downloader.report_warning(u'unable to extract %s; '
- u'please report this issue on GitHub.' % name)
+ u'please report this issue on GitHub.' % _name)
return None
- else:
- # return the first matched group
- return next(g for g in mobj.groups() if g is not None)
class SearchInfoExtractor(InfoExtractor):
"""
@@ -2820,12 +2838,8 @@ class StanfordOpenClassroomIE(InfoExtractor):
note='Downloading course info page',
errnote='Unable to download course info page')
- # TODO: implement default_value in search_regex
- m = re.search('<h1>([^<]+)</h1>', coursepage)
- if m:
- info['title'] = unescapeHTML(m.group(1))
- else:
- info['title'] = info['id']
+ info['title'] = self._search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
+ info['title'] = unescapeHTML(info['title'])
info['description'] = self._search_regex('<description>([^<]+)</description>',
coursepage, u'description', fatal=False)
@@ -3108,12 +3122,8 @@ class GooglePlusIE(InfoExtractor):
# Extract title
# Get the first line for title
- # TODO: implement default_value in search_regex
- video_title = u'NA'
- pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
- mobj = re.search(pattern, webpage)
- if mobj:
- video_title = mobj.group(1)
+ video_title = self._search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
+ webpage, 'title', default=u'NA')
# Step 2, Stimulate clicking the image box to launch video
video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
@@ -3167,23 +3177,21 @@ class NBAIE(InfoExtractor):
video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
- # TODO: implement default_value in search_regex
- def _findProp(rexp, default=None):
- m = re.search(rexp, webpage)
- if m:
- return unescapeHTML(m.group(1))
- else:
- return default
-
shortened_video_id = video_id.rpartition('/')[2]
- title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
+ title = self._search_regex(r'<meta property="og:title" content="(.*?)"',
+ webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')
+
+ uploader_date = self._search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)
+
+ description = self._search_regex(r'<div class="description">(.*?)</h1>', webpage, 'description', fatal=False)
+
info = {
'id': shortened_video_id,
'url': video_url,
'ext': 'mp4',
'title': title,
- 'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
- 'description': _findProp(r'<div class="description">(.*?)</h1>'),
+ 'uploader_date': uploader_date,
+ 'description': description,
}
return [info]
@@ -3335,13 +3343,9 @@ class FunnyOrDieIE(InfoExtractor):
webpage, u'video URL', flags=re.DOTALL)
video_url = unescapeHTML(video_url)
- # TODO: implement fallbacks in regex_search
- m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
- if not m:
- m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
- if not m:
- raise ExtractorError(u'Cannot find video title')
- title = clean_html(m.group('title'))
+ title = self._search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
+ r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)
+ title = clean_html(title)
video_description = self._search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
webpage, u'description', flags=re.DOTALL)
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index 63d9d0ae5..3a8dcf4d3 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -154,6 +154,9 @@ def compat_ord(c):
if type(c) is int: return c
else: return ord(c)
+# This is not clearly defined otherwise
+compiled_regex_type = type(re.compile(''))
+
std_headers = {
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0',
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',