From ac3e9394e76c0e8baeff1bc77eb67fa184ceb81c Mon Sep 17 00:00:00 2001
From: Anna Bernardi <anna.bernardi.9@gmail.com>
Date: Thu, 6 Jun 2013 13:27:27 +0200
Subject: Implement search_regex from #847

---
 youtube_dl/InfoExtractors.py | 635 +++++++++++++++++--------------------------
 1 file changed, 253 insertions(+), 382 deletions(-)

(limited to 'youtube_dl')
diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py
index b40edf5fb..4d13c17e4 100755
--- a/youtube_dl/InfoExtractors.py
+++ b/youtube_dl/InfoExtractors.py
@@ -191,6 +191,20 @@ class InfoExtractor(object):
             video_info['title'] = playlist_title
         return video_info
 
+    def _search_regex(self, pattern, text, name, fatal=True, flags=0):
+        """Return the first group of pattern that matched in text, or the whole
+        match when no group took part. If nothing matches, raise ExtractorError
+        (mentioning name) when fatal is true; else report a warning and return None."""
+        mobj = re.search(pattern, text, flags)
+        if mobj is not None:
+            return next((g for g in mobj.groups() if g is not None), mobj.group(0))
+        if fatal:
+            raise ExtractorError(u'Unable to extract %s; '
+                u'please report this issue on GitHub.' % name)
+        self._downloader.report_warning(u'unable to extract %s; '
+            u'please report this issue on GitHub.' % name)
+        return None
+
 class SearchInfoExtractor(InfoExtractor):
     """
     Base class for paged search queries extractors.
@@ -964,18 +978,13 @@ class PhotobucketIE(InfoExtractor):
             }]
 
         # We try looking in other parts of the webpage
-        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract media URL')
-        mediaURL = compat_urllib_parse.unquote(mobj.group(1))
-
-        video_url = mediaURL
+        video_url = compat_urllib_parse.unquote(self._search_regex(  # file= query value needs URL-decoding
+            r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage, u'video URL'))
 
         mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
         if mobj is None:
             raise ExtractorError(u'Unable to extract title')
         video_title = mobj.group(1).decode('utf-8')
-
         video_uploader = mobj.group(2).decode('utf-8')
 
         return [{
@@ -1803,10 +1812,7 @@ class DepositFilesIE(InfoExtractor):
         file_extension = os.path.splitext(file_url)[1][1:]
 
         # Search for file title
-        mobj = re.search(r'<b title="(.*?)">', webpage)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract title')
-        file_title = mobj.group(1).decode('utf-8')
+        file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')
 
         return [{
             'id':       file_id.decode('utf-8'),
@@ -1900,10 +1906,9 @@ class FacebookIE(InfoExtractor):
         video_duration = int(video_data['video_duration'])
         thumbnail = video_data['thumbnail_src']
 
-        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
-        if not m:
-            raise ExtractorError(u'Cannot find title in webpage')
-        video_title = unescapeHTML(m.group(1))
+        video_title = self._search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
+            webpage, u'title')
+        video_title = unescapeHTML(video_title)
 
         info = {
             'id': video_id,
@@ -2065,15 +2070,10 @@ class MyVideoIE(InfoExtractor):
             self.report_extraction(video_id)
             video_url = mobj.group(1) + '.flv'
 
-            mobj = re.search('<title>([^<]+)</title>', webpage)
-            if mobj is None:
-                raise ExtractorError(u'Unable to extract title')
-            video_title = mobj.group(1)
+            video_title = self._search_regex('<title>([^<]+)</title>',
+                webpage, u'title')
 
-            mobj = re.search('[.](.+?)$', video_url)
-            if mobj is None:
-                raise ExtractorError(u'Unable to extract extention')
-            video_ext = mobj.group(1)
+            video_ext = self._search_regex('[.](.+?)$', video_url, u'extension')
 
             return [{
                 'id':       video_id,
@@ -2121,25 +2121,23 @@ class MyVideoIE(InfoExtractor):
         # extracting infos
         self.report_extraction(video_id)
 
+        video_url = None
         mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
-        if mobj is None:
-            raise ExtractorError(u'unable to extract rtmpurl')
-        video_rtmpurl = compat_urllib_parse.unquote(mobj.group(1))
-        if 'myvideo2flash' in video_rtmpurl:
-            self._downloader.report_warning(u'forcing RTMPT ...')
-            video_rtmpurl = video_rtmpurl.replace('rtmpe://', 'rtmpt://')
-
-        # extract non rtmp videos
-        if (video_rtmpurl is None) or (video_rtmpurl == ''):
+        if mobj:
+            video_url = compat_urllib_parse.unquote(mobj.group(1))
+            if 'myvideo2flash' in video_url:
+                self._downloader.report_warning(u'forcing RTMPT ...')
+                video_url = video_url.replace('rtmpe://', 'rtmpt://')
+
+        if not video_url:
+            # extract non rtmp videos
             mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
             if mobj is None:
                 raise ExtractorError(u'unable to extract url')
-            video_rtmpurl = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))
+            video_url = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))
 
-        mobj = re.search('source=\'(.*?)\'', dec_data)
-        if mobj is None:
-            raise ExtractorError(u'unable to extract swfobj')
-        video_file     = compat_urllib_parse.unquote(mobj.group(1))
+        video_file = self._search_regex('source=\'(.*?)\'', dec_data, u'video file')
+        video_file = compat_urllib_parse.unquote(video_file)
 
         if not video_file.endswith('f4m'):
             ppath, prefix = video_file.split('.')
@@ -2151,20 +2149,16 @@ class MyVideoIE(InfoExtractor):
                 video_filepath + video_file
             ).replace('.f4m', '.m3u8')
 
-        mobj = re.search('swfobject.embedSWF\(\'(.+?)\'', webpage)
-        if mobj is None:
-            raise ExtractorError(u'unable to extract swfobj')
-        video_swfobj = compat_urllib_parse.unquote(mobj.group(1))
+        video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj')
+        video_swfobj = compat_urllib_parse.unquote(video_swfobj)
 
-        mobj = re.search("<h1(?: class='globalHd')?>(.*?)</h1>", webpage)
-        if mobj is None:
-            raise ExtractorError(u'unable to extract title')
-        video_title = mobj.group(1)
+        video_title = self._search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
+            webpage, u'title')
 
         return [{
             'id':                 video_id,
-            'url':                video_rtmpurl,
-            'tc_url':             video_rtmpurl,
+            'url':                video_url,
+            'tc_url':             video_url,
             'uploader':           None,
             'upload_date':        None,
             'title':              video_title,
@@ -2175,6 +2169,7 @@ class MyVideoIE(InfoExtractor):
             'player_url':         video_swfobj,
         }]
 
+
 class ComedyCentralIE(InfoExtractor):
     """Information extractor for The Daily Show and Colbert Report """
 
@@ -2357,16 +2352,22 @@ class EscapistIE(InfoExtractor):
         videoId = mobj.group('episode')
 
         self.report_extraction(showName)
-        webPage = self._download_webpage(url, showName)
+        webpage = self._download_webpage(url, showName)
+
+        videoDesc = self._search_regex('<meta name="description" content="([^"]*)"',
+            webpage, u'description', fatal=False)
+        if videoDesc: videoDesc = unescapeHTML(videoDesc)
 
-        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
-        description = unescapeHTML(descMatch.group(1))
-        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
-        imgUrl = unescapeHTML(imgMatch.group(1))
-        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
-        playerUrl = unescapeHTML(playerUrlMatch.group(1))
-        configUrlMatch = re.search('config=(.*)$', playerUrl)
-        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
+        imgUrl = self._search_regex('<meta property="og:image" content="([^"]*)"',
+            webpage, u'thumbnail', fatal=False)
+        if imgUrl: imgUrl = unescapeHTML(imgUrl)
+
+        playerUrl = self._search_regex('<meta property="og:video" content="([^"]*)"',
+            webpage, u'player url')
+        playerUrl = unescapeHTML(playerUrl)
+
+        configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
+        configUrl = compat_urllib_parse.unquote(configUrl)
 
         configJSON = self._download_webpage(configUrl, showName,
                                             u'Downloading configuration',
@@ -2391,7 +2392,7 @@ class EscapistIE(InfoExtractor):
             'title': showName,
             'ext': 'mp4',
             'thumbnail': imgUrl,
-            'description': description,
+            'description': videoDesc,
             'player_url': playerUrl,
         }
 
@@ -2476,26 +2477,17 @@ class XVideosIE(InfoExtractor):
 
         self.report_extraction(video_id)
 
-
         # Extract video URL
-        mobj = re.search(r'flv_url=(.+?)&', webpage)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract video url')
-        video_url = compat_urllib_parse.unquote(mobj.group(1))
-
+        video_url = compat_urllib_parse.unquote(self._search_regex(r'flv_url=(.+?)&',
+            webpage, u'video URL'))
 
         # Extract title
-        mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract video title')
-        video_title = mobj.group(1)
-
+        video_title = self._search_regex(r'<title>(.*?)\s+-\s+XVID',
+            webpage, u'title')
 
         # Extract video thumbnail
-        mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract video thumbnail')
-        video_thumbnail = mobj.group(0)
+        video_thumbnail = self._search_regex(r'(http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9.]+jpg)',
+            webpage, u'thumbnail', fatal=False)  # the group spans the whole URL, not just the file name
 
         info = {
             'id': video_id,
@@ -2652,16 +2644,12 @@ class InfoQIE(InfoExtractor):
         video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
 
         # Extract title
-        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract video title')
-        video_title = mobj.group(1)
+        video_title = self._search_regex(r'contentTitle = "(.*?)";',
+            webpage, u'title')
 
         # Extract description
-        video_description = u'No description available.'
-        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
-        if mobj is not None:
-            video_description = mobj.group(1)
+        video_description = self._search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
+            webpage, u'description', fatal=False)
 
         video_filename = video_url.split('/')[-1]
         video_id, extension = video_filename.split('.')
@@ -2832,15 +2820,16 @@ class StanfordOpenClassroomIE(InfoExtractor):
                                         note='Downloading course info page',
                                         errnote='Unable to download course info page')
 
+            # TODO: implement default_value in search_regex
             m = re.search('<h1>([^<]+)</h1>', coursepage)
             if m:
                 info['title'] = unescapeHTML(m.group(1))
             else:
                 info['title'] = info['id']
 
-            m = re.search('<description>([^<]+)</description>', coursepage)
-            if m:
-                info['description'] = unescapeHTML(m.group(1))
+            info['description'] = self._search_regex('<description>([^<]+)</description>',
+                coursepage, u'description', fatal=False)
+            if info['description']: info['description'] = unescapeHTML(info['description'])
 
             links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
             info['list'] = [
@@ -2901,25 +2890,19 @@ class MTVIE(InfoExtractor):
 
         webpage = self._download_webpage(url, video_id)
 
-        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract song name')
-        song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
-        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract performer')
-        performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
-        video_title = performer + ' - ' + song_name
+        song_name = self._search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
+            webpage, u'song name', fatal=False)
+        if song_name: song_name = unescapeHTML(song_name)
 
-        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
-        if mobj is None:
-            raise ExtractorError(u'Unable to mtvn_uri')
-        mtvn_uri = mobj.group(1)
+        video_title = self._search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
+            webpage, u'title')
+        video_title = unescapeHTML(video_title)
 
-        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract content id')
-        content_id = mobj.group(1)
+        mtvn_uri = self._search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
+            webpage, u'mtvn_uri')  # fatal: concatenated into videogen_url below
+
+        content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
+            webpage, u'content id')  # fatal: concatenated into videogen_url below
 
         videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
         self.report_extraction(video_id)
@@ -3067,20 +3050,15 @@ class XNXXIE(InfoExtractor):
         # Get webpage content
         webpage = self._download_webpage(url, video_id)
 
-        result = re.search(self.VIDEO_URL_RE, webpage)
-        if result is None:
-            raise ExtractorError(u'Unable to extract video url')
-        video_url = compat_urllib_parse.unquote(result.group(1))
+        video_url = self._search_regex(self.VIDEO_URL_RE,
+            webpage, u'video URL')
+        video_url = compat_urllib_parse.unquote(video_url)
 
-        result = re.search(self.VIDEO_TITLE_RE, webpage)
-        if result is None:
-            raise ExtractorError(u'Unable to extract video title')
-        video_title = result.group(1)
+        video_title = self._search_regex(self.VIDEO_TITLE_RE,
+            webpage, u'title')
 
-        result = re.search(self.VIDEO_THUMB_RE, webpage)
-        if result is None:
-            raise ExtractorError(u'Unable to extract video thumbnail')
-        video_thumbnail = result.group(1)
+        video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
+            webpage, u'thumbnail', fatal=False)
 
         return [{
             'id': video_id,
@@ -3100,26 +3078,6 @@ class GooglePlusIE(InfoExtractor):
     _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
     IE_NAME = u'plus.google'
 
-    def report_extract_entry(self, url):
-        """Report downloading extry"""
-        self.to_screen(u'Downloading entry: %s' % url)
-
-    def report_date(self, upload_date):
-        """Report downloading extry"""
-        self.to_screen(u'Entry date: %s' % upload_date)
-
-    def report_uploader(self, uploader):
-        """Report downloading extry"""
-        self.to_screen(u'Uploader: %s' % uploader)
-
-    def report_title(self, video_title):
-        """Report downloading extry"""
-        self.to_screen(u'Title: %s' % video_title)
-
-    def report_extract_vid_page(self, video_page):
-        """Report information extraction."""
-        self.to_screen(u'Extracting video page: %s' % video_page)
-
     def _real_extract(self, url):
         # Extract id from URL
         mobj = re.match(self._VALID_URL, url)
@@ -3132,47 +3090,35 @@ class GooglePlusIE(InfoExtractor):
         video_extension = 'flv'
 
         # Step 1, Retrieve post webpage to extract further information
-        self.report_extract_entry(post_url)
         webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')
 
+        self.report_extraction(video_id)
+
         # Extract update date
-        upload_date = None
-        pattern = 'title="Timestamp">(.*?)</a>'
-        mobj = re.search(pattern, webpage)
-        if mobj:
-            upload_date = mobj.group(1)
+        upload_date = self._search_regex('title="Timestamp">(.*?)</a>',
+            webpage, u'upload date', fatal=False)
+        if upload_date:
             # Convert timestring to a format suitable for filename
             upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
             upload_date = upload_date.strftime('%Y%m%d')
-        self.report_date(upload_date)
 
         # Extract uploader
-        uploader = None
-        pattern = r'rel\="author".*?>(.*?)</a>'
-        mobj = re.search(pattern, webpage)
-        if mobj:
-            uploader = mobj.group(1)
-        self.report_uploader(uploader)
+        uploader = self._search_regex(r'rel\="author".*?>(.*?)</a>',
+            webpage, u'uploader', fatal=False)
 
         # Extract title
         # Get the first line for title
+        # TODO: implement default_value in search_regex
         video_title = u'NA'
         pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
         mobj = re.search(pattern, webpage)
         if mobj:
             video_title = mobj.group(1)
-        self.report_title(video_title)
 
         # Step 2, Stimulate clicking the image box to launch video
-        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
-        mobj = re.search(pattern, webpage)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract video page URL')
-
-        video_page = mobj.group(1)
+        video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
+            webpage, u'video page URL')
         webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
-        self.report_extract_vid_page(video_page)
-
 
         # Extract video links on video page
         """Extract video links of all sizes"""
@@ -3220,6 +3166,8 @@ class NBAIE(InfoExtractor):
         webpage = self._download_webpage(url, video_id)
 
         video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
+
+        # TODO: implement default_value in search_regex
         def _findProp(rexp, default=None):
             m = re.search(rexp, webpage)
             if m:
@@ -3383,11 +3331,11 @@ class FunnyOrDieIE(InfoExtractor):
         video_id = mobj.group('id')
         webpage = self._download_webpage(url, video_id)
 
-        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
-        if not m:
-            raise ExtractorError(u'Unable to find video information')
-        video_url = unescapeHTML(m.group('url'))
+        video_url = self._search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
+            webpage, u'video URL', flags=re.DOTALL)
+        video_url = unescapeHTML(video_url)
 
+        # TODO: implement fallbacks in search_regex
         m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
         if not m:
             m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
@@ -3395,18 +3343,16 @@ class FunnyOrDieIE(InfoExtractor):
                 raise ExtractorError(u'Cannot find video title')
         title = clean_html(m.group('title'))
 
-        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
-        if m:
-            desc = unescapeHTML(m.group('desc'))
-        else:
-            desc = None
+        video_description = self._search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
+            webpage, u'description', fatal=False, flags=re.DOTALL)  # description is optional
+        if video_description: video_description = unescapeHTML(video_description)
 
         info = {
             'id': video_id,
             'url': video_url,
             'ext': 'mp4',
             'title': title,
-            'description': desc,
+            'description': video_description,
         }
         return [info]
 
@@ -3462,27 +3408,30 @@ class UstreamIE(InfoExtractor):
     def _real_extract(self, url):
         m = re.match(self._VALID_URL, url)
         video_id = m.group('videoID')
+
         video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
         webpage = self._download_webpage(url, video_id)
+
         self.report_extraction(video_id)
-        try:
-            m = re.search(r'data-title="(?P<title>.+)"',webpage)
-            title = m.group('title')
-            m = re.search(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
-                          webpage, re.DOTALL)
-            uploader = unescapeHTML(m.group('uploader').strip())
-            m = re.search(r'<link rel="image_src" href="(?P<thumb>.*?)"', webpage)
-            thumb = m.group('thumb')
-        except AttributeError:
-            raise ExtractorError(u'Unable to extract info')
+
+        video_title = self._search_regex(r'data-title="(?P<title>.+)"',
+            webpage, u'title')
+
+        uploader = self._search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
+            webpage, u'uploader', fatal=False, flags=re.DOTALL)
+        if uploader: uploader = unescapeHTML(uploader.strip())
+
+        thumbnail = self._search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
+            webpage, u'thumbnail', fatal=False)
+
         info = {
-                'id':video_id,
-                'url':video_url,
+                'id': video_id,
+                'url': video_url,
                 'ext': 'flv',
-                'title': title,
+                'title': video_title,
                 'uploader': uploader,
-                'thumbnail': thumb,
-                  }
+                'thumbnail': thumbnail,
+               }
         return info
 
 class WorldStarHipHopIE(InfoExtractor):
@@ -3490,45 +3439,36 @@ class WorldStarHipHopIE(InfoExtractor):
     IE_NAME = u'WorldStarHipHop'
 
     def _real_extract(self, url):
-        _src_url = r'so\.addVariable\("file","(.*?)"\)'
-
         m = re.match(self._VALID_URL, url)
         video_id = m.group('id')
 
-        webpage_src = self._download_webpage(url, video_id) 
+        webpage_src = self._download_webpage(url, video_id)
 
-        mobj = re.search(_src_url, webpage_src)
+        video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
+            webpage_src, u'video URL')
 
-        if mobj is not None:
-            video_url = mobj.group(1)
-            if 'mp4' in video_url:
-                ext = 'mp4'
-            else:
-                ext = 'flv'
+        if 'mp4' in video_url:
+            ext = 'mp4'
         else:
-            raise ExtractorError(u'Cannot find video url for %s' % video_id)
-
-        mobj = re.search(r"<title>(.*)</title>", webpage_src)
+            ext = 'flv'
 
-        if mobj is None:
-            raise ExtractorError(u'Cannot determine title')
-        title = mobj.group(1)
+        video_title = self._search_regex(r"<title>(.*)</title>",
+            webpage_src, u'title')
 
-        mobj = re.search(r'rel="image_src" href="(.*)" />', webpage_src)
         # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
-        if mobj is not None:
-            thumbnail = mobj.group(1)
-        else:
+        thumbnail = self._search_regex(r'rel="image_src" href="(.*)" />',
+            webpage_src, u'thumbnail', fatal=False)
+
+        if not thumbnail:
             _title = r"""candytitles.*>(.*)</span>"""
             mobj = re.search(_title, webpage_src)
             if mobj is not None:
-                title = mobj.group(1)
-            thumbnail = None
+                video_title = mobj.group(1)
 
         results = [{
                     'id': video_id,
                     'url' : video_url,
-                    'title' : title,
+                    'title' : video_title,
                     'thumbnail' : thumbnail,
                     'ext' : ext,
                     }]
@@ -3542,10 +3482,9 @@ class RBMARadioIE(InfoExtractor):
         video_id = m.group('videoID')
 
         webpage = self._download_webpage(url, video_id)
-        m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
-        if not m:
-            raise ExtractorError(u'Cannot find metadata')
-        json_data = m.group(1)
+
+        json_data = self._search_regex(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>',
+            webpage, u'json data')
 
         try:
             data = json.loads(json_data)
@@ -3592,7 +3531,6 @@ class YouPornIE(InfoExtractor):
         mobj = re.match(self._VALID_URL, url)
         if mobj is None:
             raise ExtractorError(u'Invalid URL: %s' % url)
-
         video_id = mobj.group('videoid')
 
         req = compat_urllib_request.Request(url)
@@ -3600,34 +3538,23 @@ class YouPornIE(InfoExtractor):
         webpage = self._download_webpage(req, video_id)
 
         # Get the video title
-        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
-        if result is None:
-            raise ExtractorError(u'Unable to extract video title')
-        video_title = result.group('title').strip()
+        video_title = self._search_regex(r'<h1.*?>(?P<title>.*)</h1>',
+            webpage, u'title').strip()
 
         # Get the video date
-        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
-        if result is None:
-            self._downloader.report_warning(u'unable to extract video date')
-            upload_date = None
-        else:
-            upload_date = unified_strdate(result.group('date').strip())
+        upload_date = self._search_regex(r'Date:</label>(?P<date>.*) </li>',
+            webpage, u'upload date', fatal=False)
+        if upload_date: upload_date = unified_strdate(upload_date.strip())
 
         # Get the video uploader
-        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
-        if result is None:
-            self._downloader.report_warning(u'unable to extract uploader')
-            video_uploader = None
-        else:
-            video_uploader = result.group('uploader').strip()
-            video_uploader = clean_html( video_uploader )
+        video_uploader = self._search_regex(r'Submitted:</label>(?P<uploader>.*)</li>',
+            webpage, u'uploader', fatal=False)
+        if video_uploader: video_uploader = clean_html(video_uploader.strip())
 
         # Get all of the formats available
         DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
-        result = re.search(DOWNLOAD_LIST_RE, webpage)
-        if result is None:
-            raise ExtractorError(u'Unable to extract download list')
-        download_list_html = result.group('download_list').strip()
+        download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
+            webpage, u'download list').strip()
 
         # Get all of the links from the page
         LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
@@ -3704,17 +3631,13 @@ class PornotubeIE(InfoExtractor):
 
         # Get the video URL
         VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
-        result = re.search(VIDEO_URL_RE, webpage)
-        if result is None:
-            raise ExtractorError(u'Unable to extract video url')
-        video_url = compat_urllib_parse.unquote(result.group('url'))
+        video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
+        video_url = compat_urllib_parse.unquote(video_url)
 
         #Get the uploaded date
         VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
-        result = re.search(VIDEO_UPLOADED_RE, webpage)
-        if result is None:
-            raise ExtractorError(u'Unable to extract video title')
-        upload_date = unified_strdate(result.group('date'))
+        upload_date = self._search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
+        if upload_date: upload_date = unified_strdate(upload_date)
 
         info = {'id': video_id,
                 'url': video_url,
@@ -3741,10 +3664,8 @@ class YouJizzIE(InfoExtractor):
         webpage = self._download_webpage(url, video_id)
 
         # Get the video title
-        result = re.search(r'<title>(?P<title>.*)</title>', webpage)
-        if result is None:
-            raise ExtractorError(u'ERROR: unable to extract video title')
-        video_title = result.group('title').strip()
+        video_title = self._search_regex(r'<title>(?P<title>.*)</title>',
+            webpage, u'title').strip()
 
         # Get the embed page
         result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
@@ -3757,10 +3678,8 @@ class YouJizzIE(InfoExtractor):
         webpage = self._download_webpage(embed_page_url, video_id)
 
         # Get the video URL
-        result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
-        if result is None:
-            raise ExtractorError(u'ERROR: unable to extract video url')
-        video_url = result.group('source')
+        video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
+            webpage, u'video URL')
 
         info = {'id': video_id,
                 'url': video_url,
@@ -3783,10 +3702,7 @@ class EightTracksIE(InfoExtractor):
 
         webpage = self._download_webpage(url, playlist_id)
 
-        m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
-        if not m:
-            raise ExtractorError(u'Cannot find trax information')
-        json_like = m.group(1)
+        json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
         data = json.loads(json_like)
 
         session = str(random.randint(0, 1000000000))
@@ -3822,18 +3738,24 @@ class KeekIE(InfoExtractor):
     def _real_extract(self, url):
         m = re.match(self._VALID_URL, url)
         video_id = m.group('videoID')
+
         video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
         thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
         webpage = self._download_webpage(url, video_id)
-        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
-        title = unescapeHTML(m.group('title'))
-        m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
-        uploader = clean_html(m.group('uploader'))
+
+        video_title = self._search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
+            webpage, u'title')
+        video_title = unescapeHTML(video_title)
+
+        uploader = self._search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
+            webpage, u'uploader', fatal=False)
+        if uploader: uploader = clean_html(uploader)
+
         info = {
                 'id': video_id,
                 'url': video_url,
                 'ext': 'mp4',
-                'title': title,
+                'title': video_title,
                 'thumbnail': thumbnail,
                 'uploader': uploader
         }
@@ -3980,10 +3902,10 @@ class SpiegelIE(InfoExtractor):
         video_id = m.group('videoID')
 
         webpage = self._download_webpage(url, video_id)
-        m = re.search(r'<div class="module-title">(.*?)</div>', webpage)
-        if not m:
-            raise ExtractorError(u'Cannot find title')
-        video_title = unescapeHTML(m.group(1))
+
+        video_title = self._search_regex(r'<div class="module-title">(.*?)</div>',
+            webpage, u'title')
+        video_title = unescapeHTML(video_title)
 
         xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
         xml_code = self._download_webpage(xml_url, video_id,
@@ -4019,35 +3941,27 @@ class LiveLeakIE(InfoExtractor):
 
         webpage = self._download_webpage(url, video_id)
 
-        m = re.search(r'file: "(.*?)",', webpage)
-        if not m:
-            raise ExtractorError(u'Unable to find video url')
-        video_url = m.group(1)
+        video_url = self._search_regex(r'file: "(.*?)",',
+            webpage, u'video URL')
 
-        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
-        if not m:
-            raise ExtractorError(u'Cannot find video title')
-        title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()
+        video_title = self._search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
+            webpage, u'title')
+        video_title = unescapeHTML(video_title).replace('LiveLeak.com -', '').strip()
 
-        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
-        if m:
-            desc = unescapeHTML(m.group('desc'))
-        else:
-            desc = None
+        video_description = self._search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
+            webpage, u'description', fatal=False)
+        if video_description: video_description = unescapeHTML(video_description)
 
-        m = re.search(r'By:.*?(\w+)</a>', webpage)
-        if m:
-            uploader = clean_html(m.group(1))
-        else:
-            uploader = None
+        video_uploader = self._search_regex(r'By:.*?(\w+)</a>',
+            webpage, u'uploader', fatal=False)
 
         info = {
             'id':  video_id,
             'url': video_url,
             'ext': 'mp4',
-            'title': title,
-            'description': desc,
-            'uploader': uploader
+            'title': video_title,
+            'description': video_description,
+            'uploader': video_uploader
         }
 
         return [info]
@@ -4105,23 +4019,24 @@ class TumblrIE(InfoExtractor):
         re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
         video = re.search(re_video, webpage)
         if video is None:
-            self.to_screen("No video found")
-            return []
+           raise ExtractorError(u'Unable to extract video')
         video_url = video.group('video_url')
         ext = video.group('ext')
 
-        re_thumb = r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22'  # We pick the first poster
-        thumb = re.search(re_thumb, webpage).group('thumb').replace('\\', '')
+        video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
+            webpage, u'thumbnail', fatal=False)  # We pick the first poster
+        if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')
 
         # The only place where you can get a title, it's not complete,
         # but searching in other places doesn't work for all videos
-        re_title = r'<title>(?P<title>.*?)</title>'
-        title = unescapeHTML(re.search(re_title, webpage, re.DOTALL).group('title'))
+        video_title = self._search_regex(r'<title>(?P<title>.*?)</title>',
+            webpage, u'title', flags=re.DOTALL)
+        video_title = unescapeHTML(video_title)
 
         return [{'id': video_id,
                  'url': video_url,
-                 'title': title,
-                 'thumbnail': thumb,
+                 'title': video_title,
+                 'thumbnail': video_thumbnail,
                  'ext': ext
                  }]
 
@@ -4135,7 +4050,7 @@ class BandcampIE(InfoExtractor):
         # We get the link to the free download page
         m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
         if m_download is None:
-            raise ExtractorError(u'No free songs founded')
+            raise ExtractorError(u'No free songs found')
 
         download_link = m_download.group(1)
         id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$', 
@@ -4163,10 +4078,10 @@ class BandcampIE(InfoExtractor):
 
         track_info = {'id':id,
                       'title' : info[u'title'],
-                      'ext' : 'mp3',
-                      'url' : final_url,
+                      'ext' :   'mp3',
+                      'url' :   final_url,
                       'thumbnail' : info[u'thumb_url'],
-                      'uploader' : info[u'artist']
+                      'uploader' :  info[u'artist']
                       }
 
         return [track_info]
@@ -4183,17 +4098,14 @@ class RedTubeIE(InfoExtractor):
         video_id = mobj.group('id')
         video_extension = 'mp4'        
         webpage = self._download_webpage(url, video_id)
+
         self.report_extraction(video_id)
-        mobj = re.search(r'<source src="'+'(.+)'+'" type="video/mp4">',webpage)
 
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract media URL')
+        video_url = self._search_regex(r'<source src="(.+?)" type="video/mp4">',
+            webpage, u'video URL')
 
-        video_url = mobj.group(1)
-        mobj = re.search('<h1 class="videoTitle slidePanelMovable">(.+)</h1>',webpage)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract title')
-        video_title = mobj.group(1)
+        video_title = self._search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
+            webpage, u'title')
 
         return [{
             'id':       video_id,
@@ -4214,15 +4126,13 @@ class InaIE(InfoExtractor):
         video_extension = 'mp4'
         webpage = self._download_webpage(mrss_url, video_id)
 
-        mobj = re.search(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)', webpage)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract media URL')
-        video_url = mobj.group(1)
+        self.report_extraction(video_id)
 
-        mobj = re.search(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>', webpage)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract title')
-        video_title = mobj.group(1)
+        video_url = self._search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
+            webpage, u'video URL')
+
+        video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
+            webpage, u'title')
 
         return [{
             'id':       video_id,
@@ -4244,27 +4154,17 @@ class HowcastIE(InfoExtractor):
 
         self.report_extraction(video_id)
 
-        mobj = re.search(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)"', webpage)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract video URL')
-        video_url = mobj.group(1)
+        video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
+            webpage, u'video URL')
 
-        mobj = re.search(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'', webpage)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract title')
-        video_title = mobj.group(1) or mobj.group(2)
+        video_title = self._search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
+            webpage, u'title')
 
-        mobj = re.search(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'', webpage)
-        if mobj is None:
-            self._downloader.report_warning(u'unable to extract description')
-            video_description = None
-        else:
-            video_description = mobj.group(1) or mobj.group(2)
+        video_description = self._search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
+            webpage, u'description', fatal=False)
 
-        mobj = re.search(r'<meta content=\'(.+?)\' property=\'og:image\'', webpage)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract thumbnail')
-        thumbnail = mobj.group(1)
+        thumbnail = self._search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
+            webpage, u'thumbnail', fatal=False)
 
         return [{
             'id':       video_id,
@@ -4280,7 +4180,6 @@ class VineIE(InfoExtractor):
     _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'
 
     def _real_extract(self, url):
-
         mobj = re.match(self._VALID_URL, url)
 
         video_id = mobj.group('id')
@@ -4289,25 +4188,17 @@ class VineIE(InfoExtractor):
 
         self.report_extraction(video_id)
 
-        mobj = re.search(r'<meta property="twitter:player:stream" content="(.+?)"', webpage)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract video URL')
-        video_url = mobj.group(1)
+        video_url = self._search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
+            webpage, u'video URL')
 
-        mobj = re.search(r'<meta property="og:title" content="(.+?)"', webpage)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract title')
-        video_title = mobj.group(1)
+        video_title = self._search_regex(r'<meta property="og:title" content="(.+?)"',
+            webpage, u'title')
 
-        mobj = re.search(r'<meta property="og:image" content="(.+?)(\?.*?)?"', webpage)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract thumbnail')
-        thumbnail = mobj.group(1)
+        thumbnail = self._search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
+            webpage, u'thumbnail', fatal=False)
 
-        mobj = re.search(r'<div class="user">.*?<h2>(.+?)</h2>', webpage, re.DOTALL)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract uploader')
-        uploader = mobj.group(1)
+        uploader = self._search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
+            webpage, u'uploader', fatal=False, flags=re.DOTALL)
 
         return [{
             'id':        video_id,
@@ -4330,18 +4221,13 @@ class FlickrIE(InfoExtractor):
         webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
         webpage = self._download_webpage(webpage_url, video_id)
 
-        mobj = re.search(r"photo_secret: '(\w+)'", webpage)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract video secret')
-        secret = mobj.group(1)
+        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')
 
         first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
         first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
 
-        mobj = re.search(r'<Item id="id">(\d+-\d+)</Item>', first_xml)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract node_id')
-        node_id = mobj.group(1)
+        node_id = self._search_regex(r'<Item id="id">(\d+-\d+)</Item>',
+            first_xml, u'node_id')
 
         second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
         second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
@@ -4353,22 +4239,14 @@ class FlickrIE(InfoExtractor):
             raise ExtractorError(u'Unable to extract video url')
         video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
 
-        mobj = re.search(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')', webpage)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract title')
-        video_title = mobj.group(1) or mobj.group(2)
+        video_title = self._search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
+            webpage, u'video title')
 
-        mobj = re.search(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')', webpage)
-        if mobj is None:
-            self._downloader.report_warning(u'unable to extract description')
-            video_description = None
-        else:
-            video_description = mobj.group(1) or mobj.group(2)
+        video_description = self._search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
+            webpage, u'description', fatal=False)
 
-        mobj = re.search(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')', webpage)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract thumbnail')
-        thumbnail = mobj.group(1) or mobj.group(2)
+        thumbnail = self._search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
+            webpage, u'thumbnail', fatal=False)
 
         return [{
             'id':          video_id,
@@ -4390,32 +4268,25 @@ class TeamcocoIE(InfoExtractor):
         url_title = mobj.group('url_title')
         webpage = self._download_webpage(url, url_title)
 
-        mobj = re.search(r'<article class="video" data-id="(\d+?)"', webpage)
-        video_id = mobj.group(1)
+        video_id = self._search_regex(r'<article class="video" data-id="(\d+?)"',
+            webpage, u'video id')
 
         self.report_extraction(video_id)
 
-        mobj = re.search(r'<meta property="og:title" content="(.+?)"', webpage)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract title')
-        video_title = mobj.group(1)
+        video_title = self._search_regex(r'<meta property="og:title" content="(.+?)"',
+            webpage, u'title')
 
-        mobj = re.search(r'<meta property="og:image" content="(.+?)"', webpage)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract thumbnail')
-        thumbnail = mobj.group(1)
+        thumbnail = self._search_regex(r'<meta property="og:image" content="(.+?)"',
+            webpage, u'thumbnail', fatal=False)
 
-        mobj = re.search(r'<meta property="og:description" content="(.*?)"', webpage)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract description')
-        description = mobj.group(1)
+        video_description = self._search_regex(r'<meta property="og:description" content="(.*?)"',
+            webpage, u'description', fatal=False)
 
         data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
         data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
-        mobj = re.search(r'<file type="high".*?>(.*?)</file>', data)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract video url')
-        video_url = mobj.group(1)
+
+        video_url = self._search_regex(r'<file type="high".*?>(.*?)</file>',
+            data, u'video URL')
 
         return [{
             'id':          video_id,
@@ -4423,7 +4294,7 @@ class TeamcocoIE(InfoExtractor):
             'ext':         'mp4',
             'title':       video_title,
             'thumbnail':   thumbnail,
-            'description': description,
+            'description': video_description,
         }]
         
 class XHamsterIE(InfoExtractor):
-- 
cgit v1.2.3


From 468e2e926b8d1f55d6ce67fee67e33a7fa6d8371 Mon Sep 17 00:00:00 2001
From: Filippo Valsorda <filippo.valsorda@gmail.com>
Date: Thu, 6 Jun 2013 14:35:08 +0200
Subject: implement fallbacks and defaults in _search_regex

---
 youtube_dl/InfoExtractors.py | 84 +++++++++++++++++++++++---------------------
 youtube_dl/utils.py          |  3 ++
 2 files changed, 47 insertions(+), 40 deletions(-)

(limited to 'youtube_dl')

diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py
index 4d13c17e4..fbf40f3ca 100755
--- a/youtube_dl/InfoExtractors.py
+++ b/youtube_dl/InfoExtractors.py
@@ -191,19 +191,37 @@ class InfoExtractor(object):
             video_info['title'] = playlist_title
         return video_info
 
-    def _search_regex(self, pattern, text, name, fatal=True, flags=0):
-        """Extract a field from some text based on regex"""
-        mobj = re.search(pattern, text, flags)
-        if mobj is None and fatal:
+    def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
+        """
+        Search string for pattern (a regex or a sequence of regexes tried in
+        order) and return the first non-None group of the first match.
+        On failure return default if it is not None; else raise ExtractorError
+        if fatal, or report a warning naming the field and return None.
+        """
+        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
+            pattern = (pattern,)  # treat a single pattern as a one-item sequence
+        mobj = None  # an empty sequence of patterns must fail cleanly, not raise NameError
+        for p in pattern:
+            mobj = re.search(p, string, flags)
+            if mobj: break
+
+        if sys.stderr.isatty() and os.name != 'nt':
+            _name = u'\033[0;34m%s\033[0m' % name
+        else:
+            _name = name
+
+        if mobj:
+            # return the first matching group
+            return next(g for g in mobj.groups() if g is not None)
+        elif default is not None:
+            return default
+        elif fatal:
             raise ExtractorError(u'Unable to extract %s; '
-                u'please report this issue on GitHub.' % name)
-        elif mobj is None:
+                u'please report this issue on GitHub.' % _name)
+        else:
             self._downloader.report_warning(u'unable to extract %s; '
-                u'please report this issue on GitHub.' % name)
+                u'please report this issue on GitHub.' % _name)
             return None
-        else:
-            # return the first matched group
-            return next(g for g in mobj.groups() if g is not None)
 
 class SearchInfoExtractor(InfoExtractor):
     """
@@ -2820,12 +2838,8 @@ class StanfordOpenClassroomIE(InfoExtractor):
                                         note='Downloading course info page',
                                         errnote='Unable to download course info page')
 
-            # TODO: implement default_value in search_regex
-            m = re.search('<h1>([^<]+)</h1>', coursepage)
-            if m:
-                info['title'] = unescapeHTML(m.group(1))
-            else:
-                info['title'] = info['id']
+            info['title'] = self._search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
+            info['title'] = unescapeHTML(info['title'])
 
             info['description'] = self._search_regex('<description>([^<]+)</description>',
                 coursepage, u'description', fatal=False)
@@ -3108,12 +3122,8 @@ class GooglePlusIE(InfoExtractor):
 
         # Extract title
         # Get the first line for title
-        # TODO: implement default_value in search_regex
-        video_title = u'NA'
-        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
-        mobj = re.search(pattern, webpage)
-        if mobj:
-            video_title = mobj.group(1)
+        video_title = self._search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
+            webpage, 'title', default=u'NA')
 
         # Step 2, Stimulate clicking the image box to launch video
         video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
@@ -3167,23 +3177,21 @@ class NBAIE(InfoExtractor):
 
         video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
 
-        # TODO: implement default_value in search_regex
-        def _findProp(rexp, default=None):
-            m = re.search(rexp, webpage)
-            if m:
-                return unescapeHTML(m.group(1))
-            else:
-                return default
-
         shortened_video_id = video_id.rpartition('/')[2]
-        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
+        title = self._search_regex(r'<meta property="og:title" content="(.*?)"',
+            webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')
+
+        uploader_date = self._search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)
+
+        description = self._search_regex(r'<div class="description">(.*?)</h1>', webpage, 'description', fatal=False)
+
         info = {
             'id': shortened_video_id,
             'url': video_url,
             'ext': 'mp4',
             'title': title,
-            'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
-            'description': _findProp(r'<div class="description">(.*?)</h1>'),
+            'uploader_date': uploader_date,
+            'description': description,
         }
         return [info]
 
@@ -3335,13 +3343,9 @@ class FunnyOrDieIE(InfoExtractor):
             webpage, u'video URL', flags=re.DOTALL)
         video_url = unescapeHTML(video_url)
 
-        # TODO: implement fallbacks in regex_search
-        m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
-        if not m:
-            m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
-            if not m:
-                raise ExtractorError(u'Cannot find video title')
-        title = clean_html(m.group('title'))
+        title = self._search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
+            r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)
+        title = clean_html(title)
 
         video_description = self._search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
             webpage, u'description', flags=re.DOTALL)
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index 63d9d0ae5..3a8dcf4d3 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -154,6 +154,9 @@ def compat_ord(c):
     if type(c) is int: return c
     else: return ord(c)
 
+# This is not clearly defined otherwise
+compiled_regex_type = type(re.compile(''))
+
 std_headers = {
     'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0',
     'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
-- 
cgit v1.2.3


From 476203d025dd2619ea9f9e2f99ffce507dec6596 Mon Sep 17 00:00:00 2001
From: Filippo Valsorda <filippo.valsorda@gmail.com>
Date: Thu, 6 Jun 2013 15:07:05 +0200
Subject: print WARNINGs during test + minor fix to NBAIE

---
 youtube_dl/InfoExtractors.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'youtube_dl')

diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py
index fbf40f3ca..0f1880756 100755
--- a/youtube_dl/InfoExtractors.py
+++ b/youtube_dl/InfoExtractors.py
@@ -3183,7 +3183,7 @@ class NBAIE(InfoExtractor):
 
         uploader_date = self._search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)
 
-        description = self._search_regex(r'<div class="description">(.*?)</h1>', webpage, 'description', fatal=False)
+        description = self._search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)
 
         info = {
             'id': shortened_video_id,
-- 
cgit v1.2.3


From be95cac157a75da1a0fa512b36eb90bc2c28cc96 Mon Sep 17 00:00:00 2001
From: Filippo Valsorda <filippo.valsorda@gmail.com>
Date: Fri, 7 Jun 2013 11:19:27 +0200
Subject: raise exceptions on warnings during tests - and solve a couple of
 them

---
 youtube_dl/InfoExtractors.py | 41 +++++++++++++++++++++--------------------
 1 file changed, 21 insertions(+), 20 deletions(-)

(limited to 'youtube_dl')

diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py
index 0f1880756..bd6fce3b6 100755
--- a/youtube_dl/InfoExtractors.py
+++ b/youtube_dl/InfoExtractors.py
@@ -3161,7 +3161,7 @@ class GooglePlusIE(InfoExtractor):
         }]
 
 class NBAIE(InfoExtractor):
-    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
+    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
     IE_NAME = u'nba'
 
     def _real_extract(self, url):
@@ -3170,8 +3170,6 @@ class NBAIE(InfoExtractor):
             raise ExtractorError(u'Invalid URL: %s' % url)
 
         video_id = mobj.group(1)
-        if video_id.endswith('/index.html'):
-            video_id = video_id[:-len('/index.html')]
 
         webpage = self._download_webpage(url, video_id)
 
@@ -3181,7 +3179,8 @@ class NBAIE(InfoExtractor):
         title = self._search_regex(r'<meta property="og:title" content="(.*?)"',
             webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')
 
-        uploader_date = self._search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)
+        # It isn't there in the HTML it returns to us
+        # uploader_date = self._search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)
 
         description = self._search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)
 
@@ -3190,7 +3189,7 @@ class NBAIE(InfoExtractor):
             'url': video_url,
             'ext': 'mp4',
             'title': title,
-            'uploader_date': uploader_date,
+            # 'uploader_date': uploader_date,
             'description': description,
         }
         return [info]
@@ -3541,19 +3540,22 @@ class YouPornIE(InfoExtractor):
         req.add_header('Cookie', 'age_verified=1')
         webpage = self._download_webpage(req, video_id)
 
-        # Get the video title
-        video_title = self._search_regex(r'<h1.*?>(?P<title>.*)</h1>',
-            webpage, u'title').strip()
-
-        # Get the video date
-        upload_date = self._search_regex(r'Date:</label>(?P<date>.*) </li>',
-            webpage, u'upload date', fatal=False)
-        if upload_date: upload_date = unified_strdate(upload_date.strip())
+        # Get the JSON argument of the page's "new Video(...)" call
+        json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
+        try:
+            params = json.loads(json_params)
+        except ValueError:  # malformed JSON only; do not swallow KeyboardInterrupt/SystemExit
+            raise ExtractorError(u'Invalid JSON')
 
-        # Get the video uploader
-        video_uploader = self._search_regex(r'Submitted:</label>(?P<uploader>.*)</li>',
-            webpage, u'uploader', fatal=False)
-        if video_uploader: video_uploader = clean_html(video_uploader.strip())
+        self.report_extraction(video_id)
+        try:
+            video_title = params['title']
+            upload_date = unified_strdate(params['release_date_f'])
+            video_description = params['description']
+            video_uploader = params['submitted_by']
+            thumbnail = params['thumbnails'][0]['image']
+        except (KeyError, IndexError):  # missing key, or an empty thumbnails list
+            raise ExtractorError(u'Missing JSON parameter: %s' % sys.exc_info()[1])  # format the exception; str + exception raises TypeError
 
         # Get all of the formats available
         DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
@@ -3592,9 +3594,8 @@ class YouPornIE(InfoExtractor):
                 'title': title,
                 'ext': extension,
                 'format': format,
-                'thumbnail': None,
-                'description': None,
-                'player_url': None
+                'thumbnail': thumbnail,
+                'description': video_description
             })
 
         if self._downloader.params.get('listformats', None):
-- 
cgit v1.2.3


From 8409501206e37d57f01e5fe72bfc54a5562e4e0a Mon Sep 17 00:00:00 2001
From: Filippo Valsorda <filippo.valsorda@gmail.com>
Date: Fri, 7 Jun 2013 11:46:03 +0200
Subject: use search_regex in new IEs

---
 youtube_dl/InfoExtractors.py | 50 +++++++++++++++++++-------------------------
 1 file changed, 22 insertions(+), 28 deletions(-)

(limited to 'youtube_dl')

diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py
index bd6fce3b6..5d54e93e7 100755
--- a/youtube_dl/InfoExtractors.py
+++ b/youtube_dl/InfoExtractors.py
@@ -3347,7 +3347,7 @@ class FunnyOrDieIE(InfoExtractor):
         title = clean_html(title)
 
         video_description = self._search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
-            webpage, u'description', flags=re.DOTALL)
+            webpage, u'description', fatal=False, flags=re.DOTALL)
         if video_description: video_description = unescapeHTML(video_description)
 
         info = {
@@ -4301,7 +4301,7 @@ class TeamcocoIE(InfoExtractor):
             'thumbnail':   thumbnail,
             'description': video_description,
         }]
-        
+
 class XHamsterIE(InfoExtractor):
     """Information Extractor for xHamster"""
     _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'
@@ -4310,8 +4310,9 @@ class XHamsterIE(InfoExtractor):
         mobj = re.match(self._VALID_URL, url)
 
         video_id = mobj.group('id')
-        mrss_url='http://xhamster.com/movies/%s/.html' % video_id
+        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
         webpage = self._download_webpage(mrss_url, video_id)
+
         mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
         if mobj is None:
             raise ExtractorError(u'Unable to extract media URL')
@@ -4321,32 +4322,26 @@ class XHamsterIE(InfoExtractor):
             video_url = mobj.group('server')+'/key='+mobj.group('file')
         video_extension = video_url.split('.')[-1]
 
-        mobj = re.search(r'<title>(?P<title>.+?) - xHamster\.com</title>', webpage)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract title')
-        video_title = unescapeHTML(mobj.group('title'))
+        video_title = self._search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
+            webpage, u'title')
+        video_title = unescapeHTML(video_title)
 
-        mobj = re.search(r'<span>Description: </span>(?P<description>[^<]+)', webpage)
-        if mobj is None:
-            video_description = u''
-        else:
-            video_description = unescapeHTML(mobj.group('description'))
+        video_description = self._search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
+            webpage, u'description', fatal=False)
+        if video_description: video_description = unescapeHTML(video_description)
 
         mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract upload date')
-        video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
-
-        mobj = re.search(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^>]+)', webpage)
-        if mobj is None:
-            video_uploader_id = u'anonymous'
+        if mobj:
+            video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
         else:
-            video_uploader_id = mobj.group('uploader_id')
+            video_upload_date = None
+            self._downloader.report_warning(u'Unable to extract upload date')
 
-        mobj = re.search(r'\'image\':\'(?P<thumbnail>[^\']+)\'', webpage)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract thumbnail URL')
-        video_thumbnail = mobj.group('thumbnail')
+        video_uploader_id = self._search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^>]+)',
+            webpage, u'uploader id', default=u'anonymous')
+
+        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
+            webpage, u'thumbnail', fatal=False)
 
         return [{
             'id':       video_id,
@@ -4377,10 +4372,9 @@ class HypemIE(InfoExtractor):
         cookie = urlh.headers.get('Set-Cookie', '')
 
         self.report_extraction(track_id)
-        mobj = re.search(r'<script type="application/json" id="displayList-data">(.*?)</script>', response, flags=re.MULTILINE|re.DOTALL)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extrack tracks')
-        html_tracks = mobj.group(1).strip()
+
+        html_tracks = self._search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
+            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
         try:
             track_list = json.loads(html_tracks)
             track = track_list[u'tracks'][0]
-- 
cgit v1.2.3


From 8b59a9861040482c9af58e85fb397353ea2e8080 Mon Sep 17 00:00:00 2001
From: Filippo Valsorda <filippo.valsorda@gmail.com>
Date: Fri, 7 Jun 2013 12:10:02 +0200
Subject: XHamster: Can't see the description anywhere in the UI

---
 youtube_dl/InfoExtractors.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'youtube_dl')

diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py
index 5d54e93e7..0d7db013b 100755
--- a/youtube_dl/InfoExtractors.py
+++ b/youtube_dl/InfoExtractors.py
@@ -4326,9 +4326,10 @@ class XHamsterIE(InfoExtractor):
             webpage, u'title')
         video_title = unescapeHTML(video_title)
 
-        video_description = self._search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
-            webpage, u'description', fatal=False)
-        if video_description: video_description = unescapeHTML(video_description)
+        # Can't see the description anywhere in the UI
+        # video_description = self._search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
+        #     webpage, u'description', fatal=False)
+        # if video_description: video_description = unescapeHTML(video_description)
 
         mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
         if mobj:
@@ -4348,7 +4349,7 @@ class XHamsterIE(InfoExtractor):
             'url':      video_url,
             'ext':      video_extension,
             'title':    video_title,
-            'description': video_description,
+            # 'description': video_description,
             'upload_date': video_upload_date,
             'uploader_id': video_uploader_id,
             'thumbnail': video_thumbnail
-- 
cgit v1.2.3


From f5a290eed949b7726a8d745960bbe9c6b8b7de52 Mon Sep 17 00:00:00 2001
From: Filippo Valsorda <filippo.valsorda@gmail.com>
Date: Sat, 8 Jun 2013 09:56:34 +0200
Subject: print "please report this issue on GitHub" on every ExtractorError

---
 youtube_dl/InfoExtractors.py | 3 +--
 youtube_dl/utils.py          | 1 +
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'youtube_dl')

diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py
index 0d7db013b..86cc7c748 100755
--- a/youtube_dl/InfoExtractors.py
+++ b/youtube_dl/InfoExtractors.py
@@ -216,8 +216,7 @@ class InfoExtractor(object):
         elif default is not None:
             return default
         elif fatal:
-            raise ExtractorError(u'Unable to extract %s; '
-                u'please report this issue on GitHub.' % _name)
+            raise ExtractorError(u'Unable to extract %s' % _name)
         else:
             self._downloader.report_warning(u'unable to extract %s; '
                 u'please report this issue on GitHub.' % _name)
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index 3a8dcf4d3..718ee3aae 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -472,6 +472,7 @@ class ExtractorError(Exception):
     """Error during info extraction."""
     def __init__(self, msg, tb=None):
         """ tb, if given, is the original traceback (so that it can be printed out). """
+        msg = msg + u'; please report this issue on GitHub.'
         super(ExtractorError, self).__init__(msg)
         self.traceback = tb
         self.exc_info = sys.exc_info()  # preserve original exception
-- 
cgit v1.2.3


From d5979c5d55b0df11973b9a2b6630fd676e5726d1 Mon Sep 17 00:00:00 2001
From: Filippo Valsorda <filippo.valsorda@gmail.com>
Date: Sun, 9 Jun 2013 11:55:08 +0200
Subject: do not ask the user to report network errors

---
 youtube_dl/utils.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'youtube_dl')

diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index 718ee3aae..66ae41e31 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -12,7 +12,7 @@ import sys
 import traceback
 import zlib
 import email.utils
-import json
+import socket
 import datetime
 
 try:
@@ -472,8 +472,11 @@ class ExtractorError(Exception):
     """Error during info extraction."""
     def __init__(self, msg, tb=None):
         """ tb, if given, is the original traceback (so that it can be printed out). """
-        msg = msg + u'; please report this issue on GitHub.'
+
+        if not sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
+            msg = msg + u'; please report this issue on GitHub.'
         super(ExtractorError, self).__init__(msg)
+
         self.traceback = tb
         self.exc_info = sys.exc_info()  # preserve original exception
 
-- 
cgit v1.2.3


From 979a9dd4c4d46e0f2b11bc4bcac51ad8d446d186 Mon Sep 17 00:00:00 2001
From: Filippo Valsorda <filippo.valsorda@gmail.com>
Date: Sun, 9 Jun 2013 11:57:13 +0200
Subject: _html_search_regex with clean_html superpowers

---
 youtube_dl/InfoExtractors.py | 151 ++++++++++++++++++++-----------------------
 1 file changed, 71 insertions(+), 80 deletions(-)

(limited to 'youtube_dl')

diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py
index 86cc7c748..6060a5988 100755
--- a/youtube_dl/InfoExtractors.py
+++ b/youtube_dl/InfoExtractors.py
@@ -222,6 +222,16 @@ class InfoExtractor(object):
                 u'please report this issue on GitHub.' % _name)
             return None
 
+    def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
+        """
+        Like _search_regex, but strips HTML tags and unescapes entities.
+        """
+        res = self._search_regex(pattern, string, name, default, fatal, flags)
+        if res:
+            return clean_html(res).strip()
+        else:
+            return res
+
 class SearchInfoExtractor(InfoExtractor):
     """
     Base class for paged search queries extractors.
@@ -1923,9 +1933,8 @@ class FacebookIE(InfoExtractor):
         video_duration = int(video_data['video_duration'])
         thumbnail = video_data['thumbnail_src']
 
-        video_title = self._search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
+        video_title = self._html_search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
             webpage, u'title')
-        video_title = unescapeHTML(video_title)
 
         info = {
             'id': video_id,
@@ -2087,7 +2096,7 @@ class MyVideoIE(InfoExtractor):
             self.report_extraction(video_id)
             video_url = mobj.group(1) + '.flv'
 
-            video_title = self._search_regex('<title>([^<]+)</title>',
+            video_title = self._html_search_regex('<title>([^<]+)</title>',
                 webpage, u'title')
 
             video_ext = self._search_regex('[.](.+?)$', video_url, u'extension')
@@ -2169,7 +2178,7 @@ class MyVideoIE(InfoExtractor):
         video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj')
         video_swfobj = compat_urllib_parse.unquote(video_swfobj)
 
-        video_title = self._search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
+        video_title = self._html_search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
             webpage, u'title')
 
         return [{
@@ -2371,17 +2380,14 @@ class EscapistIE(InfoExtractor):
         self.report_extraction(showName)
         webpage = self._download_webpage(url, showName)
 
-        videoDesc = self._search_regex('<meta name="description" content="([^"]*)"',
+        videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
             webpage, u'description', fatal=False)
-        if videoDesc: videoDesc = unescapeHTML(videoDesc)
 
-        imgUrl = self._search_regex('<meta property="og:image" content="([^"]*)"',
+        imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
             webpage, u'thumbnail', fatal=False)
-        if imgUrl: imgUrl = unescapeHTML(imgUrl)
 
-        playerUrl = self._search_regex('<meta property="og:video" content="([^"]*)"',
+        playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
             webpage, u'player url')
-        playerUrl = unescapeHTML(playerUrl)
 
         configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
         configUrl = compat_urllib_parse.unquote(configUrl)
@@ -2499,7 +2505,7 @@ class XVideosIE(InfoExtractor):
             webpage, u'video URL'))
 
         # Extract title
-        video_title = self._search_regex(r'<title>(.*?)\s+-\s+XVID',
+        video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID',
             webpage, u'title')
 
         # Extract video thumbnail
@@ -2665,7 +2671,7 @@ class InfoQIE(InfoExtractor):
             webpage, u'title')
 
         # Extract description
-        video_description = self._search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
+        video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
             webpage, u'description', fatal=False)
 
         video_filename = video_url.split('/')[-1]
@@ -2837,12 +2843,10 @@ class StanfordOpenClassroomIE(InfoExtractor):
                                         note='Downloading course info page',
                                         errnote='Unable to download course info page')
 
-            info['title'] = self._search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
-            info['title'] = unescapeHTML(info['title'])
+            info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
 
-            info['description'] = self._search_regex('<description>([^<]+)</description>',
+            info['description'] = self._html_search_regex('<description>([^<]+)</description>',
                 coursepage, u'description', fatal=False)
-            if info['description']: info['description'] = unescapeHTML(info['description'])
 
             links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
             info['list'] = [
@@ -2903,15 +2907,13 @@ class MTVIE(InfoExtractor):
 
         webpage = self._download_webpage(url, video_id)
 
-        song_name = self._search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
+        song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
             webpage, u'song name', fatal=False)
-        if song_name: song_name = unescapeHTML(song_name)
 
-        video_title = self._search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
+        video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
             webpage, u'title')
-        video_title = unescapeHTML(video_title)
 
-        mtvn_uri = self._search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
+        mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
             webpage, u'mtvn_uri', fatal=False)
 
         content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
@@ -3067,7 +3069,7 @@ class XNXXIE(InfoExtractor):
             webpage, u'video URL')
         video_url = compat_urllib_parse.unquote(video_url)
 
-        video_title = self._search_regex(self.VIDEO_TITLE_RE,
+        video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
             webpage, u'title')
 
         video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
@@ -3108,7 +3110,7 @@ class GooglePlusIE(InfoExtractor):
         self.report_extraction(video_id)
 
         # Extract update date
-        upload_date = self._search_regex('title="Timestamp">(.*?)</a>',
+        upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>',
             webpage, u'upload date', fatal=False)
         if upload_date:
             # Convert timestring to a format suitable for filename
@@ -3116,12 +3118,12 @@ class GooglePlusIE(InfoExtractor):
             upload_date = upload_date.strftime('%Y%m%d')
 
         # Extract uploader
-        uploader = self._search_regex(r'rel\="author".*?>(.*?)</a>',
+        uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
             webpage, u'uploader', fatal=False)
 
         # Extract title
         # Get the first line for title
-        video_title = self._search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
+        video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
             webpage, 'title', default=u'NA')
 
         # Step 2, Stimulate clicking the image box to launch video
@@ -3175,13 +3177,13 @@ class NBAIE(InfoExtractor):
         video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
 
         shortened_video_id = video_id.rpartition('/')[2]
-        title = self._search_regex(r'<meta property="og:title" content="(.*?)"',
+        title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
             webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')
 
         # It isn't there in the HTML it returns to us
-        # uploader_date = self._search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)
+        # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)
 
-        description = self._search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)
+        description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)
 
         info = {
             'id': shortened_video_id,
@@ -3337,17 +3339,14 @@ class FunnyOrDieIE(InfoExtractor):
         video_id = mobj.group('id')
         webpage = self._download_webpage(url, video_id)
 
-        video_url = self._search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
+        video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
             webpage, u'video URL', flags=re.DOTALL)
-        video_url = unescapeHTML(video_url)
 
-        title = self._search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
+        title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
             r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)
-        title = clean_html(title)
 
-        video_description = self._search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
+        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
             webpage, u'description', fatal=False, flags=re.DOTALL)
-        if video_description: video_description = unescapeHTML(video_description)
 
         info = {
             'id': video_id,
@@ -3416,14 +3415,13 @@ class UstreamIE(InfoExtractor):
 
         self.report_extraction(video_id)
 
-        video_title = self._search_regex(r'data-title="(?P<title>.+)"',
+        video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
             webpage, u'title')
 
-        uploader = self._search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
+        uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
             webpage, u'uploader', fatal=False, flags=re.DOTALL)
-        if uploader: uploader = unescapeHTML(uploader.strip())
 
-        thumbnail = self._search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
+        thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
             webpage, u'thumbnail', fatal=False)
 
         info = {
@@ -3454,11 +3452,11 @@ class WorldStarHipHopIE(InfoExtractor):
         else:
             ext = 'flv'
 
-        video_title = self._search_regex(r"<title>(.*)</title>",
+        video_title = self._html_search_regex(r"<title>(.*)</title>",
             webpage_src, u'title')
 
         # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
-        thumbnail = self._search_regex(r'rel="image_src" href="(.*)" />',
+        thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
             webpage_src, u'thumbnail', fatal=False)
 
         if not thumbnail:
@@ -3640,7 +3638,7 @@ class PornotubeIE(InfoExtractor):
 
         #Get the uploaded date
         VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
-        upload_date = self._search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
+        upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
         if upload_date: upload_date = unified_strdate(upload_date)
 
         info = {'id': video_id,
@@ -3668,7 +3666,7 @@ class YouJizzIE(InfoExtractor):
         webpage = self._download_webpage(url, video_id)
 
         # Get the video title
-        video_title = self._search_regex(r'<title>(?P<title>.*)</title>',
+        video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
             webpage, u'title').strip()
 
         # Get the embed page
@@ -3747,13 +3745,11 @@ class KeekIE(InfoExtractor):
         thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
         webpage = self._download_webpage(url, video_id)
 
-        video_title = self._search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
+        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
             webpage, u'title')
-        video_title = unescapeHTML(video_title)
 
-        uploader = self._search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
+        uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
             webpage, u'uploader', fatal=False)
-        if uploader: uploader = clean_html(uploader)
 
         info = {
                 'id': video_id,
@@ -3907,9 +3903,8 @@ class SpiegelIE(InfoExtractor):
 
         webpage = self._download_webpage(url, video_id)
 
-        video_title = self._search_regex(r'<div class="module-title">(.*?)</div>',
+        video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
             webpage, u'title')
-        video_title = unescapeHTML(video_title)
 
         xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
         xml_code = self._download_webpage(xml_url, video_id,
@@ -3948,15 +3943,13 @@ class LiveLeakIE(InfoExtractor):
         video_url = self._search_regex(r'file: "(.*?)",',
             webpage, u'video URL')
 
-        video_title = self._search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
-            webpage, u'title')
-        video_title = unescapeHTML(video_title).replace('LiveLeak.com -', '').strip()
+        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
+            webpage, u'title').replace('LiveLeak.com -', '').strip()
 
-        video_description = self._search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
+        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
             webpage, u'description', fatal=False)
-        if video_description: video_description = unescapeHTML(video_description)
 
-        video_uploader = self._search_regex(r'By:.*?(\w+)</a>',
+        video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
             webpage, u'uploader', fatal=False)
 
         info = {
@@ -4033,9 +4026,8 @@ class TumblrIE(InfoExtractor):
 
         # The only place where you can get a title, it's not complete,
         # but searching in other places doesn't work for all videos
-        video_title = self._search_regex(r'<title>(?P<title>.*?)</title>',
+        video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
             webpage, u'title', flags=re.DOTALL)
-        video_title = unescapeHTML(video_title)
 
         return [{'id': video_id,
                  'url': video_url,
@@ -4105,10 +4097,10 @@ class RedTubeIE(InfoExtractor):
 
         self.report_extraction(video_id)
 
-        video_url = self._search_regex(r'<source src="(.+?)" type="video/mp4">',
+        video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
             webpage, u'video URL')
 
-        video_title = self._search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
+        video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
             webpage, u'title')
 
         return [{
@@ -4132,7 +4124,7 @@ class InaIE(InfoExtractor):
 
         self.report_extraction(video_id)
 
-        video_url = self._search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
+        video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
             webpage, u'video URL')
 
         video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
@@ -4161,13 +4153,13 @@ class HowcastIE(InfoExtractor):
         video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
             webpage, u'video URL')
 
-        video_title = self._search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
+        video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
             webpage, u'title')
 
-        video_description = self._search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
+        video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
             webpage, u'description', fatal=False)
 
-        thumbnail = self._search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
+        thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
             webpage, u'thumbnail', fatal=False)
 
         return [{
@@ -4192,16 +4184,16 @@ class VineIE(InfoExtractor):
 
         self.report_extraction(video_id)
 
-        video_url = self._search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
+        video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
             webpage, u'video URL')
 
-        video_title = self._search_regex(r'<meta property="og:title" content="(.+?)"',
+        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
             webpage, u'title')
 
-        thumbnail = self._search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
+        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
             webpage, u'thumbnail', fatal=False)
 
-        uploader = self._search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
+        uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
             webpage, u'uploader', fatal=False, flags=re.DOTALL)
 
         return [{
@@ -4230,7 +4222,7 @@ class FlickrIE(InfoExtractor):
         first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
         first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
 
-        node_id = self._search_regex(r'<Item id="id">(\d+-\d+)</Item>',
+        node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
             first_xml, u'node_id')
 
         second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
@@ -4243,13 +4235,13 @@ class FlickrIE(InfoExtractor):
             raise ExtractorError(u'Unable to extract video url')
         video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
 
-        video_title = self._search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
+        video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
             webpage, u'video title')
 
-        video_description = self._search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
+        video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
             webpage, u'description', fatal=False)
 
-        thumbnail = self._search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
+        thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
             webpage, u'thumbnail', fatal=False)
 
         return [{
@@ -4272,24 +4264,24 @@ class TeamcocoIE(InfoExtractor):
         url_title = mobj.group('url_title')
         webpage = self._download_webpage(url, url_title)
 
-        video_id = self._search_regex(r'<article class="video" data-id="(\d+?)"',
+        video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
             webpage, u'video id')
 
         self.report_extraction(video_id)
 
-        video_title = self._search_regex(r'<meta property="og:title" content="(.+?)"',
+        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
             webpage, u'title')
 
-        thumbnail = self._search_regex(r'<meta property="og:image" content="(.+?)"',
+        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
             webpage, u'thumbnail', fatal=False)
 
-        video_description = self._search_regex(r'<meta property="og:description" content="(.*?)"',
+        video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
             webpage, u'description', fatal=False)
 
         data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
         data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
 
-        video_url = self._search_regex(r'<file type="high".*?>(.*?)</file>',
+        video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
             data, u'video URL')
 
         return [{
@@ -4321,12 +4313,11 @@ class XHamsterIE(InfoExtractor):
             video_url = mobj.group('server')+'/key='+mobj.group('file')
         video_extension = video_url.split('.')[-1]
 
-        video_title = self._search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
+        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
             webpage, u'title')
-        video_title = unescapeHTML(video_title)
 
         # Can't see the description anywhere in the UI
-        # video_description = self._search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
+        # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
         #     webpage, u'description', fatal=False)
         # if video_description: video_description = unescapeHTML(video_description)
 
@@ -4337,7 +4328,7 @@ class XHamsterIE(InfoExtractor):
             video_upload_date = None
             self._downloader.report_warning(u'Unable to extract upload date')
 
-        video_uploader_id = self._search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^>]+)',
+        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^>]+)',
             webpage, u'uploader id', default=u'anonymous')
 
         video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
@@ -4373,7 +4364,7 @@ class HypemIE(InfoExtractor):
 
         self.report_extraction(track_id)
 
-        html_tracks = self._search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
+        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
             response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
         try:
             track_list = json.loads(html_tracks)
-- 
cgit v1.2.3


From 78d3442b1209d3858cfea1f7ca958f661784b5ab Mon Sep 17 00:00:00 2001
From: Filippo Valsorda <filippo.valsorda@gmail.com>
Date: Sun, 9 Jun 2013 14:21:42 +0200
Subject: test: extend the reach of info_dict checking

* print the info_dict in a format that is easy to add to tests.json during tests if un-tested fields are detected
* make it possible to put the crc32 in tests.json if the field is too long
* complete the "info_dict" fields in existing tests
* fixed the bugs caught while doing this
---
 youtube_dl/InfoExtractors.py | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

(limited to 'youtube_dl')

diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py
index 6060a5988..24e9c4cc7 100755
--- a/youtube_dl/InfoExtractors.py
+++ b/youtube_dl/InfoExtractors.py
@@ -2377,8 +2377,8 @@ class EscapistIE(InfoExtractor):
         showName = mobj.group('showname')
         videoId = mobj.group('episode')
 
-        self.report_extraction(showName)
-        webpage = self._download_webpage(url, showName)
+        self.report_extraction(videoId)
+        webpage = self._download_webpage(url, videoId)
 
         videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
             webpage, u'description', fatal=False)
@@ -2389,10 +2389,13 @@ class EscapistIE(InfoExtractor):
         playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
             webpage, u'player url')
 
+        title = self._html_search_regex('<meta name="title" content="([^"]*)"',
+            webpage, u'player url').split(' : ')[-1]
+
         configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
         configUrl = compat_urllib_parse.unquote(configUrl)
 
-        configJSON = self._download_webpage(configUrl, showName,
+        configJSON = self._download_webpage(configUrl, videoId,
                                             u'Downloading configuration',
                                             u'unable to download configuration')
 
@@ -2412,7 +2415,7 @@ class EscapistIE(InfoExtractor):
             'url': videoUrl,
             'uploader': showName,
             'upload_date': None,
-            'title': showName,
+            'title': title,
             'ext': 'mp4',
             'thumbnail': imgUrl,
             'description': videoDesc,
@@ -3581,14 +3584,14 @@ class YouPornIE(InfoExtractor):
             size = format[0]
             bitrate = format[1]
             format = "-".join( format )
-            title = u'%s-%s-%s' % (video_title, size, bitrate)
+            # title = u'%s-%s-%s' % (video_title, size, bitrate)
 
             formats.append({
                 'id': video_id,
                 'url': video_url,
                 'uploader': video_uploader,
                 'upload_date': upload_date,
-                'title': title,
+                'title': video_title,
                 'ext': extension,
                 'format': format,
                 'thumbnail': thumbnail,
@@ -4328,7 +4331,7 @@ class XHamsterIE(InfoExtractor):
             video_upload_date = None
             self._downloader.report_warning(u'Unable to extract upload date')
 
-        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^>]+)',
+        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
             webpage, u'uploader id', default=u'anonymous')
 
         video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
-- 
cgit v1.2.3


From af44c9486255f16ab180a9e45aaab06a6b38bdde Mon Sep 17 00:00:00 2001
From: Filippo Valsorda <filippo.valsorda@gmail.com>
Date: Mon, 17 Jun 2013 19:25:35 +0200
Subject: use _search_regex in GenericIE

---
 youtube_dl/InfoExtractors.py | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

(limited to 'youtube_dl')

diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py
index 24e9c4cc7..3c95012b1 100755
--- a/youtube_dl/InfoExtractors.py
+++ b/youtube_dl/InfoExtractors.py
@@ -1430,16 +1430,12 @@ class GenericIE(InfoExtractor):
         #   Site Name | Video Title
         #   Video Title - Tagline | Site Name
         # and so on and so forth; it's just not practical
-        mobj = re.search(r'<title>(.*)</title>', webpage)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract title')
-        video_title = mobj.group(1)
+        video_title = self._html_search_regex(r'<title>(.*)</title>',
+            webpage, u'video title')
 
         # video uploader is domain name
-        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract title')
-        video_uploader = mobj.group(1)
+        video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*',
+            url, u'video uploader')
 
         return [{
             'id':       video_id,
-- 
cgit v1.2.3