From ac3e9394e76c0e8baeff1bc77eb67fa184ceb81c Mon Sep 17 00:00:00 2001 From: Anna Bernardi Date: Thu, 6 Jun 2013 13:27:27 +0200 Subject: Implement search_regex from #847 --- youtube_dl/InfoExtractors.py | 635 +++++++++++++++++-------------------------- 1 file changed, 253 insertions(+), 382 deletions(-) (limited to 'youtube_dl') diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index b40edf5fb..4d13c17e4 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -191,6 +191,20 @@ class InfoExtractor(object): video_info['title'] = playlist_title return video_info + def _search_regex(self, pattern, text, name, fatal=True, flags=0): + """Extract a field from some text based on regex""" + mobj = re.search(pattern, text, flags) + if mobj is None and fatal: + raise ExtractorError(u'Unable to extract %s; ' + u'please report this issue on GitHub.' % name) + elif mobj is None: + self._downloader.report_warning(u'unable to extract %s; ' + u'please report this issue on GitHub.' % name) + return None + else: + # return the first matched group + return next(g for g in mobj.groups() if g is not None) + class SearchInfoExtractor(InfoExtractor): """ Base class for paged search queries extractors. @@ -964,18 +978,13 @@ class PhotobucketIE(InfoExtractor): }] # We try looking in other parts of the webpage - mobj = re.search(r'', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract media URL') - mediaURL = compat_urllib_parse.unquote(mobj.group(1)) - - video_url = mediaURL + video_url = self._search_regex(r'', + webpage, u'video URL') mobj = re.search(r'(.*) video by (.*) - Photobucket', webpage) if mobj is None: raise ExtractorError(u'Unable to extract title') video_title = mobj.group(1).decode('utf-8') - video_uploader = mobj.group(2).decode('utf-8') return [{ @@ -1803,10 +1812,7 @@ class DepositFilesIE(InfoExtractor): file_extension = os.path.splitext(file_url)[1][1:] # Search for file title - mobj = re.search(r'', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract title') - file_title = mobj.group(1).decode('utf-8') + file_title = self._search_regex(r'', webpage, u'title') return [{ 'id': file_id.decode('utf-8'), @@ -1900,10 +1906,9 @@ class FacebookIE(InfoExtractor): video_duration = int(video_data['video_duration']) thumbnail = video_data['thumbnail_src'] - m = re.search('

([^<]+)

', webpage) - if not m: - raise ExtractorError(u'Cannot find title in webpage') - video_title = unescapeHTML(m.group(1)) + video_title = self._search_regex('

([^<]+)

', + webpage, u'title') + video_title = unescapeHTML(video_title) info = { 'id': video_id, @@ -2065,15 +2070,10 @@ class MyVideoIE(InfoExtractor): self.report_extraction(video_id) video_url = mobj.group(1) + '.flv' - mobj = re.search('([^<]+)', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract title') - video_title = mobj.group(1) + video_title = self._search_regex('([^<]+)', + webpage, u'title') - mobj = re.search('[.](.+?)$', video_url) - if mobj is None: - raise ExtractorError(u'Unable to extract extention') - video_ext = mobj.group(1) + video_ext = self._search_regex('[.](.+?)$', video_url, u'extension') return [{ 'id': video_id, @@ -2121,25 +2121,23 @@ class MyVideoIE(InfoExtractor): # extracting infos self.report_extraction(video_id) + video_url = None mobj = re.search('connectionurl=\'(.*?)\'', dec_data) - if mobj is None: - raise ExtractorError(u'unable to extract rtmpurl') - video_rtmpurl = compat_urllib_parse.unquote(mobj.group(1)) - if 'myvideo2flash' in video_rtmpurl: - self._downloader.report_warning(u'forcing RTMPT ...') - video_rtmpurl = video_rtmpurl.replace('rtmpe://', 'rtmpt://') - - # extract non rtmp videos - if (video_rtmpurl is None) or (video_rtmpurl == ''): + if mobj: + video_url = compat_urllib_parse.unquote(mobj.group(1)) + if 'myvideo2flash' in video_url: + self._downloader.report_warning(u'forcing RTMPT ...') + video_url = video_url.replace('rtmpe://', 'rtmpt://') + + if not video_url: + # extract non rtmp videos mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data) if mobj is None: raise ExtractorError(u'unable to extract url') - video_rtmpurl = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2)) + video_url = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2)) - mobj = re.search('source=\'(.*?)\'', dec_data) - if mobj is None: - raise ExtractorError(u'unable to extract swfobj') - video_file = compat_urllib_parse.unquote(mobj.group(1)) + video_file = self._search_regex('source=\'(.*?)\'', dec_data, u'video file') + video_file = compat_urllib_parse.unquote(video_file) if not video_file.endswith('f4m'): ppath, prefix = video_file.split('.') @@ -2151,20 +2149,16 @@ class MyVideoIE(InfoExtractor): video_filepath + video_file ).replace('.f4m', '.m3u8') - mobj = re.search('swfobject.embedSWF\(\'(.+?)\'', webpage) - if mobj is None: - raise ExtractorError(u'unable to extract swfobj') - video_swfobj = compat_urllib_parse.unquote(mobj.group(1)) + video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj') + video_swfobj = compat_urllib_parse.unquote(video_swfobj) - mobj = re.search("(.*?)", webpage) - if mobj is None: - raise ExtractorError(u'unable to extract title') - video_title = mobj.group(1) + video_title = self._search_regex("(.*?)", + webpage, u'title') return [{ 'id': video_id, - 'url': video_rtmpurl, - 'tc_url': video_rtmpurl, + 'url': video_url, + 'tc_url': video_url, 'uploader': None, 'upload_date': None, 'title': video_title, @@ -2175,6 +2169,7 @@ class MyVideoIE(InfoExtractor): 'player_url': video_swfobj, }] + class ComedyCentralIE(InfoExtractor): """Information extractor for The Daily Show and Colbert Report """ @@ -2357,16 +2352,22 @@ class EscapistIE(InfoExtractor): videoId = mobj.group('episode') self.report_extraction(showName) - webPage = self._download_webpage(url, showName) + webpage = self._download_webpage(url, showName) + + videoDesc = self._search_regex('(.*?)\s+-\s+XVID', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract video title') - video_title = mobj.group(1) - + video_title = self._search_regex(r'(.*?)\s+-\s+XVID', + webpage, u'title') # Extract video thumbnail - mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract video thumbnail') - video_thumbnail = mobj.group(0) + video_thumbnail = self._search_regex(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', + webpage, u'thumbnail', fatal=False) info = { 'id': video_id, @@ -2652,16 +2644,12 @@ class InfoQIE(InfoExtractor): video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id # Extract title - mobj = re.search(r'contentTitle = "(.*?)";', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract video title') - video_title = mobj.group(1) + video_title = self._search_regex(r'contentTitle = "(.*?)";', + webpage, u'title') # Extract description - video_description = u'No description available.' - mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage) - if mobj is not None: - video_description = mobj.group(1) + video_description = self._search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>', + webpage, u'description', fatal=False) video_filename = video_url.split('/')[-1] video_id, extension = video_filename.split('.') @@ -2832,15 +2820,16 @@ class StanfordOpenClassroomIE(InfoExtractor): note='Downloading course info page', errnote='Unable to download course info page') + # TODO: implement default_value in search_regex m = re.search('<h1>([^<]+)</h1>', coursepage) if m: info['title'] = unescapeHTML(m.group(1)) else: info['title'] = info['id'] - m = re.search('<description>([^<]+)</description>', coursepage) - if m: - info['description'] = unescapeHTML(m.group(1)) + info['description'] = self._search_regex('<description>([^<]+)</description>', + coursepage, u'description', fatal=False) + if info['description']: info['description'] = unescapeHTML(info['description']) links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage)) info['list'] = [ @@ -2901,25 +2890,19 @@ class MTVIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract song name') - song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1')) - mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract performer') - performer = unescapeHTML(mobj.group(1).decode('iso-8859-1')) - video_title = performer + ' - ' + song_name + song_name = self._search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>', + webpage, u'song name', fatal=False) + if song_name: song_name = unescapeHTML(song_name) - mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage) - if mobj is None: - raise ExtractorError(u'Unable to mtvn_uri') - mtvn_uri = mobj.group(1) + video_title = self._search_regex(r'<meta name="mtv_an" content="([^"]+)"/>', + webpage, u'title') + video_title = unescapeHTML(video_title) - mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract content id') - content_id = mobj.group(1) + mtvn_uri = self._search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>', + webpage, u'mtvn_uri', fatal=False) + + content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', + webpage, u'content id', fatal=False) videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri self.report_extraction(video_id) @@ -3067,20 +3050,15 @@ class XNXXIE(InfoExtractor): # Get webpage content webpage = self._download_webpage(url, video_id) - result = re.search(self.VIDEO_URL_RE, webpage) - if result is None: - raise ExtractorError(u'Unable to extract video url') - video_url = compat_urllib_parse.unquote(result.group(1)) + video_url = self._search_regex(self.VIDEO_URL_RE, + webpage, u'video URL') + video_url = compat_urllib_parse.unquote(video_url) - result = re.search(self.VIDEO_TITLE_RE, webpage) - if result is None: - raise ExtractorError(u'Unable to extract video title') - video_title = result.group(1) + video_title = self._search_regex(self.VIDEO_TITLE_RE, + webpage, u'title') - result = re.search(self.VIDEO_THUMB_RE, webpage) - if result is None: - raise ExtractorError(u'Unable to extract video thumbnail') - video_thumbnail = result.group(1) + video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE, + webpage, u'thumbnail', fatal=False) return [{ 'id': video_id, @@ -3100,26 +3078,6 @@ class GooglePlusIE(InfoExtractor): _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)' IE_NAME = u'plus.google' - def report_extract_entry(self, url): - """Report downloading extry""" - self.to_screen(u'Downloading entry: %s' % url) - - def report_date(self, upload_date): - """Report downloading extry""" - self.to_screen(u'Entry date: %s' % upload_date) - - def report_uploader(self, uploader): - """Report downloading extry""" - self.to_screen(u'Uploader: %s' % uploader) - - def report_title(self, video_title): - """Report downloading extry""" - self.to_screen(u'Title: %s' % video_title) - - def report_extract_vid_page(self, video_page): - """Report information extraction.""" - self.to_screen(u'Extracting video page: %s' % video_page) - def _real_extract(self, url): # Extract id from URL mobj = re.match(self._VALID_URL, url) @@ -3132,47 +3090,35 @@ class GooglePlusIE(InfoExtractor): video_extension = 'flv' # Step 1, Retrieve post webpage to extract further information - self.report_extract_entry(post_url) webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage') + self.report_extraction(video_id) + # Extract update date - upload_date = None - pattern = 'title="Timestamp">(.*?)</a>' - mobj = re.search(pattern, webpage) - if mobj: - upload_date = mobj.group(1) + upload_date = self._search_regex('title="Timestamp">(.*?)</a>', + webpage, u'upload date', fatal=False) + if upload_date: # Convert timestring to a format suitable for filename upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d") upload_date = upload_date.strftime('%Y%m%d') - self.report_date(upload_date) # Extract uploader - uploader = None - pattern = r'rel\="author".*?>(.*?)</a>' - mobj = re.search(pattern, webpage) - if mobj: - uploader = mobj.group(1) - self.report_uploader(uploader) + uploader = self._search_regex(r'rel\="author".*?>(.*?)</a>', + webpage, u'uploader', fatal=False) # Extract title # Get the first line for title + # TODO: implement default_value in search_regex video_title = u'NA' pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]' mobj = re.search(pattern, webpage) if mobj: video_title = mobj.group(1) - self.report_title(video_title) # Step 2, Stimulate clicking the image box to launch video - pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]' - mobj = re.search(pattern, webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract video page URL') - - video_page = mobj.group(1) + video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]', + webpage, u'video page URL') webpage = self._download_webpage(video_page, video_id, u'Downloading video page') - self.report_extract_vid_page(video_page) - # Extract video links on video page """Extract video links of all sizes""" @@ -3220,6 +3166,8 @@ class NBAIE(InfoExtractor): webpage = self._download_webpage(url, video_id) video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4' + + # TODO: implement default_value in search_regex def _findProp(rexp, default=None): m = re.search(rexp, webpage) if m: @@ -3383,11 +3331,11 @@ class FunnyOrDieIE(InfoExtractor): video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) - m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL) - if not m: - raise ExtractorError(u'Unable to find video information') - video_url = unescapeHTML(m.group('url')) + video_url = self._search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', + webpage, u'video URL', flags=re.DOTALL) + video_url = unescapeHTML(video_url) + # TODO: implement fallbacks in regex_search m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL) if not m: m = re.search(r'<title>(?P<title>[^<]+?)', webpage) @@ -3395,18 +3343,16 @@ class FunnyOrDieIE(InfoExtractor): raise ExtractorError(u'Cannot find video title') title = clean_html(m.group('title')) - m = re.search(r'.+)"',webpage) - title = m.group('title') - m = re.search(r'data-content-type="channel".*?>(?P.*?)', - webpage, re.DOTALL) - uploader = unescapeHTML(m.group('uploader').strip()) - m = re.search(r'.+)"', + webpage, u'title') + + uploader = self._search_regex(r'data-content-type="channel".*?>(?P.*?)', + webpage, u'uploader', fatal=False, flags=re.DOTALL) + if uploader: uploader = unescapeHTML(uploader.strip()) + + thumbnail = self._search_regex(r'(.*)", webpage_src) + ext = 'flv' - if mobj is None: - raise ExtractorError(u'Cannot determine title') - title = mobj.group(1) + video_title = self._search_regex(r"(.*)", + webpage_src, u'title') - mobj = re.search(r'rel="image_src" href="(.*)" />', webpage_src) # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video. - if mobj is not None: - thumbnail = mobj.group(1) - else: + thumbnail = self._search_regex(r'rel="image_src" href="(.*)" />', + webpage_src, u'thumbnail', fatal=False) + + if not thumbnail: _title = r"""candytitles.*>(.*)""" mobj = re.search(_title, webpage_src) if mobj is not None: - title = mobj.group(1) - thumbnail = None + video_title = mobj.group(1) results = [{ 'id': video_id, 'url' : video_url, - 'title' : title, + 'title' : video_title, 'thumbnail' : thumbnail, 'ext' : ext, }] @@ -3542,10 +3482,9 @@ class RBMARadioIE(InfoExtractor): video_id = m.group('videoID') webpage = self._download_webpage(url, video_id) - m = re.search(r'', webpage) - if not m: - raise ExtractorError(u'Cannot find metadata') - json_data = m.group(1) + + json_data = self._search_regex(r'', + webpage, u'json data') try: data = json.loads(json_data) @@ -3592,7 +3531,6 @@ class YouPornIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) if mobj is None: raise ExtractorError(u'Invalid URL: %s' % url) - video_id = mobj.group('videoid') req = compat_urllib_request.Request(url) @@ -3600,34 +3538,23 @@ class YouPornIE(InfoExtractor): webpage = self._download_webpage(req, video_id) # Get the video title - result = re.search(r'(?P.*)</h1>', webpage) - if result is None: - raise ExtractorError(u'Unable to extract video title') - video_title = result.group('title').strip() + video_title = self._search_regex(r'<h1.*?>(?P<title>.*)</h1>', + webpage, u'title').strip() # Get the video date - result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage) - if result is None: - self._downloader.report_warning(u'unable to extract video date') - upload_date = None - else: - upload_date = unified_strdate(result.group('date').strip()) + upload_date = self._search_regex(r'Date:</label>(?P<date>.*) </li>', + webpage, u'upload date', fatal=False) + if upload_date: upload_date = unified_strdate(upload_date.strip()) # Get the video uploader - result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage) - if result is None: - self._downloader.report_warning(u'unable to extract uploader') - video_uploader = None - else: - video_uploader = result.group('uploader').strip() - video_uploader = clean_html( video_uploader ) + video_uploader = self._search_regex(r'Submitted:</label>(?P<uploader>.*)</li>', + webpage, u'uploader', fatal=False) + if video_uploader: video_uploader = clean_html(video_uploader.strip()) # Get all of the formats available DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>' - result = re.search(DOWNLOAD_LIST_RE, webpage) - if result is None: - raise ExtractorError(u'Unable to extract download list') - download_list_html = result.group('download_list').strip() + download_list_html = self._search_regex(DOWNLOAD_LIST_RE, + webpage, u'download list').strip() # Get all of the links from the page LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">' @@ -3704,17 +3631,13 @@ class PornotubeIE(InfoExtractor): # Get the video URL VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",' - result = re.search(VIDEO_URL_RE, webpage) - if result is None: - raise ExtractorError(u'Unable to extract video url') - video_url = compat_urllib_parse.unquote(result.group('url')) + video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url') + video_url = compat_urllib_parse.unquote(video_url) #Get the uploaded date VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by' - result = re.search(VIDEO_UPLOADED_RE, webpage) - if result is None: - raise ExtractorError(u'Unable to extract video title') - upload_date = unified_strdate(result.group('date')) + upload_date = self._search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False) + if upload_date: upload_date = unified_strdate(upload_date) info = {'id': video_id, 'url': video_url, @@ -3741,10 +3664,8 @@ class YouJizzIE(InfoExtractor): webpage = self._download_webpage(url, video_id) # Get the video title - result = re.search(r'<title>(?P<title>.*)', webpage) - if result is None: - raise ExtractorError(u'ERROR: unable to extract video title') - video_title = result.group('title').strip() + video_title = self._search_regex(r'(?P<title>.*)', + webpage, u'title').strip() # Get the embed page result = re.search(r'https?://www.youjizz.com/videos/embed/(?P[0-9]+)', webpage) @@ -3757,10 +3678,8 @@ class YouJizzIE(InfoExtractor): webpage = self._download_webpage(embed_page_url, video_id) # Get the video URL - result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P[^"]+)"\)\);', webpage) - if result is None: - raise ExtractorError(u'ERROR: unable to extract video url') - video_url = result.group('source') + video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P[^"]+)"\)\);', + webpage, u'video URL') info = {'id': video_id, 'url': video_url, @@ -3783,10 +3702,7 @@ class EightTracksIE(InfoExtractor): webpage = self._download_webpage(url, playlist_id) - m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL) - if not m: - raise ExtractorError(u'Cannot find trax information') - json_like = m.group(1) + json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL) data = json.loads(json_like) session = str(random.randint(0, 1000000000)) @@ -3822,18 +3738,24 @@ class KeekIE(InfoExtractor): def _real_extract(self, url): m = re.match(self._VALID_URL, url) video_id = m.group('videoID') + video_url = u'http://cdn.keek.com/keek/video/%s' % video_id thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id webpage = self._download_webpage(url, video_id) - m = re.search(r'[\S\s]+?

(?P.+?)

', webpage) - uploader = clean_html(m.group('uploader')) + + video_title = self._search_regex(r'[\S\s]+?

(?P.+?)

', + webpage, u'uploader', fatal=False) + if uploader: uploader = clean_html(uploader) + info = { 'id': video_id, 'url': video_url, 'ext': 'mp4', - 'title': title, + 'title': video_title, 'thumbnail': thumbnail, 'uploader': uploader } @@ -3980,10 +3902,10 @@ class SpiegelIE(InfoExtractor): video_id = m.group('videoID') webpage = self._download_webpage(url, video_id) - m = re.search(r'
(.*?)
', webpage) - if not m: - raise ExtractorError(u'Cannot find title') - video_title = unescapeHTML(m.group(1)) + + video_title = self._search_regex(r'
(.*?)
', + webpage, u'title') + video_title = unescapeHTML(video_title) xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml' xml_code = self._download_webpage(xml_url, video_id, @@ -4019,35 +3941,27 @@ class LiveLeakIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - m = re.search(r'file: "(.*?)",', webpage) - if not m: - raise ExtractorError(u'Unable to find video url') - video_url = m.group(1) + video_url = self._search_regex(r'file: "(.*?)",', + webpage, u'video URL') - m = re.search(r'', webpage) - if m: - uploader = clean_html(m.group(1)) - else: - uploader = None + video_uploader = self._search_regex(r'By:.*?(\w+)', + webpage, u'uploader', fatal=False) info = { 'id': video_id, 'url': video_url, 'ext': 'mp4', - 'title': title, - 'description': desc, - 'uploader': uploader + 'title': video_title, + 'description': video_description, + 'uploader': video_uploader } return [info] @@ -4105,23 +4019,24 @@ class TumblrIE(InfoExtractor): re_video = r'src=\\x22(?Phttp://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P.*?)\\x22' % (blog, video_id) video = re.search(re_video, webpage) if video is None: - self.to_screen("No video found") - return [] + raise ExtractorError(u'Unable to extract video') video_url = video.group('video_url') ext = video.group('ext') - re_thumb = r'posters(.*?)\[\\x22(?P.*?)\\x22' # We pick the first poster - thumb = re.search(re_thumb, webpage).group('thumb').replace('\\', '') + video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P.*?)\\x22', + webpage, u'thumbnail', fatal=False) # We pick the first poster + if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '') # The only place where you can get a title, it's not complete, # but searching in other places doesn't work for all videos - re_title = r'(?P<title>.*?)' - title = unescapeHTML(re.search(re_title, webpage, re.DOTALL).group('title')) + video_title = self._search_regex(r'(?P<title>.*?)', + webpage, u'title', flags=re.DOTALL) + video_title = unescapeHTML(video_title) return [{'id': video_id, 'url': video_url, - 'title': title, - 'thumbnail': thumb, + 'title': video_title, + 'thumbnail': video_thumbnail, 'ext': ext }] @@ -4135,7 +4050,7 @@ class BandcampIE(InfoExtractor): # We get the link to the free download page m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage) if m_download is None: - raise ExtractorError(u'No free songs founded') + raise ExtractorError(u'No free songs found') download_link = m_download.group(1) id = re.search(r'var TralbumData = {(.*?)id: (?P\d*?)$', @@ -4163,10 +4078,10 @@ class BandcampIE(InfoExtractor): track_info = {'id':id, 'title' : info[u'title'], - 'ext' : 'mp3', - 'url' : final_url, + 'ext' : 'mp3', + 'url' : final_url, 'thumbnail' : info[u'thumb_url'], - 'uploader' : info[u'artist'] + 'uploader' : info[u'artist'] } return [track_info] @@ -4183,17 +4098,14 @@ class RedTubeIE(InfoExtractor): video_id = mobj.group('id') video_extension = 'mp4' webpage = self._download_webpage(url, video_id) + self.report_extraction(video_id) - mobj = re.search(r'',webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract media URL') + video_url = self._search_regex(r'', + webpage, u'video URL') - video_url = mobj.group(1) - mobj = re.search('

(.+)

',webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract title') - video_title = mobj.group(1) + video_title = self._search_regex('

(.+?)

', + webpage, u'title') return [{ 'id': video_id, @@ -4214,15 +4126,13 @@ class InaIE(InfoExtractor): video_extension = 'mp4' webpage = self._download_webpage(mrss_url, video_id) - mobj = re.search(r'.*?)]]>', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract title') - video_title = mobj.group(1) + video_url = self._search_regex(r'.*?)]]>', + webpage, u'title') return [{ 'id': video_id, @@ -4244,27 +4154,17 @@ class HowcastIE(InfoExtractor): self.report_extraction(video_id) - mobj = re.search(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)"', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract video URL') - video_url = mobj.group(1) + video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)', + webpage, u'video URL') - mobj = re.search(r'\w+)' def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') @@ -4289,25 +4188,17 @@ class VineIE(InfoExtractor): self.report_extraction(video_id) - mobj = re.search(r'.*?

(.+?)

', webpage, re.DOTALL) - if mobj is None: - raise ExtractorError(u'Unable to extract uploader') - uploader = mobj.group(1) + uploader = self._search_regex(r'
.*?

(.+?)

', + webpage, u'uploader', fatal=False, flags=re.DOTALL) return [{ 'id': video_id, @@ -4330,18 +4221,13 @@ class FlickrIE(InfoExtractor): webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id webpage = self._download_webpage(webpage_url, video_id) - mobj = re.search(r"photo_secret: '(\w+)'", webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract video secret') - secret = mobj.group(1) + secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret') first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self' first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage') - mobj = re.search(r'(\d+-\d+)', first_xml) - if mobj is None: - raise ExtractorError(u'Unable to extract node_id') - node_id = mobj.group(1) + node_id = self._search_regex(r'(\d+-\d+)', + first_xml, u'node_id') second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1' second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage') @@ -4353,22 +4239,14 @@ class FlickrIE(InfoExtractor): raise ExtractorError(u'Unable to extract video url') video_url = mobj.group(1) + unescapeHTML(mobj.group(2)) - mobj = re.search(r'(.*?)', data) - if mobj is None: - raise ExtractorError(u'Unable to extract video url') - video_url = mobj.group(1) + + video_url = self._search_regex(r'(.*?)', + data, u'video URL') return [{ 'id': video_id, @@ -4423,7 +4294,7 @@ class TeamcocoIE(InfoExtractor): 'ext': 'mp4', 'title': video_title, 'thumbnail': thumbnail, - 'description': description, + 'description': video_description, }] class XHamsterIE(InfoExtractor): -- cgit v1.2.3 From 468e2e926b8d1f55d6ce67fee67e33a7fa6d8371 Mon Sep 17 00:00:00 2001 From: Filippo Valsorda Date: Thu, 6 Jun 2013 14:35:08 +0200 Subject: implement fallbacks and defaults in _search_regex --- youtube_dl/InfoExtractors.py | 84 +++++++++++++++++++++++--------------------- youtube_dl/utils.py | 3 ++ 2 files changed, 47 insertions(+), 40 deletions(-) (limited to 'youtube_dl') diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index 4d13c17e4..fbf40f3ca 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -191,19 +191,37 @@ class InfoExtractor(object): video_info['title'] = playlist_title return video_info - def _search_regex(self, pattern, text, name, fatal=True, flags=0): - """Extract a field from some text based on regex""" - mobj = re.search(pattern, text, flags) - if mobj is None and fatal: + def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0): + """ + Perform a regex search on the given string, using a single or a list of + patterns returning the first matching group. + In case of failure return a default value or raise a WARNING or a + ExtractorError, depending on fatal, specifying the field name. + """ + if isinstance(pattern, (str, compat_str, compiled_regex_type)): + mobj = re.search(pattern, string, flags) + else: + for p in pattern: + mobj = re.search(p, string, flags) + if mobj: break + + if sys.stderr.isatty() and os.name != 'nt': + _name = u'\033[0;34m%s\033[0m' % name + else: + _name = name + + if mobj: + # return the first matching group + return next(g for g in mobj.groups() if g is not None) + elif default is not None: + return default + elif fatal: raise ExtractorError(u'Unable to extract %s; ' - u'please report this issue on GitHub.' % name) - elif mobj is None: + u'please report this issue on GitHub.' % _name) + else: self._downloader.report_warning(u'unable to extract %s; ' - u'please report this issue on GitHub.' % name) + u'please report this issue on GitHub.' % _name) return None - else: - # return the first matched group - return next(g for g in mobj.groups() if g is not None) class SearchInfoExtractor(InfoExtractor): """ @@ -2820,12 +2838,8 @@ class StanfordOpenClassroomIE(InfoExtractor): note='Downloading course info page', errnote='Unable to download course info page') - # TODO: implement default_value in search_regex - m = re.search('

([^<]+)

', coursepage) - if m: - info['title'] = unescapeHTML(m.group(1)) - else: - info['title'] = info['id'] + info['title'] = self._search_regex('

([^<]+)

', coursepage, 'title', default=info['id']) + info['title'] = unescapeHTML(info['title']) info['description'] = self._search_regex('([^<]+)', coursepage, u'description', fatal=False) @@ -3108,12 +3122,8 @@ class GooglePlusIE(InfoExtractor): # Extract title # Get the first line for title - # TODO: implement default_value in search_regex - video_title = u'NA' - pattern = r'Date:
(.*?)
', webpage, 'upload_date', fatal=False) + + description = self._search_regex(r'
(.*?)', webpage, 'description', fatal=False) + info = { 'id': shortened_video_id, 'url': video_url, 'ext': 'mp4', 'title': title, - 'uploader_date': _findProp(r'Date: (.*?)
'), - 'description': _findProp(r'
(.*?)'), + 'uploader_date': uploader_date, + 'description': description, } return [info] @@ -3335,13 +3343,9 @@ class FunnyOrDieIE(InfoExtractor): webpage, u'video URL', flags=re.DOTALL) video_url = unescapeHTML(video_url) - # TODO: implement fallbacks in regex_search - m = re.search(r"

(?P.*?)</h1>", webpage, flags=re.DOTALL) - if not m: - m = re.search(r'<title>(?P<title>[^<]+?)', webpage) - if not m: - raise ExtractorError(u'Cannot find video title') - title = clean_html(m.group('title')) + title = self._search_regex((r"

(?P.*?)</h1>", + r'<title>(?P<title>[^<]+?)'), webpage, 'title', flags=re.DOTALL) + title = clean_html(title) video_description = self._search_regex(r' Date: Thu, 6 Jun 2013 15:07:05 +0200 Subject: print WARNINGs during test + minor fix to NBAIE --- youtube_dl/InfoExtractors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'youtube_dl') diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index fbf40f3ca..0f1880756 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -3183,7 +3183,7 @@ class NBAIE(InfoExtractor): uploader_date = self._search_regex(r'Date: (.*?)

', webpage, 'upload_date', fatal=False) - description = self._search_regex(r'
(.*?)', webpage, 'description', fatal=False) + description = self._search_regex(r'', webpage, 'description', fatal=False) info = { 'id': shortened_video_id, -- cgit v1.2.3 From be95cac157a75da1a0fa512b36eb90bc2c28cc96 Mon Sep 17 00:00:00 2001 From: Filippo Valsorda Date: Fri, 7 Jun 2013 11:19:27 +0200 Subject: raise exceptions on warnings during tests - and solve a couple of them --- youtube_dl/InfoExtractors.py | 41 +++++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 20 deletions(-) (limited to 'youtube_dl') diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index 0f1880756..bd6fce3b6 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -3161,7 +3161,7 @@ class GooglePlusIE(InfoExtractor): }] class NBAIE(InfoExtractor): - _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$' + _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$' IE_NAME = u'nba' def _real_extract(self, url): @@ -3170,8 +3170,6 @@ class NBAIE(InfoExtractor): raise ExtractorError(u'Invalid URL: %s' % url) video_id = mobj.group(1) - if video_id.endswith('/index.html'): - video_id = video_id[:-len('/index.html')] webpage = self._download_webpage(url, video_id) @@ -3181,7 +3179,8 @@ class NBAIE(InfoExtractor): title = self._search_regex(r'Date: (.*?)
', webpage, 'upload_date', fatal=False) + # It isn't there in the HTML it returns to us + # uploader_date = self._search_regex(r'Date: (.*?)', webpage, 'upload_date', fatal=False) description = self._search_regex(r'', webpage, 'description', fatal=False) @@ -3190,7 +3189,7 @@ class NBAIE(InfoExtractor): 'url': video_url, 'ext': 'mp4', 'title': title, - 'uploader_date': uploader_date, + # 'uploader_date': uploader_date, 'description': description, } return [info] @@ -3541,19 +3540,22 @@ class YouPornIE(InfoExtractor): req.add_header('Cookie', 'age_verified=1') webpage = self._download_webpage(req, video_id) - # Get the video title - video_title = self._search_regex(r'(?P.*)</h1>', - webpage, u'title').strip() - - # Get the video date - upload_date = self._search_regex(r'Date:</label>(?P<date>.*) </li>', - webpage, u'upload date', fatal=False) - if upload_date: upload_date = unified_strdate(upload_date.strip()) + # Get JSON parameters + json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters') + try: + params = json.loads(json_params) + except: + raise ExtractorError(u'Invalid JSON') - # Get the video uploader - video_uploader = self._search_regex(r'Submitted:</label>(?P<uploader>.*)</li>', - webpage, u'uploader', fatal=False) - if video_uploader: video_uploader = clean_html(video_uploader.strip()) + self.report_extraction(video_id) + try: + video_title = params['title'] + upload_date = unified_strdate(params['release_date_f']) + video_description = params['description'] + video_uploader = params['submitted_by'] + thumbnail = params['thumbnails'][0]['image'] + except KeyError: + raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1]) # Get all of the formats available DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>' @@ -3592,9 +3594,8 @@ class YouPornIE(InfoExtractor): 'title': title, 'ext': extension, 'format': format, - 'thumbnail': None, - 'description': None, - 'player_url': None + 'thumbnail': thumbnail, + 'description': video_description }) if self._downloader.params.get('listformats', None): -- cgit v1.2.3 From 8409501206e37d57f01e5fe72bfc54a5562e4e0a Mon Sep 17 00:00:00 2001 From: Filippo Valsorda <filippo.valsorda@gmail.com> Date: Fri, 7 Jun 2013 11:46:03 +0200 Subject: use search_regex in new IEs --- youtube_dl/InfoExtractors.py | 50 +++++++++++++++++++------------------------- 1 file changed, 22 insertions(+), 28 deletions(-) (limited to 'youtube_dl') diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index bd6fce3b6..5d54e93e7 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -3347,7 +3347,7 @@ class FunnyOrDieIE(InfoExtractor): title = clean_html(title) video_description = self._search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"', - webpage, u'description', flags=re.DOTALL) + webpage, u'description', fatal=False, flags=re.DOTALL) if video_description: video_description = unescapeHTML(video_description) info = { @@ -4301,7 +4301,7 @@ class TeamcocoIE(InfoExtractor): 'thumbnail': thumbnail, 'description': video_description, }] - + class XHamsterIE(InfoExtractor): """Information Extractor for xHamster""" _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html' @@ -4310,8 +4310,9 @@ class XHamsterIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') - mrss_url='http://xhamster.com/movies/%s/.html' % video_id + mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id webpage = self._download_webpage(mrss_url, video_id) + mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage) if mobj is None: raise ExtractorError(u'Unable to extract media URL') @@ -4321,32 +4322,26 @@ class XHamsterIE(InfoExtractor): video_url = mobj.group('server')+'/key='+mobj.group('file') video_extension = video_url.split('.')[-1] - mobj = re.search(r'<title>(?P<title>.+?) - xHamster\.com', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract title') - video_title = unescapeHTML(mobj.group('title')) + video_title = self._search_regex(r'(?P<title>.+?) - xHamster\.com', + webpage, u'title') + video_title = unescapeHTML(video_title) - mobj = re.search(r'Description: (?P[^<]+)', webpage) - if mobj is None: - video_description = u'' - else: - video_description = unescapeHTML(mobj.group('description')) + video_description = self._search_regex(r'Description: (?P[^<]+)', + webpage, u'description', fatal=False) + if video_description: video_description = unescapeHTML(video_description) mobj = re.search(r'hint=\'(?P[0-9]{4})-(?P[0-9]{2})-(?P[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract upload date') - video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d') - - mobj = re.search(r']+>(?P[^>]+)', webpage) - if mobj is None: - video_uploader_id = u'anonymous' + if mobj: + video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d') else: - video_uploader_id = mobj.group('uploader_id') + video_upload_date = None + self._downloader.report_warning(u'Unable to extract upload date') - mobj = re.search(r'\'image\':\'(?P[^\']+)\'', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract thumbnail URL') - video_thumbnail = mobj.group('thumbnail') + video_uploader_id = self._search_regex(r']+>(?P[^>]+)', + webpage, u'uploader id', default=u'anonymous') + + video_thumbnail = self._search_regex(r'\'image\':\'(?P[^\']+)\'', + webpage, u'thumbnail', fatal=False) return [{ 'id': video_id, @@ -4377,10 +4372,9 @@ class HypemIE(InfoExtractor): cookie = urlh.headers.get('Set-Cookie', '') self.report_extraction(track_id) - mobj = re.search(r'', response, flags=re.MULTILINE|re.DOTALL) - if mobj is None: - raise ExtractorError(u'Unable to extrack tracks') - html_tracks = mobj.group(1).strip() + + html_tracks = self._search_regex(r'', + response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip() try: track_list = json.loads(html_tracks) track = track_list[u'tracks'][0] -- cgit v1.2.3 From 8b59a9861040482c9af58e85fb397353ea2e8080 Mon Sep 17 00:00:00 2001 From: Filippo Valsorda Date: Fri, 7 Jun 2013 12:10:02 +0200 Subject: XHamster: Can't see the description anywhere in the UI --- youtube_dl/InfoExtractors.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'youtube_dl') diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index 5d54e93e7..0d7db013b 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -4326,9 +4326,10 @@ class XHamsterIE(InfoExtractor): webpage, u'title') video_title = unescapeHTML(video_title) - video_description = self._search_regex(r'Description: (?P[^<]+)', - webpage, u'description', fatal=False) - if video_description: video_description = unescapeHTML(video_description) + # Can't see the description anywhere in the UI + # video_description = self._search_regex(r'Description: (?P[^<]+)', + # webpage, u'description', fatal=False) + # if video_description: video_description = unescapeHTML(video_description) mobj = re.search(r'hint=\'(?P[0-9]{4})-(?P[0-9]{2})-(?P[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage) if mobj: @@ -4348,7 +4349,7 @@ class XHamsterIE(InfoExtractor): 'url': video_url, 'ext': video_extension, 'title': video_title, - 'description': video_description, + # 'description': video_description, 'upload_date': video_upload_date, 'uploader_id': video_uploader_id, 'thumbnail': video_thumbnail -- cgit v1.2.3 From f5a290eed949b7726a8d745960bbe9c6b8b7de52 Mon Sep 17 00:00:00 2001 From: Filippo Valsorda Date: Sat, 8 Jun 2013 09:56:34 +0200 Subject: print "please report this issue on GitHub" on every ExtractorError --- youtube_dl/InfoExtractors.py | 3 +-- youtube_dl/utils.py | 1 + 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'youtube_dl') diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index 0d7db013b..86cc7c748 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -216,8 +216,7 @@ class InfoExtractor(object): elif default is not None: return default elif fatal: - raise ExtractorError(u'Unable to extract %s; ' - u'please report this issue on GitHub.' % _name) + raise ExtractorError(u'Unable to extract %s' % _name) else: self._downloader.report_warning(u'unable to extract %s; ' u'please report this issue on GitHub.' % _name) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 3a8dcf4d3..718ee3aae 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -472,6 +472,7 @@ class ExtractorError(Exception): """Error during info extraction.""" def __init__(self, msg, tb=None): """ tb, if given, is the original traceback (so that it can be printed out). """ + msg = msg + u'; please report this issue on GitHub.' super(ExtractorError, self).__init__(msg) self.traceback = tb self.exc_info = sys.exc_info() # preserve original exception -- cgit v1.2.3 From d5979c5d55b0df11973b9a2b6630fd676e5726d1 Mon Sep 17 00:00:00 2001 From: Filippo Valsorda Date: Sun, 9 Jun 2013 11:55:08 +0200 Subject: do not ask the user to report network errors --- youtube_dl/utils.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'youtube_dl') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 718ee3aae..66ae41e31 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -12,7 +12,7 @@ import sys import traceback import zlib import email.utils -import json +import socket import datetime try: @@ -472,8 +472,11 @@ class ExtractorError(Exception): """Error during info extraction.""" def __init__(self, msg, tb=None): """ tb, if given, is the original traceback (so that it can be printed out). """ - msg = msg + u'; please report this issue on GitHub.' + + if not sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError): + msg = msg + u'; please report this issue on GitHub.' super(ExtractorError, self).__init__(msg) + self.traceback = tb self.exc_info = sys.exc_info() # preserve original exception -- cgit v1.2.3 From 979a9dd4c4d46e0f2b11bc4bcac51ad8d446d186 Mon Sep 17 00:00:00 2001 From: Filippo Valsorda Date: Sun, 9 Jun 2013 11:57:13 +0200 Subject: _html_search_regex with clean_html superpowers --- youtube_dl/InfoExtractors.py | 151 ++++++++++++++++++++----------------------- 1 file changed, 71 insertions(+), 80 deletions(-) (limited to 'youtube_dl') diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index 86cc7c748..6060a5988 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -222,6 +222,16 @@ class InfoExtractor(object): u'please report this issue on GitHub.' % _name) return None + def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0): + """ + Like _search_regex, but strips HTML tags and unescapes entities. + """ + res = self._search_regex(pattern, string, name, default, fatal, flags) + if res: + return clean_html(res).strip() + else: + return res + class SearchInfoExtractor(InfoExtractor): """ Base class for paged search queries extractors. @@ -1923,9 +1933,8 @@ class FacebookIE(InfoExtractor): video_duration = int(video_data['video_duration']) thumbnail = video_data['thumbnail_src'] - video_title = self._search_regex('

([^<]+)

', + video_title = self._html_search_regex('

([^<]+)

', webpage, u'title') - video_title = unescapeHTML(video_title) info = { 'id': video_id, @@ -2087,7 +2096,7 @@ class MyVideoIE(InfoExtractor): self.report_extraction(video_id) video_url = mobj.group(1) + '.flv' - video_title = self._search_regex('([^<]+)', + video_title = self._html_search_regex('([^<]+)', webpage, u'title') video_ext = self._search_regex('[.](.+?)$', video_url, u'extension') @@ -2169,7 +2178,7 @@ class MyVideoIE(InfoExtractor): video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj') video_swfobj = compat_urllib_parse.unquote(video_swfobj) - video_title = self._search_regex("(.*?)", + video_title = self._html_search_regex("(.*?)", webpage, u'title') return [{ @@ -2371,17 +2380,14 @@ class EscapistIE(InfoExtractor): self.report_extraction(showName) webpage = self._download_webpage(url, showName) - videoDesc = self._search_regex('(.*?)\s+-\s+XVID', + video_title = self._html_search_regex(r'(.*?)\s+-\s+XVID', webpage, u'title') # Extract video thumbnail @@ -2665,7 +2671,7 @@ class InfoQIE(InfoExtractor): webpage, u'title') # Extract description - video_description = self._search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>', + video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage, u'description', fatal=False) video_filename = video_url.split('/')[-1] @@ -2837,12 +2843,10 @@ class StanfordOpenClassroomIE(InfoExtractor): note='Downloading course info page', errnote='Unable to download course info page') - info['title'] = self._search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id']) - info['title'] = unescapeHTML(info['title']) + info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id']) - info['description'] = self._search_regex('<description>([^<]+)</description>', + info['description'] = self._html_search_regex('<description>([^<]+)</description>', coursepage, u'description', fatal=False) - if info['description']: info['description'] = unescapeHTML(info['description']) links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage)) info['list'] = [ @@ -2903,15 +2907,13 @@ class MTVIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - song_name = self._search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>', + song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage, u'song name', fatal=False) - if song_name: song_name = unescapeHTML(song_name) - video_title = self._search_regex(r'<meta name="mtv_an" content="([^"]+)"/>', + video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>', webpage, u'title') - video_title = unescapeHTML(video_title) - mtvn_uri = self._search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>', + mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage, u'mtvn_uri', fatal=False) content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', @@ -3067,7 +3069,7 @@ class XNXXIE(InfoExtractor): webpage, u'video URL') video_url = compat_urllib_parse.unquote(video_url) - video_title = self._search_regex(self.VIDEO_TITLE_RE, + video_title = self._html_search_regex(self.VIDEO_TITLE_RE, webpage, u'title') video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE, @@ -3108,7 +3110,7 @@ class GooglePlusIE(InfoExtractor): self.report_extraction(video_id) # Extract update date - upload_date = self._search_regex('title="Timestamp">(.*?)</a>', + upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>', webpage, u'upload date', fatal=False) if upload_date: # Convert timestring to a format suitable for filename @@ -3116,12 +3118,12 @@ class GooglePlusIE(InfoExtractor): upload_date = upload_date.strftime('%Y%m%d') # Extract uploader - uploader = self._search_regex(r'rel\="author".*?>(.*?)</a>', + uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>', webpage, u'uploader', fatal=False) # Extract title # Get the first line for title - video_title = self._search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]', + video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]', webpage, 'title', default=u'NA') # Step 2, Stimulate clicking the image box to launch video @@ -3175,13 +3177,13 @@ class NBAIE(InfoExtractor): video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4' shortened_video_id = video_id.rpartition('/')[2] - title = self._search_regex(r'<meta property="og:title" content="(.*?)"', + title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"', webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '') # It isn't there in the HTML it returns to us - # uploader_date = self._search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False) + # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False) - description = self._search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False) + description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False) info = { 'id': shortened_video_id, @@ -3337,17 +3339,14 @@ class FunnyOrDieIE(InfoExtractor): video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) - video_url = self._search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', + video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, u'video URL', flags=re.DOTALL) - video_url = unescapeHTML(video_url) - title = self._search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", + title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", r'<title>(?P<title>[^<]+?)'), webpage, 'title', flags=re.DOTALL) - title = clean_html(title) - video_description = self._search_regex(r'.+)"', + video_title = self._html_search_regex(r'data-title="(?P.+)"', webpage, u'title') - uploader = self._search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>', + uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>', webpage, u'uploader', fatal=False, flags=re.DOTALL) - if uploader: uploader = unescapeHTML(uploader.strip()) - thumbnail = self._search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"', + thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"', webpage, u'thumbnail', fatal=False) info = { @@ -3454,11 +3452,11 @@ class WorldStarHipHopIE(InfoExtractor): else: ext = 'flv' - video_title = self._search_regex(r"<title>(.*)", + video_title = self._html_search_regex(r"(.*)", webpage_src, u'title') # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video. - thumbnail = self._search_regex(r'rel="image_src" href="(.*)" />', + thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />', webpage_src, u'thumbnail', fatal=False) if not thumbnail: @@ -3640,7 +3638,7 @@ class PornotubeIE(InfoExtractor): #Get the uploaded date VIDEO_UPLOADED_RE = r'
Added (?P[0-9\/]+) by' - upload_date = self._search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False) + upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False) if upload_date: upload_date = unified_strdate(upload_date) info = {'id': video_id, @@ -3668,7 +3666,7 @@ class YouJizzIE(InfoExtractor): webpage = self._download_webpage(url, video_id) # Get the video title - video_title = self._search_regex(r'(?P<title>.*)', + video_title = self._html_search_regex(r'(?P<title>.*)', webpage, u'title').strip() # Get the embed page @@ -3747,13 +3745,11 @@ class KeekIE(InfoExtractor): thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id webpage = self._download_webpage(url, video_id) - video_title = self._search_regex(r'[\S\s]+?

(?P.+?)

', + uploader = self._html_search_regex(r'
[\S\s]+?

(?P.+?)

', webpage, u'uploader', fatal=False) - if uploader: uploader = clean_html(uploader) info = { 'id': video_id, @@ -3907,9 +3903,8 @@ class SpiegelIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - video_title = self._search_regex(r'
(.*?)
', + video_title = self._html_search_regex(r'
(.*?)
', webpage, u'title') - video_title = unescapeHTML(video_title) xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml' xml_code = self._download_webpage(xml_url, video_id, @@ -3948,15 +3943,13 @@ class LiveLeakIE(InfoExtractor): video_url = self._search_regex(r'file: "(.*?)",', webpage, u'video URL') - video_title = self._search_regex(r'', + video_uploader = self._html_search_regex(r'By:.*?(\w+)
', webpage, u'uploader', fatal=False) info = { @@ -4033,9 +4026,8 @@ class TumblrIE(InfoExtractor): # The only place where you can get a title, it's not complete, # but searching in other places doesn't work for all videos - video_title = self._search_regex(r'(?P<title>.*?)', + video_title = self._html_search_regex(r'(?P<title>.*?)', webpage, u'title', flags=re.DOTALL) - video_title = unescapeHTML(video_title) return [{'id': video_id, 'url': video_url, @@ -4105,10 +4097,10 @@ class RedTubeIE(InfoExtractor): self.report_extraction(video_id) - video_url = self._search_regex(r'', + video_url = self._html_search_regex(r'', webpage, u'video URL') - video_title = self._search_regex('

(.+?)

', + video_title = self._html_search_regex('

(.+?)

', webpage, u'title') return [{ @@ -4132,7 +4124,7 @@ class InaIE(InfoExtractor): self.report_extraction(video_id) - video_url = self._search_regex(r'.*?)]]>', @@ -4161,13 +4153,13 @@ class HowcastIE(InfoExtractor): video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)', webpage, u'video URL') - video_title = self._search_regex(r'.*?

(.+?)

', + uploader = self._html_search_regex(r'
.*?

(.+?)

', webpage, u'uploader', fatal=False, flags=re.DOTALL) return [{ @@ -4230,7 +4222,7 @@ class FlickrIE(InfoExtractor): first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self' first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage') - node_id = self._search_regex(r'(\d+-\d+)', + node_id = self._html_search_regex(r'(\d+-\d+)', first_xml, u'node_id') second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1' @@ -4243,13 +4235,13 @@ class FlickrIE(InfoExtractor): raise ExtractorError(u'Unable to extract video url') video_url = mobj.group(1) + unescapeHTML(mobj.group(2)) - video_title = self._search_regex(r'(.*?)', + video_url = self._html_search_regex(r'(.*?)', data, u'video URL') return [{ @@ -4321,12 +4313,11 @@ class XHamsterIE(InfoExtractor): video_url = mobj.group('server')+'/key='+mobj.group('file') video_extension = video_url.split('.')[-1] - video_title = self._search_regex(r'(?P<title>.+?) - xHamster\.com', + video_title = self._html_search_regex(r'(?P<title>.+?) - xHamster\.com', webpage, u'title') - video_title = unescapeHTML(video_title) # Can't see the description anywhere in the UI - # video_description = self._search_regex(r'Description: (?P[^<]+)', + # video_description = self._html_search_regex(r'Description: (?P[^<]+)', # webpage, u'description', fatal=False) # if video_description: video_description = unescapeHTML(video_description) @@ -4337,7 +4328,7 @@ class XHamsterIE(InfoExtractor): video_upload_date = None self._downloader.report_warning(u'Unable to extract upload date') - video_uploader_id = self._search_regex(r']+>(?P[^>]+)', + video_uploader_id = self._html_search_regex(r']+>(?P[^>]+)', webpage, u'uploader id', default=u'anonymous') video_thumbnail = self._search_regex(r'\'image\':\'(?P[^\']+)\'', @@ -4373,7 +4364,7 @@ class HypemIE(InfoExtractor): self.report_extraction(track_id) - html_tracks = self._search_regex(r'', + html_tracks = self._html_search_regex(r'', response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip() try: track_list = json.loads(html_tracks) -- cgit v1.2.3 From 78d3442b1209d3858cfea1f7ca958f661784b5ab Mon Sep 17 00:00:00 2001 From: Filippo Valsorda Date: Sun, 9 Jun 2013 14:21:42 +0200 Subject: test: extend the reach of info_dict checking * print the info_dict in a format suitable to easy adding to tests.json during tests if un-tested fields are detected * make it possible to put the crc32 in tests.json if the field is too long * complete the "info_dict" fields in existing tests * fixed the bugs catched doing this --- youtube_dl/InfoExtractors.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'youtube_dl') diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index 6060a5988..24e9c4cc7 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -2377,8 +2377,8 @@ class EscapistIE(InfoExtractor): showName = mobj.group('showname') videoId = mobj.group('episode') - self.report_extraction(showName) - webpage = self._download_webpage(url, showName) + self.report_extraction(videoId) + webpage = self._download_webpage(url, videoId) videoDesc = self._html_search_regex(']+>(?P[^>]+)', + video_uploader_id = self._html_search_regex(r']+>(?P[^<]+)', webpage, u'uploader id', default=u'anonymous') video_thumbnail = self._search_regex(r'\'image\':\'(?P[^\']+)\'', -- cgit v1.2.3 From af44c9486255f16ab180a9e45aaab06a6b38bdde Mon Sep 17 00:00:00 2001 From: Filippo Valsorda Date: Mon, 17 Jun 2013 19:25:35 +0200 Subject: use _search_regex in GenericIE --- youtube_dl/InfoExtractors.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'youtube_dl') diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index 24e9c4cc7..3c95012b1 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -1430,16 +1430,12 @@ class GenericIE(InfoExtractor): # Site Name | Video Title # Video Title - Tagline | Site Name # and so on and so forth; it's just not practical - mobj = re.search(r'(.*)', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract title') - video_title = mobj.group(1) + video_title = self._html_search_regex(r'(.*)', + webpage, u'video title') # video uploader is domain name - mobj = re.match(r'(?:https?://)?([^/]*)/.*', url) - if mobj is None: - raise ExtractorError(u'Unable to extract title') - video_uploader = mobj.group(1) + video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*', + url, u'video uploader') return [{ 'id': video_id, -- cgit v1.2.3