diff options
author | Philipp Hagemeister <phihag@phihag.de> | 2014-04-30 02:23:51 +0200 |
---|---|---|
committer | Philipp Hagemeister <phihag@phihag.de> | 2014-04-30 02:23:51 +0200 |
commit | b30b8698ea11e85079cc9e392cdf26f4e61671c4 (patch) | |
tree | d1495a6e7e32ff083831ae389818a0a0f3971932 /youtube_dl/extractor/generic.py | |
parent | f1f25be6dbed3a2eb73819c55a5b49d8e001dfec (diff) |
[generic] Allow multiple matches for generic hits (Fixes #2818)
Diffstat (limited to 'youtube_dl/extractor/generic.py')
-rw-r--r-- | youtube_dl/extractor/generic.py | 85 |
1 files changed, 46 insertions, 39 deletions
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index cfb009d79..58092da38 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -637,70 +637,77 @@ class GenericIE(InfoExtractor): return self.url_result(smotri_url, 'Smotri') # Start with something easy: JW Player in SWFObject - mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage) - if mobj is None: + found = re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage) + if not found: # Look for gorilla-vid style embedding - mobj = re.search(r'''(?sx) + found = re.findall(r'''(?sx) (?: jw_plugins| JWPlayerOptions| jwplayer\s*\(\s*["'][^'"]+["']\s*\)\s*\.setup ) .*?file\s*:\s*["\'](.*?)["\']''', webpage) - if mobj is None: + if not found: # Broaden the search a little bit - mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage) - if mobj is None: - # Broaden the search a little bit: JWPlayer JS loader - mobj = re.search(r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage) - - if mobj is None: + found = re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage) + if not found: + # Broaden the findall a little bit: JWPlayer JS loader + found = re.findall(r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage) + if not found: # Try to find twitter cards info - mobj = re.search(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage) - if mobj is None: + found = re.findall(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage) + if not found: # We look for Open Graph info: # We have to match any number spaces between elements, some sites try to align them (eg.: statigr.am) - m_video_type = re.search(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage) + m_video_type = re.findall(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage) # We only look in og:video if the MIME type is a video, don't try if it's a Flash player: if m_video_type is not None: - mobj = re.search(r'<meta.*?property="og:video".*?content="(.*?)"', webpage) - if mobj is None: + found = re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage) + if not found: # HTML5 video - mobj = re.search(r'<video[^<]*(?:>.*?<source.*?)? src="([^"]+)"', webpage, flags=re.DOTALL) - if mobj is None: - mobj = re.search( + found = re.findall(r'(?s)<video[^<]*(?:>.*?<source.*?)? src="([^"]+)"', webpage) + if not found: + found = re.findall( r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")' r'(?:[a-z-]+="[^"]+"\s+)*?content="[0-9]{,2};url=\'([^\']+)\'"', webpage) - if mobj: - new_url = mobj.group(1) + if found: + new_url = found.group(1) self.report_following_redirect(new_url) return { '_type': 'url', 'url': new_url, } - if mobj is None: + if not found: raise ExtractorError('Unsupported URL: %s' % url) - # It's possible that one of the regexes - # matched, but returned an empty group: - if mobj.group(1) is None: - raise ExtractorError('Did not find a valid video URL at %s' % url) + entries = [] + for video_url in found: + video_url = compat_urlparse.urljoin(url, video_url) + video_id = compat_urllib_parse.unquote(os.path.basename(video_url)) - video_url = mobj.group(1) - video_url = compat_urlparse.urljoin(url, video_url) - video_id = compat_urllib_parse.unquote(os.path.basename(video_url)) + # Sometimes, jwplayer extraction will result in a YouTube URL + if YoutubeIE.suitable(video_url): + entries.append(self.url_result(video_url, 'Youtube')) + continue - # Sometimes, jwplayer extraction will result in a YouTube URL - if YoutubeIE.suitable(video_url): - return self.url_result(video_url, 'Youtube') + # here's a fun little line of code for you: + video_id = os.path.splitext(video_id)[0] - # here's a fun little line of code for you: - video_id = os.path.splitext(video_id)[0] + entries.append({ + 'id': video_id, + 'url': video_url, + 'uploader': video_uploader, + 'title': video_title, + }) + + if len(entries) == 1: + return entries[1] + else: + for num, e in enumerate(entries, start=1): + e['title'] = '%s (%d)' % (e['title'], num) + return { + '_type': 'playlist', + 'entries': entries, + } - return { - 'id': video_id, - 'url': video_url, - 'uploader': video_uploader, - 'title': video_title, - } |