about summary refs log tree commit diff
path: root/youtube_dl/extractor/generic.py
diff options
context:
space:
mode:
author Philipp Hagemeister <phihag@phihag.de> 2014-04-30 02:23:51 +0200
committer Philipp Hagemeister <phihag@phihag.de> 2014-04-30 02:23:51 +0200
commit b30b8698ea11e85079cc9e392cdf26f4e61671c4 (patch)
tree d1495a6e7e32ff083831ae389818a0a0f3971932 /youtube_dl/extractor/generic.py
parent f1f25be6dbed3a2eb73819c55a5b49d8e001dfec (diff)
[generic] Allow multiple matches for generic hits (Fixes #2818)
Diffstat (limited to 'youtube_dl/extractor/generic.py')
-rw-r--r-- youtube_dl/extractor/generic.py 85
1 files changed, 46 insertions, 39 deletions
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index cfb009d79..58092da38 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -637,70 +637,77 @@ class GenericIE(InfoExtractor):
return self.url_result(smotri_url, 'Smotri')
# Start with something easy: JW Player in SWFObject
- mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
- if mobj is None:
+ found = re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
+ if not found:
# Look for gorilla-vid style embedding
- mobj = re.search(r'''(?sx)
+ found = re.findall(r'''(?sx)
(?:
jw_plugins|
JWPlayerOptions|
jwplayer\s*\(\s*["'][^'"]+["']\s*\)\s*\.setup
)
.*?file\s*:\s*["\'](.*?)["\']''', webpage)
- if mobj is None:
+ if not found:
# Broaden the search a little bit
- mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
- if mobj is None:
- # Broaden the search a little bit: JWPlayer JS loader
- mobj = re.search(r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage)
-
- if mobj is None:
+ found = re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
+ if not found:
+ # Broaden the search a little bit: JWPlayer JS loader
+ found = re.findall(r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage)
+ if not found:
# Try to find twitter cards info
- mobj = re.search(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
- if mobj is None:
+ found = re.findall(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
+ if not found:
# We look for Open Graph info:
# We have to match any number spaces between elements, some sites try to align them (eg.: statigr.am)
- m_video_type = re.search(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
+ m_video_type = re.findall(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
# We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
if m_video_type is not None:
- mobj = re.search(r'<meta.*?property="og:video".*?content="(.*?)"', webpage)
- if mobj is None:
+ found = re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage)
+ if not found:
# HTML5 video
- mobj = re.search(r'<video[^<]*(?:>.*?<source.*?)? src="([^"]+)"', webpage, flags=re.DOTALL)
- if mobj is None:
- mobj = re.search(
+ found = re.findall(r'(?s)<video[^<]*(?:>.*?<source.*?)? src="([^"]+)"', webpage)
+ if not found:
+ found = re.findall(
r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")'
r'(?:[a-z-]+="[^"]+"\s+)*?content="[0-9]{,2};url=\'([^\']+)\'"',
webpage)
- if mobj:
- new_url = mobj.group(1)
+ if found:
+ new_url = found.group(1)
self.report_following_redirect(new_url)
return {
'_type': 'url',
'url': new_url,
}
- if mobj is None:
+ if not found:
raise ExtractorError('Unsupported URL: %s' % url)
- # It's possible that one of the regexes
- # matched, but returned an empty group:
- if mobj.group(1) is None:
- raise ExtractorError('Did not find a valid video URL at %s' % url)
+ entries = []
+ for video_url in found:
+ video_url = compat_urlparse.urljoin(url, video_url)
+ video_id = compat_urllib_parse.unquote(os.path.basename(video_url))
- video_url = mobj.group(1)
- video_url = compat_urlparse.urljoin(url, video_url)
- video_id = compat_urllib_parse.unquote(os.path.basename(video_url))
+ # Sometimes, jwplayer extraction will result in a YouTube URL
+ if YoutubeIE.suitable(video_url):
+ entries.append(self.url_result(video_url, 'Youtube'))
+ continue
- # Sometimes, jwplayer extraction will result in a YouTube URL
- if YoutubeIE.suitable(video_url):
- return self.url_result(video_url, 'Youtube')
+ # here's a fun little line of code for you:
+ video_id = os.path.splitext(video_id)[0]
- # here's a fun little line of code for you:
- video_id = os.path.splitext(video_id)[0]
+ entries.append({
+ 'id': video_id,
+ 'url': video_url,
+ 'uploader': video_uploader,
+ 'title': video_title,
+ })
+
+ if len(entries) == 1:
+ return entries[1]
+ else:
+ for num, e in enumerate(entries, start=1):
+ e['title'] = '%s (%d)' % (e['title'], num)
+ return {
+ '_type': 'playlist',
+ 'entries': entries,
+ }
- return {
- 'id': video_id,
- 'url': video_url,
- 'uploader': video_uploader,
- 'title': video_title,
- }