diff options
author | Sergey M․ <dstftw@gmail.com> | 2014-10-09 19:26:23 +0700 |
---|---|---|
committer | Sergey M․ <dstftw@gmail.com> | 2014-10-09 19:26:23 +0700 |
commit | ced659bb4d066c782a009d370daff0fedb7b1006 (patch) | |
tree | a63b5552d4b4afe84fd7519a618725f6828a2441 /youtube_dl/extractor/generic.py | |
parent | 842cca7d56c155f515ba0919d709a412291ef52b (diff) |
[generic] Ignore some non-video file extensions during generic extraction (Closes #3900)
Diffstat (limited to 'youtube_dl/extractor/generic.py')
-rw-r--r-- | youtube_dl/extractor/generic.py | 34 |
1 files changed, 19 insertions, 15 deletions
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index c16da70f1..dfc2ef4e7 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -847,47 +847,51 @@ class GenericIE(InfoExtractor): if mobj is not None: return self.url_result(mobj.group('url'), 'MLB') + def check_video(vurl): + vpath = compat_urlparse.urlparse(vurl).path + vext = determine_ext(vpath) + return '.' in vpath and vext not in ('swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml') + + def filter_video(urls): + return list(filter(check_video, urls)) + # Start with something easy: JW Player in SWFObject - found = re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage) + found = filter_video(re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)) if not found: # Look for gorilla-vid style embedding - found = re.findall(r'''(?sx) + found = filter_video(re.findall(r'''(?sx) (?: jw_plugins| JWPlayerOptions| jwplayer\s*\(\s*["'][^'"]+["']\s*\)\s*\.setup ) - .*?file\s*:\s*["\'](.*?)["\']''', webpage) + .*?file\s*:\s*["\'](.*?)["\']''', webpage)) if not found: # Broaden the search a little bit - found = re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage) + found = filter_video(re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)) if not found: # Broaden the findall a little bit: JWPlayer JS loader - found = re.findall(r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage) + found = filter_video(re.findall( + r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage)) if not found: # Flow player - found = re.findall(r'''(?xs) + found = filter_video(re.findall(r'''(?xs) flowplayer\("[^"]+",\s* \{[^}]+?\}\s*, \s*{[^}]+? ["']?clip["']?\s*:\s*\{\s* ["']?url["']?\s*:\s*["']([^"']+)["'] - ''', webpage) + ''', webpage)) if not found: # Try to find twitter cards info - found = re.findall(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage) + found = filter_video(re.findall( + r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)) if not found: # We look for Open Graph info: # We have to match any number spaces between elements, some sites try to align them (eg.: statigr.am) m_video_type = re.findall(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage) # We only look in og:video if the MIME type is a video, don't try if it's a Flash player: if m_video_type is not None: - def check_video(vurl): - vpath = compat_urlparse.urlparse(vurl).path - vext = determine_ext(vpath) - return '.' in vpath and vext not in ('swf', 'png', 'jpg') - found = list(filter( - check_video, - re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage))) + found = filter_video(re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage)) if not found: # HTML5 video found = re.findall(r'(?s)<video[^<]*(?:>.*?<source[^>]+)? src="([^"]+)"', webpage) |