diff options
| author | Philipp Hagemeister <phihag@phihag.de> | 2014-04-30 02:23:51 +0200 | 
|---|---|---|
| committer | Philipp Hagemeister <phihag@phihag.de> | 2014-04-30 02:23:51 +0200 | 
| commit | b30b8698ea11e85079cc9e392cdf26f4e61671c4 (patch) | |
| tree | d1495a6e7e32ff083831ae389818a0a0f3971932 | |
| parent | f1f25be6dbed3a2eb73819c55a5b49d8e001dfec (diff) | |
[generic] Allow multiple matches for generic hits (Fixes #2818)
| -rw-r--r-- | youtube_dl/extractor/generic.py | 85 | 
1 files changed, 46 insertions, 39 deletions
```diff
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index cfb009d79..58092da38 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -637,70 +637,77 @@ class GenericIE(InfoExtractor):
             return self.url_result(smotri_url, 'Smotri')
 
         # Start with something easy: JW Player in SWFObject
-        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
-        if mobj is None:
+        found = re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
+        if not found:
             # Look for gorilla-vid style embedding
-            mobj = re.search(r'''(?sx)
+            found = re.findall(r'''(?sx)
                 (?:
                     jw_plugins|
                     JWPlayerOptions|
                     jwplayer\s*\(\s*["'][^'"]+["']\s*\)\s*\.setup
                 )
                 .*?file\s*:\s*["\'](.*?)["\']''', webpage)
-        if mobj is None:
+        if not found:
             # Broaden the search a little bit
-            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
-        if mobj is None:
-            # Broaden the search a little bit: JWPlayer JS loader
-            mobj = re.search(r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage)
-
-        if mobj is None:
+            found = re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
+        if not found:
+            # Broaden the findall a little bit: JWPlayer JS loader
+            found = re.findall(r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage)
+        if not found:
             # Try to find twitter cards info
-            mobj = re.search(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
-        if mobj is None:
+            found = re.findall(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
+        if not found:
             # We look for Open Graph info:
             # We have to match any number spaces between elements, some sites try to align them (eg.: statigr.am)
-            m_video_type = re.search(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
+            m_video_type = re.findall(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
             # We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
             if m_video_type is not None:
-                mobj = re.search(r'<meta.*?property="og:video".*?content="(.*?)"', webpage)
-        if mobj is None:
+                found = re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage)
+        if not found:
             # HTML5 video
-            mobj = re.search(r'<video[^<]*(?:>.*?<source.*?)? src="([^"]+)"', webpage, flags=re.DOTALL)
-        if mobj is None:
-            mobj = re.search(
+            found = re.findall(r'(?s)<video[^<]*(?:>.*?<source.*?)? src="([^"]+)"', webpage)
+        if not found:
+            found = re.findall(
                 r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")'
                 r'(?:[a-z-]+="[^"]+"\s+)*?content="[0-9]{,2};url=\'([^\']+)\'"',
                 webpage)
-            if mobj:
-                new_url = mobj.group(1)
+            if found:
+                new_url = found.group(1)
                 self.report_following_redirect(new_url)
                 return {
                     '_type': 'url',
                     'url': new_url,
                 }
-        if mobj is None:
+        if not found:
             raise ExtractorError('Unsupported URL: %s' % url)
 
-        # It's possible that one of the regexes
-        # matched, but returned an empty group:
-        if mobj.group(1) is None:
-            raise ExtractorError('Did not find a valid video URL at %s' % url)
+        entries = []
+        for video_url in found:
+            video_url = compat_urlparse.urljoin(url, video_url)
+            video_id = compat_urllib_parse.unquote(os.path.basename(video_url))
 
-        video_url = mobj.group(1)
-        video_url = compat_urlparse.urljoin(url, video_url)
-        video_id = compat_urllib_parse.unquote(os.path.basename(video_url))
+            # Sometimes, jwplayer extraction will result in a YouTube URL
+            if YoutubeIE.suitable(video_url):
+                entries.append(self.url_result(video_url, 'Youtube'))
+                continue
 
-        # Sometimes, jwplayer extraction will result in a YouTube URL
-        if YoutubeIE.suitable(video_url):
-            return self.url_result(video_url, 'Youtube')
+            # here's a fun little line of code for you:
+            video_id = os.path.splitext(video_id)[0]
 
-        # here's a fun little line of code for you:
-        video_id = os.path.splitext(video_id)[0]
+            entries.append({
+                'id': video_id,
+                'url': video_url,
+                'uploader': video_uploader,
+                'title': video_title,
+            })
+
+        if len(entries) == 1:
+            return entries[1]
+        else:
+            for num, e in enumerate(entries, start=1):
+                e['title'] = '%s (%d)' % (e['title'], num)
+            return {
+                '_type': 'playlist',
+                'entries': entries,
+            }
 
-        return {
-            'id': video_id,
-            'url': video_url,
-            'uploader': video_uploader,
-            'title': video_title,
-        }
```
