diff options
 -rwxr-xr-x  youtube_dl/InfoExtractors.py    | 140
 -rw-r--r--  youtube_dl/extractor/generic.py | 151
 2 files changed, 152 insertions(+), 139 deletions(-)
diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py
index 999521feb..fcc94db2c 100755
--- a/youtube_dl/InfoExtractors.py
+++ b/youtube_dl/InfoExtractors.py
@@ -27,6 +27,7 @@ from .extractor.ard import ARDIE
 from .extractor.arte import ArteTvIE
 from .extractor.dailymotion import DailymotionIE
 from .extractor.gametrailers import GametrailersIE
+from .extractor.generic import GenericIE
 from .extractor.metacafe import MetacafeIE
 from .extractor.statigram import StatigramIE
 from .extractor.photobucket import PhotobucketIE
@@ -45,145 +46,6 @@ from .extractor.zdf import ZDFIE
-class GenericIE(InfoExtractor):
-    """Generic last-resort information extractor."""
-
-    _VALID_URL = r'.*'
-    IE_NAME = u'generic'
-
-    def report_download_webpage(self, video_id):
-        """Report webpage download."""
-        if not self._downloader.params.get('test', False):
-            self._downloader.report_warning(u'Falling back on generic information extractor.')
-        super(GenericIE, self).report_download_webpage(video_id)
-
-    def report_following_redirect(self, new_url):
-        """Report information extraction."""
-        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
-
-    def _test_redirect(self, url):
-        """Check if it is a redirect, like url shorteners, in case return the new url."""
-        class HeadRequest(compat_urllib_request.Request):
-            def get_method(self):
-                return "HEAD"
-
-        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
-            """
-            Subclass the HTTPRedirectHandler to make it use our
-            HeadRequest also on the redirected URL
-            """
-            def redirect_request(self, req, fp, code, msg, headers, newurl):
-                if code in (301, 302, 303, 307):
-                    newurl = newurl.replace(' ', '%20')
-                    newheaders = dict((k,v) for k,v in req.headers.items()
-                                      if k.lower() not in ("content-length", "content-type"))
-                    return HeadRequest(newurl,
-                                       headers=newheaders,
-                                       origin_req_host=req.get_origin_req_host(),
-                                       unverifiable=True)
-                else:
-                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
-
-        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
-            """
-            Fallback to GET if HEAD is not allowed (405 HTTP error)
-            """
-            def http_error_405(self, req, fp, code, msg, headers):
-                fp.read()
-                fp.close()
-
-                newheaders = dict((k,v) for k,v in req.headers.items()
-                                  if k.lower() not in ("content-length", "content-type"))
-                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
-                                                 headers=newheaders,
-                                                 origin_req_host=req.get_origin_req_host(),
-                                                 unverifiable=True))
-
-        # Build our opener
-        opener = compat_urllib_request.OpenerDirector()
-        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
-                        HTTPMethodFallback, HEADRedirectHandler,
-                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
-            opener.add_handler(handler())
-
-        response = opener.open(HeadRequest(url))
-        if response is None:
-            raise ExtractorError(u'Invalid URL protocol')
-        new_url = response.geturl()
-
-        if url == new_url:
-            return False
-
-        self.report_following_redirect(new_url)
-        return new_url
-
-    def _real_extract(self, url):
-        new_url = self._test_redirect(url)
-        if new_url: return [self.url_result(new_url)]
-
-        video_id = url.split('/')[-1]
-        try:
-            webpage = self._download_webpage(url, video_id)
-        except ValueError as err:
-            # since this is the last-resort InfoExtractor, if
-            # this error is thrown, it'll be thrown here
-            raise ExtractorError(u'Invalid URL: %s' % url)
-
-        self.report_extraction(video_id)
-        # Start with something easy: JW Player in SWFObject
-        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
-        if mobj is None:
-            # Broaden the search a little bit
-            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
-        if mobj is None:
-            # Broaden the search a little bit: JWPlayer JS loader
-            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
-        if mobj is None:
-            # Try to find twitter cards info
-            mobj = re.search(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
-        if mobj is None:
-            # We look for Open Graph info:
-            # We have to match any number spaces between elements, some sites try to align them (eg.: statigr.am)
-            m_video_type = re.search(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
-            # We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
-            if m_video_type is not None:
-                mobj = re.search(r'<meta.*?property="og:video".*?content="(.*?)"', webpage)
-        if mobj is None:
-            raise ExtractorError(u'Invalid URL: %s' % url)
-
-        # It's possible that one of the regexes
-        # matched, but returned an empty group:
-        if mobj.group(1) is None:
-            raise ExtractorError(u'Invalid URL: %s' % url)
-
-        video_url = compat_urllib_parse.unquote(mobj.group(1))
-        video_id = os.path.basename(video_url)
-
-        # here's a fun little line of code for you:
-        video_extension = os.path.splitext(video_id)[1][1:]
-        video_id = os.path.splitext(video_id)[0]
-
-        # it's tempting to parse this further, but you would
-        # have to take into account all the variations like
-        #   Video Title - Site Name
-        #   Site Name | Video Title
-        #   Video Title - Tagline | Site Name
-        # and so on and so forth; it's just not practical
-        video_title = self._html_search_regex(r'<title>(.*)</title>',
-            webpage, u'video title')
-
-        # video uploader is domain name
-        video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*',
-            url, u'video uploader')
-
-        return [{
-            'id':       video_id,
-            'url':      video_url,
-            'uploader': video_uploader,
-            'upload_date':  None,
-            'title':    video_title,
-            'ext':      video_extension,
-        }]
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
new file mode 100644
index 000000000..7a877b3bc
--- /dev/null
+++ b/youtube_dl/extractor/generic.py
@@ -0,0 +1,151 @@
+import os
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    compat_urllib_error,
+    compat_urllib_parse,
+    compat_urllib_request,
+
+    ExtractorError,
+)
+
+class GenericIE(InfoExtractor):
+    """Generic last-resort information extractor."""
+
+    _VALID_URL = r'.*'
+    IE_NAME = u'generic'
+
+    def report_download_webpage(self, video_id):
+        """Report webpage download."""
+        if not self._downloader.params.get('test', False):
+            self._downloader.report_warning(u'Falling back on generic information extractor.')
+        super(GenericIE, self).report_download_webpage(video_id)
+
+    def report_following_redirect(self, new_url):
+        """Report information extraction."""
+        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
+
+    def _test_redirect(self, url):
+        """Check if it is a redirect, like url shorteners, in case return the new url."""
+        class HeadRequest(compat_urllib_request.Request):
+            def get_method(self):
+                return "HEAD"
+
+        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
+            """
+            Subclass the HTTPRedirectHandler to make it use our
+            HeadRequest also on the redirected URL
+            """
+            def redirect_request(self, req, fp, code, msg, headers, newurl):
+                if code in (301, 302, 303, 307):
+                    newurl = newurl.replace(' ', '%20')
+                    newheaders = dict((k,v) for k,v in req.headers.items()
+                                      if k.lower() not in ("content-length", "content-type"))
+                    return HeadRequest(newurl,
+                                       headers=newheaders,
+                                       origin_req_host=req.get_origin_req_host(),
+                                       unverifiable=True)
+                else:
+                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
+
+        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
+            """
+            Fallback to GET if HEAD is not allowed (405 HTTP error)
+            """
+            def http_error_405(self, req, fp, code, msg, headers):
+                fp.read()
+                fp.close()
+
+                newheaders = dict((k,v) for k,v in req.headers.items()
+                                  if k.lower() not in ("content-length", "content-type"))
+                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
+                                                 headers=newheaders,
+                                                 origin_req_host=req.get_origin_req_host(),
+                                                 unverifiable=True))
+
+        # Build our opener
+        opener = compat_urllib_request.OpenerDirector()
+        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
+                        HTTPMethodFallback, HEADRedirectHandler,
+                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
+            opener.add_handler(handler())
+
+        response = opener.open(HeadRequest(url))
+        if response is None:
+            raise ExtractorError(u'Invalid URL protocol')
+        new_url = response.geturl()
+
+        if url == new_url:
+            return False
+
+        self.report_following_redirect(new_url)
+        return new_url
+
+    def _real_extract(self, url):
+        new_url = self._test_redirect(url)
+        if new_url: return [self.url_result(new_url)]
+
+        video_id = url.split('/')[-1]
+        try:
+            webpage = self._download_webpage(url, video_id)
+        except ValueError:
+            # since this is the last-resort InfoExtractor, if
+            # this error is thrown, it'll be thrown here
+            raise ExtractorError(u'Invalid URL: %s' % url)
+
+        self.report_extraction(video_id)
+        # Start with something easy: JW Player in SWFObject
+        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
+        if mobj is None:
+            # Broaden the search a little bit
+            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
+        if mobj is None:
+            # Broaden the search a little bit: JWPlayer JS loader
+            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
+        if mobj is None:
+            # Try to find twitter cards info
+            mobj = re.search(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
+        if mobj is None:
+            # We look for Open Graph info:
+            # We have to match any number spaces between elements, some sites try to align them (eg.: statigr.am)
+            m_video_type = re.search(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
+            # We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
+            if m_video_type is not None:
+                mobj = re.search(r'<meta.*?property="og:video".*?content="(.*?)"', webpage)
+        if mobj is None:
+            raise ExtractorError(u'Invalid URL: %s' % url)
+
+        # It's possible that one of the regexes
+        # matched, but returned an empty group:
+        if mobj.group(1) is None:
+            raise ExtractorError(u'Invalid URL: %s' % url)
+
+        video_url = compat_urllib_parse.unquote(mobj.group(1))
+        video_id = os.path.basename(video_url)
+
+        # here's a fun little line of code for you:
+        video_extension = os.path.splitext(video_id)[1][1:]
+        video_id = os.path.splitext(video_id)[0]
+
+        # it's tempting to parse this further, but you would
+        # have to take into account all the variations like
+        #   Video Title - Site Name
+        #   Site Name | Video Title
+        #   Video Title - Tagline | Site Name
+        # and so on and so forth; it's just not practical
+        video_title = self._html_search_regex(r'<title>(.*)</title>',
+            webpage, u'video title')
+
+        # video uploader is domain name
+        video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*',
+            url, u'video uploader')
+
+        return [{
+            'id':       video_id,
+            'url':      video_url,
+            'uploader': video_uploader,
+            'upload_date':  None,
+            'title':    video_title,
+            'ext':      video_extension,
+        }]
