diff options
| -rw-r--r-- | test/helper.py | 4 | ||||
| -rw-r--r-- | test/test_download.py | 4 | ||||
| -rw-r--r-- | youtube_dl/extractor/common.py | 202 | ||||
| -rw-r--r-- | youtube_dl/extractor/generic.py | 72 | ||||
| -rw-r--r-- | youtube_dl/extractor/videolecturesnet.py | 2 | 
5 files changed, 228 insertions, 56 deletions
diff --git a/test/helper.py b/test/helper.py index e1129e58f..c8b34654d 100644 --- a/test/helper.py +++ b/test/helper.py @@ -133,8 +133,8 @@ def expect_info_dict(self, got_dict, expected_dict):              elif isinstance(expected, compat_str) and expected.startswith('mincount:'):                  got = got_dict.get(info_field)                  self.assertTrue( -                    isinstance(got, list), -                    'Expected field %s to be a list, but it is of type %s' % ( +                    isinstance(got, (list, dict)), +                    'Expected field %s to be a list or a dict, but it is of type %s' % (                          info_field, type(got).__name__))                  expected_num = int(expected.partition(':')[2])                  assertGreaterEqual( diff --git a/test/test_download.py b/test/test_download.py index 1110357a7..284418834 100644 --- a/test/test_download.py +++ b/test/test_download.py @@ -136,7 +136,9 @@ def generator(test_case):                      # We're not using .download here sine that is just a shim                      # for outside error handling, and returns the exit code                      # instead of the result dict. -                    res_dict = ydl.extract_info(test_case['url']) +                    res_dict = ydl.extract_info( +                        test_case['url'], +                        force_generic_extractor=params.get('force_generic_extractor', False))                  except (DownloadError, ExtractorError) as err:                      # Check if the exception is not a network related one                      if not err.exc_info[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError, compat_http_client.BadStatusLine) or (err.exc_info[0] == compat_HTTPError and err.exc_info[1].code == 503): diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 507ea5ec0..def6caa0d 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -18,6 +18,7 @@ from ..compat import (      compat_HTTPError,      compat_http_client,      compat_urllib_error, +    compat_urllib_parse,      compat_urllib_parse_urlparse,      compat_urllib_request,      compat_urlparse, @@ -37,6 +38,7 @@ from ..utils import (      RegexNotFoundError,      sanitize_filename,      unescapeHTML, +    url_basename,  ) @@ -978,69 +980,167 @@ class InfoExtractor(object):          self._sort_formats(formats)          return formats -    # TODO: improve extraction -    def _extract_smil_formats(self, smil_url, video_id, fatal=True): -        smil = self._download_xml( -            smil_url, video_id, 'Downloading SMIL file', -            'Unable to download SMIL file', fatal=fatal) +    @staticmethod +    def _xpath_ns(path, namespace=None): +        if not namespace: +            return path +        out = [] +        for c in path.split('/'): +            if not c or c == '.': +                out.append(c) +            else: +                out.append('{%s}%s' % (namespace, c)) +        return '/'.join(out) + +    def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None): +        smil = self._download_smil(smil_url, video_id, fatal=fatal) +          if smil is False:              assert not fatal              return [] -        base = smil.find('./head/meta').get('base') +        namespace = self._parse_smil_namespace(smil) + +        return self._parse_smil_formats( +            smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params) + +    def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None): +        smil = self._download_smil(smil_url, video_id, fatal=fatal) +        if smil is False: +            return {} +        return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params) + +    def _download_smil(self, smil_url, video_id, fatal=True): +        return self._download_xml( +            smil_url, video_id, 'Downloading SMIL file', +            'Unable to download SMIL file', fatal=fatal) + +    def _parse_smil(self, smil, smil_url, video_id, f4m_params=None): +        namespace = self._parse_smil_namespace(smil) + +        formats = self._parse_smil_formats( +            smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params) +        subtitles = self._parse_smil_subtitles(smil, namespace=namespace) + +        video_id = os.path.splitext(url_basename(smil_url))[0] +        title = None +        description = None +        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)): +            name = meta.attrib.get('name') +            content = meta.attrib.get('content') +            if not name or not content: +                continue +            if not title and name == 'title': +                title = content +            elif not description and name in ('description', 'abstract'): +                description = content + +        return { +            'id': video_id, +            'title': title or video_id, +            'description': description, +            'formats': formats, +            'subtitles': subtitles, +        } + +    def _parse_smil_namespace(self, smil): +        return self._search_regex( +            r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None) + +    def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None): +        base = smil_url +        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)): +            b = meta.get('base') or meta.get('httpBase') +            if b: +                base = b +                break          formats = []          rtmp_count = 0 -        if smil.findall('./body/seq/video'): -            video = smil.findall('./body/seq/video')[0] -            fmts, rtmp_count = self._parse_smil_video(video, video_id, base, rtmp_count) -            formats.extend(fmts) -        else: -            for video in smil.findall('./body/switch/video'): -                fmts, rtmp_count = self._parse_smil_video(video, video_id, base, rtmp_count) -                formats.extend(fmts) +        http_count = 0 + +        videos = smil.findall(self._xpath_ns('.//video', namespace)) +        for video in videos: +            src = video.get('src') +            if not src: +                continue + +            bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000) +            filesize = int_or_none(video.get('size') or video.get('fileSize')) +            width = int_or_none(video.get('width')) +            height = int_or_none(video.get('height')) +            proto = video.get('proto') +            ext = video.get('ext') +            src_ext = determine_ext(src) +            streamer = video.get('streamer') or base + +            if proto == 'rtmp' or streamer.startswith('rtmp'): +                rtmp_count += 1 +                formats.append({ +                    'url': streamer, +                    'play_path': src, +                    'ext': 'flv', +                    'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate), +                    'tbr': bitrate, +                    'filesize': filesize, +                    'width': width, +                    'height': height, +                }) +                continue + +            src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src) + +            if proto == 'm3u8' or src_ext == 'm3u8': +                formats.extend(self._extract_m3u8_formats( +                    src_url, video_id, ext or 'mp4', m3u8_id='hls')) +                continue + +            if src_ext == 'f4m': +                f4m_url = src_url +                if not f4m_params: +                    f4m_params = { +                        'hdcore': '3.2.0', +                        'plugin': 'flowplayer-3.2.0.1', +                    } +                f4m_url += '&' if '?' in f4m_url else '?' +                f4m_url += compat_urllib_parse.urlencode(f4m_params) +                formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds')) +                continue + +            if src_url.startswith('http'): +                http_count += 1 +                formats.append({ +                    'url': src_url, +                    'ext': ext or src_ext or 'flv', +                    'format_id': 'http-%d' % (bitrate or http_count), +                    'tbr': bitrate, +                    'filesize': filesize, +                    'width': width, +                    'height': height, +                }) +                continue          self._sort_formats(formats)          return formats -    def _parse_smil_video(self, video, video_id, base, rtmp_count): -        src = video.get('src') -        if not src: -            return [], rtmp_count -        bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000) -        width = int_or_none(video.get('width')) -        height = int_or_none(video.get('height')) -        proto = video.get('proto') -        if not proto: -            if base: -                if base.startswith('rtmp'): -                    proto = 'rtmp' -                elif base.startswith('http'): -                    proto = 'http' -        ext = video.get('ext') -        if proto == 'm3u8': -            return self._extract_m3u8_formats(src, video_id, ext), rtmp_count -        elif proto == 'rtmp': -            rtmp_count += 1 -            streamer = video.get('streamer') or base -            return ([{ -                'url': streamer, -                'play_path': src, -                'ext': 'flv', -                'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate), -                'tbr': bitrate, -                'width': width, -                'height': height, -            }], rtmp_count) -        elif proto.startswith('http'): -            return ([{ -                'url': base + src, -                'ext': ext or 'flv', -                'tbr': bitrate, -                'width': width, -                'height': height, -            }], rtmp_count) +    def _parse_smil_subtitles(self, smil, namespace=None): +        subtitles = {} +        for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))): +            src = textstream.get('src') +            if not src: +                continue +            ext = textstream.get('ext') or determine_ext(src) +            if not ext: +                type_ = textstream.get('type') +                if type_ == 'text/srt': +                    ext = 'srt' +            lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') +            subtitles.setdefault(lang, []).append({ +                'url': src, +                'ext': ext, +            }) +        return subtitles      def _live_title(self, name):          """ Generate the title for a live video """ diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 469909a51..901f77304 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -130,6 +130,74 @@ class GenericIE(InfoExtractor):                  'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624',              }          }, +        # SMIL from http://videolectures.net/promogram_igor_mekjavic_eng +        { +            'url': 'http://videolectures.net/promogram_igor_mekjavic_eng/video/1/smil.xml', +            'info_dict': { +                'id': 'smil', +                'ext': 'mp4', +                'title': 'Automatics, robotics and biocybernetics', +                'description': 'md5:815fc1deb6b3a2bff99de2d5325be482', +                'formats': 'mincount:16', +                'subtitles': 'mincount:1', +            }, +            'params': { +                'force_generic_extractor': True, +                'skip_download': True, +            }, +        }, +        # SMIL from http://www1.wdr.de/mediathek/video/livestream/index.html +        { +            'url': 'http://metafilegenerator.de/WDR/WDR_FS/hds/hds.smil', +            'info_dict': { +                'id': 'hds', +                'ext': 'flv', +                'title': 'hds', +                'formats': 'mincount:1', +            }, +            'params': { +                'skip_download': True, +            }, +        }, +        # SMIL from https://www.restudy.dk/video/play/id/1637 +        { +            'url': 'https://www.restudy.dk/awsmedia/SmilDirectory/video_1637.xml', +            'info_dict': { +                'id': 'video_1637', +                'ext': 'flv', +                'title': 'video_1637', +                'formats': 'mincount:3', +            }, +            'params': { +                'skip_download': True, +            }, +        }, +        # SMIL from http://adventure.howstuffworks.com/5266-cool-jobs-iditarod-musher-video.htm +        { +            'url': 'http://services.media.howstuffworks.com/videos/450221/smil-service.smil', +            'info_dict': { +                'id': 'smil-service', +                'ext': 'flv', +                'title': 'smil-service', +                'formats': 'mincount:1', +            }, +            'params': { +                'skip_download': True, +            }, +        }, +        # SMIL from http://new.livestream.com/CoheedandCambria/WebsterHall/videos/4719370 +        { +            'url': 'http://api.new.livestream.com/accounts/1570303/events/1585861/videos/4719370.smil', +            'info_dict': { +                'id': '4719370', +                'ext': 'mp4', +                'title': '571de1fd-47bc-48db-abf9-238872a58d1f', +                'formats': 'mincount:3', +            }, +            'params': { +                'skip_download': True, +            }, +        },          # google redirect          {              'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE', @@ -1123,11 +1191,13 @@ class GenericIE(InfoExtractor):          self.report_extraction(video_id) -        # Is it an RSS feed? +        # Is it an RSS feed or a SMIL file?          try:              doc = parse_xml(webpage)              if doc.tag == 'rss':                  return self._extract_rss(url, video_id, doc) +            elif re.match(r'^(?:{[^}]+})?smil$', doc.tag): +                return self._parse_smil(doc, url, video_id)          except compat_xml_parse_error:              pass diff --git a/youtube_dl/extractor/videolecturesnet.py b/youtube_dl/extractor/videolecturesnet.py index d6a7eb203..24584dc80 100644 --- a/youtube_dl/extractor/videolecturesnet.py +++ b/youtube_dl/extractor/videolecturesnet.py @@ -12,7 +12,7 @@ from ..utils import (  class VideoLecturesNetIE(InfoExtractor): -    _VALID_URL = r'http://(?:www\.)?videolectures\.net/(?P<id>[^/#?]+)/' +    _VALID_URL = r'http://(?:www\.)?videolectures\.net/(?P<id>[^/#?]+)(?:/?[#?].*)?$'      IE_NAME = 'videolectures.net'      _TEST = {  | 
