diff options
| author | Philipp Hagemeister <phihag@phihag.de> | 2014-11-13 15:02:31 +0100 | 
|---|---|---|
| committer | Philipp Hagemeister <phihag@phihag.de> | 2014-11-13 15:02:31 +0100 | 
| commit | e4bdb37ec6c463df236cf7178046b8653c70b78e (patch) | |
| tree | 4517b09e2b910d8246a5626c10d67aa2479c9221 | |
| parent | 3e6e4999ca09352fa513c20bfb5ba7b37811819b (diff) | |
[spiegel] Add support for embeds
| -rw-r--r-- | test/test_utils.py | 5 | ||||
| -rw-r--r-- | youtube_dl/extractor/spiegel.py | 55 | 
2 files changed, 45 insertions, 15 deletions
diff --git a/test/test_utils.py b/test/test_utils.py index e59547784..0c11d0438 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -16,6 +16,7 @@ import json  import xml.etree.ElementTree  from youtube_dl.utils import ( +    clean_html,      DateRange,      encodeFilename,      find_xpath_attr, @@ -345,5 +346,9 @@ class TestUtil(unittest.TestCase):          on = js_to_json('{"abc": true}')          self.assertEqual(json.loads(on), {'abc': True}) +    def test_clean_html(self): +        self.assertEqual(clean_html('a:\nb'), 'a: b') +        self.assertEqual(clean_html('a:\n   "b"'), 'a:    "b"') +  if __name__ == '__main__':      unittest.main() diff --git a/youtube_dl/extractor/spiegel.py b/youtube_dl/extractor/spiegel.py index 4b4da43d1..9586a7da2 100644 --- a/youtube_dl/extractor/spiegel.py +++ b/youtube_dl/extractor/spiegel.py @@ -4,11 +4,11 @@ from __future__ import unicode_literals  import re  from .common import InfoExtractor -from ..utils import compat_urlparse +from ..compat import compat_urlparse  class SpiegelIE(InfoExtractor): -    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<id>[0-9]+)(?:\.html)?(?:#.*)?$' +    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<id>[0-9]+)(?:-embed)?(?:\.html)?(?:#.*)?$'      _TESTS = [{          'url': 'http://www.spiegel.de/video/vulkan-tungurahua-in-ecuador-ist-wieder-aktiv-video-1259285.html',          'md5': '2c2754212136f35fb4b19767d242f66e', @@ -29,14 +29,24 @@ class SpiegelIE(InfoExtractor):              'description': 'md5:c2322b65e58f385a820c10fa03b2d088',              'duration': 983,          }, +    }, { +        'url': 'http://www.spiegel.de/video/astronaut-alexander-gerst-von-der-iss-station-beantwortet-fragen-video-1519126-embed.html', +        'md5': 'd8eeca6bfc8f1cd6f490eb1f44695d51', +        'info_dict': { +            'id': '1519126', +            'ext': 'mp4', +            'description': 'SPIEGEL ONLINE-Nutzer durften den deutschen Astronauten Alexander Gerst über sein Leben auf der ISS-Station befragen. Hier kommen seine Antworten auf die besten sechs Fragen.', +            'title': 'Fragen an Astronaut Alexander Gerst: "Bekommen Sie die Tageszeiten mit?"', +        }      }]      def _real_extract(self, url):          video_id = self._match_id(url)          webpage = self._download_webpage(url, video_id) -        title = self._html_search_regex( -            r'<div class="module-title">(.*?)</div>', webpage, 'title') +        title = re.sub(r'\s+', ' ', self._html_search_regex( +            r'(?s)<(?:h1|div) class="module-title"[^>]*>(.*?)</(?:h1|div)>', +            webpage, 'title'))          description = self._html_search_meta('description', webpage, 'description')          base_url = self._search_regex( @@ -77,7 +87,7 @@ class SpiegelArticleIE(InfoExtractor):      _VALID_URL = 'https?://www\.spiegel\.de/(?!video/)[^?#]*?-(?P<id>[0-9]+)\.html'      IE_NAME = 'Spiegel:Article'      IE_DESC = 'Articles on spiegel.de' -    _TEST = { +    _TESTS = [{          'url': 'http://www.spiegel.de/sport/sonst/badminton-wm-die-randsportart-soll-populaerer-werden-a-987092.html',          'info_dict': {              'id': '1516455', @@ -85,19 +95,34 @@ class SpiegelArticleIE(InfoExtractor):              'title': 'Faszination Badminton: Nennt es bloß nicht Federball',              'description': 're:^Patrick Kämnitz gehört.{100,}',          }, -    } +    }, { +        'url': 'http://www.spiegel.de/wissenschaft/weltall/astronaut-alexander-gerst-antwortet-spiegel-online-lesern-a-989876.html', +        'info_dict': { + +        }, +        'playlist_count': 6, +    }]      def _real_extract(self, url):          video_id = self._match_id(url) -          webpage = self._download_webpage(url, video_id) + +        # Single video on top of the page          video_link = self._search_regex(              r'<a href="([^"]+)" onclick="return spOpenVideo\(this,', webpage, -            'video page URL') -        video_url = compat_urlparse.urljoin( -            self.http_scheme() + '//spiegel.de/', video_link) - -        return { -            '_type': 'url', -            'url': video_url, -        } +            'video page URL', default=None) +        if video_link: +            video_url = compat_urlparse.urljoin( +                self.http_scheme() + '//spiegel.de/', video_link) +            return self.url_result(video_url) + +        # Multiple embedded videos +        embeds = re.findall( +            r'<div class="vid_holder[0-9]+.*?</div>\s*.*?url\s*=\s*"([^"]+)"', +            webpage) +        entries = [ +            self.url_result(compat_urlparse.urljoin( +                self.http_scheme() + '//spiegel.de/', embed_path)) +            for embed_path in embeds +        ] +        return self.playlist_result(entries)  | 
