diff options
-rw-r--r-- | youtube_dl/extractor/tagesschau.py | 75 |
1 files changed, 49 insertions, 26 deletions
diff --git a/youtube_dl/extractor/tagesschau.py b/youtube_dl/extractor/tagesschau.py index e58385c57..ccc2d476d 100644 --- a/youtube_dl/extractor/tagesschau.py +++ b/youtube_dl/extractor/tagesschau.py @@ -4,7 +4,10 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import parse_filesize +from ..utils import ( + determine_ext, + parse_filesize, +) class TagesschauIE(InfoExtractor): @@ -82,37 +85,54 @@ class TagesschauIE(InfoExtractor): 'xxl': {'quality': 5}, } - def _extract_formats(self, download_text): + def _extract_formats(self, download_text, media_kind): links = re.finditer( r'<div class="button" title="(?P<title>[^"]*)"><a href="(?P<url>[^"]+)">(?P<name>.+?)</a></div>', download_text) formats = [] for l in links: + link_url = l.group('url') + if not link_url: + continue format_id = self._search_regex( - r'.*/[^/.]+\.([^/]+)\.[^/.]+', l.group('url'), 'format ID') + r'.*/[^/.]+\.([^/]+)\.[^/.]+$', link_url, 'format ID', + default=determine_ext(link_url)) format = { 'format_id': format_id, 'url': l.group('url'), 'format_name': l.group('name'), } - m = re.match( - r'''(?x) - Video:\s*(?P<vcodec>[a-zA-Z0-9/._-]+)\s*&\#10; - (?P<width>[0-9]+)x(?P<height>[0-9]+)px&\#10; - (?P<vbr>[0-9]+)kbps&\#10; - Audio:\s*(?P<abr>[0-9]+)kbps,\s*(?P<audio_desc>[A-Za-z\.0-9]+)&\#10; - Größe:\s*(?P<filesize_approx>[0-9.,]+\s+[a-zA-Z]*B)''', - l.group('title')) - if m: - format.update({ - 'format_note': m.group('audio_desc'), - 'vcodec': m.group('vcodec'), - 'width': int(m.group('width')), - 'height': int(m.group('height')), - 'abr': int(m.group('abr')), - 'vbr': int(m.group('vbr')), - 'filesize_approx': parse_filesize(m.group('filesize_approx')), - }) + title = l.group('title') + if title: + if media_kind.lower() == 'video': + m = re.match( + r'''(?x) + Video:\s*(?P<vcodec>[a-zA-Z0-9/._-]+)\s*&\#10; + (?P<width>[0-9]+)x(?P<height>[0-9]+)px&\#10; + (?P<vbr>[0-9]+)kbps&\#10; + 
Audio:\s*(?P<abr>[0-9]+)kbps,\s*(?P<audio_desc>[A-Za-z\.0-9]+)&\#10; + Größe:\s*(?P<filesize_approx>[0-9.,]+\s+[a-zA-Z]*B)''', + title) + if m: + format.update({ + 'format_note': m.group('audio_desc'), + 'vcodec': m.group('vcodec'), + 'width': int(m.group('width')), + 'height': int(m.group('height')), + 'abr': int(m.group('abr')), + 'vbr': int(m.group('vbr')), + 'filesize_approx': parse_filesize(m.group('filesize_approx')), + }) + else: + m = re.match( + r'(?P<format>.+?)-Format\s*:\s*(?P<abr>\d+)kbps\s*,\s*(?P<note>.+)', + title) + if m: + format.update({ + 'format_note': '%s, %s' % (m.group('format'), m.group('note')), + 'vcodec': 'none', + 'abr': int(m.group('abr')), + }) formats.append(format) self._sort_formats(formats) return formats @@ -154,23 +174,26 @@ class TagesschauIE(InfoExtractor): title = self._html_search_regex( r'<span class="headline".*?>(.*?)</span>', webpage, 'title') - DOWNLOAD_REGEX = r'(?s)<p>Wir bieten dieses Video in folgenden Formaten zum Download an:</p>\s*<div class="controls">(.*?)</div>\s*<p>' + DOWNLOAD_REGEX = r'(?s)<p>Wir bieten dieses (?P<kind>Video|Audio) in folgenden Formaten zum Download an:</p>\s*<div class="controls">(?P<links>.*?)</div>\s*<p>' webpage_type = self._og_search_property('type', webpage, default=None) if webpage_type == 'website': # Article entries = [] - for num, (entry_title, download_text) in enumerate(re.findall( + for num, (entry_title, media_kind, download_text) in enumerate(re.findall( r'(?s)<p[^>]+class="infotext"[^>]*>.*?<strong>(.+?)</strong>.*?</p>.*?%s' % DOWNLOAD_REGEX, webpage)): entries.append({ 'id': display_id, 'title': '%s-%d' % (entry_title, num), - 'formats': self._extract_formats(download_text), + 'formats': self._extract_formats(download_text, media_kind), }) return self.playlist_result(entries, display_id, title) else: # Assume single video - download_text = self._search_regex(DOWNLOAD_REGEX, webpage, 'download links') - formats = self._extract_formats(download_text) + download_text = 
self._search_regex( + DOWNLOAD_REGEX, webpage, 'download links', group='links') + media_kind = self._search_regex( + DOWNLOAD_REGEX, webpage, 'media kind', default='Video', group='kind') + formats = self._extract_formats(download_text, media_kind) thumbnail = self._og_search_thumbnail(webpage) description = self._html_search_regex( r'(?s)<p class="teasertext">(.*?)</p>', |