aboutsummaryrefslogtreecommitdiff
path: root/youtube_dl/extractor
diff options
context:
space:
mode:
Diffstat (limited to 'youtube_dl/extractor')
-rw-r--r--youtube_dl/extractor/common.py19
-rw-r--r--youtube_dl/extractor/heise.py16
2 files changed, 21 insertions, 14 deletions
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index 9e1d62c2b..b77f0e519 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -404,7 +404,7 @@ class InfoExtractor(object):
video_info['title'] = playlist_title
return video_info
- def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
+ def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
"""
Perform a regex search on the given string, using a single or a list of
patterns returning the first matching group.
@@ -425,8 +425,11 @@ class InfoExtractor(object):
_name = name
if mobj:
- # return the first matching group
- return next(g for g in mobj.groups() if g is not None)
+ if group is None:
+ # return the first matching group
+ return next(g for g in mobj.groups() if g is not None)
+ else:
+ return mobj.group(group)
elif default is not _NO_DEFAULT:
return default
elif fatal:
@@ -436,11 +439,11 @@ class InfoExtractor(object):
'please report this issue on http://yt-dl.org/bug' % _name)
return None
- def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
+ def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
"""
Like _search_regex, but strips HTML tags and unescapes entities.
"""
- res = self._search_regex(pattern, string, name, default, fatal, flags)
+ res = self._search_regex(pattern, string, name, default, fatal, flags, group)
if res:
return clean_html(res).strip()
else:
@@ -534,9 +537,9 @@ class InfoExtractor(object):
display_name = name
return self._html_search_regex(
r'''(?ix)<meta
- (?=[^>]+(?:itemprop|name|property)=["\']?%s["\']?)
- [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
- html, display_name, fatal=fatal, **kwargs)
+ (?=[^>]+(?:itemprop|name|property)=(["\']?)%s\1)
+ [^>]+content=(["\'])(?P<content>.*?)\1''' % re.escape(name),
+ html, display_name, fatal=fatal, group='content', **kwargs)
def _dc_search_uploader(self, html):
return self._html_search_meta('dc.creator', html, 'uploader')
diff --git a/youtube_dl/extractor/heise.py b/youtube_dl/extractor/heise.py
index d41c0413f..278d9f527 100644
--- a/youtube_dl/extractor/heise.py
+++ b/youtube_dl/extractor/heise.py
@@ -3,7 +3,7 @@ from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
- get_meta_content,
+ determine_ext,
int_or_none,
parse_iso8601,
)
@@ -25,11 +25,11 @@ class HeiseIE(InfoExtractor):
'title': (
"Podcast: c't uplink 3.3 – Owncloud / Tastaturen / Peilsender Smartphone"
),
- 'format_id': 'mp4_720',
+ 'format_id': 'mp4_720p',
'timestamp': 1411812600,
'upload_date': '20140927',
'description': 'In uplink-Episode 3.3 geht es darum, wie man sich von Cloud-Anbietern emanzipieren kann, worauf man beim Kauf einer Tastatur achten sollte und was Smartphones über uns verraten.',
- 'thumbnail': 're:https?://.*\.jpg$',
+ 'thumbnail': 're:^https?://.*\.jpe?g$',
}
}
@@ -49,11 +49,12 @@ class HeiseIE(InfoExtractor):
info = {
'id': video_id,
'thumbnail': self._og_search_thumbnail(webpage),
- 'timestamp': parse_iso8601(get_meta_content('date', webpage)),
+ 'timestamp': parse_iso8601(
+ self._html_search_meta('date', webpage)),
'description': self._og_search_description(webpage),
}
- title = get_meta_content('fulltitle', webpage)
+ title = self._html_search_meta('fulltitle', webpage)
if title:
info['title'] = title
else:
@@ -64,9 +65,12 @@ class HeiseIE(InfoExtractor):
label = source_node.attrib['label']
height = int_or_none(self._search_regex(
r'^(.*?_)?([0-9]+)p$', label, 'height', default=None))
+ video_url = source_node.attrib['file']
+ ext = determine_ext(video_url, '')
formats.append({
- 'url': source_node.attrib['file'],
+ 'url': video_url,
'format_note': label,
+ 'format_id': '%s_%s' % (ext, label),
'height': height,
})
self._sort_formats(formats)