diff options
Diffstat (limited to 'youtube_dl/extractor')
-rw-r--r-- | youtube_dl/extractor/bbc.py | 19 | ||||
-rw-r--r-- | youtube_dl/extractor/cbsnews.py | 47 | ||||
-rw-r--r-- | youtube_dl/extractor/common.py | 32 | ||||
-rw-r--r-- | youtube_dl/extractor/ndr.py | 2 | ||||
-rw-r--r-- | youtube_dl/extractor/odnoklassniki.py | 8 | ||||
-rw-r--r-- | youtube_dl/extractor/spankbang.py | 4 |
6 files changed, 72 insertions, 40 deletions
diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 1c493b72d..6ddee686c 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -194,6 +194,19 @@ class BBCCoUkIE(InfoExtractor): 'skip_download': True, }, }, { + # compact player (https://github.com/rg3/youtube-dl/issues/8147) + 'url': 'http://www.bbc.co.uk/programmes/p028bfkf/player', + 'info_dict': { + 'id': 'p028bfkj', + 'ext': 'flv', + 'title': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews', + 'description': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews', + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + }, { 'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4', 'only_matching': True, }, { @@ -482,9 +495,11 @@ class BBCCoUkIE(InfoExtractor): if programme_id: formats, subtitles = self._download_media_selector(programme_id) title = self._og_search_title(webpage, default=None) or self._html_search_regex( - r'<h2[^>]+id="parent-title"[^>]*>(.+?)</h2>', webpage, 'title') + (r'<h2[^>]+id="parent-title"[^>]*>(.+?)</h2>', + r'<div[^>]+class="info"[^>]*>\s*<h1>(.+?)</h1>'), webpage, 'title') description = self._search_regex( - r'<p class="[^"]*medium-description[^"]*">([^<]+)</p>', + (r'<p class="[^"]*medium-description[^"]*">([^<]+)</p>', + r'<div[^>]+class="info_+synopsis"[^>]*>([^<]+)</div>'), webpage, 'description', default=None) if not description: description = self._html_search_meta('description', webpage) diff --git a/youtube_dl/extractor/cbsnews.py b/youtube_dl/extractor/cbsnews.py index 480435e26..cabf7e73b 100644 --- a/youtube_dl/extractor/cbsnews.py +++ b/youtube_dl/extractor/cbsnews.py @@ -4,11 +4,10 @@ from __future__ import unicode_literals import re import json -from .common import InfoExtractor -from ..utils import remove_start +from .theplatform import ThePlatformIE -class CBSNewsIE(InfoExtractor): +class CBSNewsIE(ThePlatformIE): IE_DESC = 'CBS News' _VALID_URL = r'http://(?:www\.)?cbsnews\.com/(?:[^/]+/)+(?P<id>[\da-z_-]+)' @@ -31,7 +30,7 @@ class CBSNewsIE(InfoExtractor): 'url': 'http://www.cbsnews.com/videos/fort-hood-shooting-army-downplays-mental-illness-as-cause-of-attack/', 'info_dict': { 'id': 'fort-hood-shooting-army-downplays-mental-illness-as-cause-of-attack', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Fort Hood shooting: Army downplays mental illness as cause of attack', 'thumbnail': 're:^https?://.*\.jpg$', 'duration': 205, @@ -42,7 +41,7 @@ class CBSNewsIE(InfoExtractor): }, }, 'params': { - # rtmp download + # m3u8 download 'skip_download': True, }, }, @@ -63,33 +62,6 @@ class CBSNewsIE(InfoExtractor): duration = item.get('duration') thumbnail = item.get('mediaImage') or item.get('thumbnail') - formats = [] - for format_id in ['RtmpMobileLow', 'RtmpMobileHigh', 'Hls', 'RtmpDesktop']: - uri = item.get('media' + format_id + 'URI') - if not uri: - continue - uri = remove_start(uri, '{manifest:none}') - fmt = { - 'url': uri, - 'format_id': format_id, - } - if uri.startswith('rtmp'): - play_path = re.sub( - r'{slistFilePath}', '', - uri.split('<break>')[-1].split('{break}')[-1]) - play_path = re.sub( - r'{manifest:.+}.*$', '', play_path) - fmt.update({ - 'app': 'ondemand?auth=cbs', - 'play_path': 'mp4:' + play_path, - 'player_url': 'http://www.cbsnews.com/[[IMPORT]]/vidtech.cbsinteractive.com/player/3_3_0/CBSI_PLAYER_HD.swf', - 'page_url': 'http://www.cbsnews.com', - 'ext': 'flv', - }) - elif uri.endswith('.m3u8'): - fmt['ext'] = 'mp4' - formats.append(fmt) - subtitles = {} if 'mpxRefId' in video_info: subtitles['en'] = [{ @@ -97,6 +69,17 @@ class CBSNewsIE(InfoExtractor): 'url': 'http://www.cbsnews.com/videos/captions/%s.adb_xml' % video_info['mpxRefId'], }] + formats = [] + for format_id in ['RtmpMobileLow', 'RtmpMobileHigh', 'Hls', 'RtmpDesktop']: + pid = item.get('media' + format_id) + if not pid: + continue + release_url = 'http://link.theplatform.com/s/dJ5BDC/%s?format=SMIL&mbr=true' % pid + tp_formats, tp_subtitles = self._extract_theplatform_smil(release_url, video_id, 'Downloading %s SMIL data' % pid) + formats.extend(tp_formats) + subtitles = self._merge_subtitles(subtitles, tp_subtitles) + self._sort_formats(formats) + return { 'id': video_id, 'title': title, diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 2f574054d..33290fd74 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -825,6 +825,12 @@ class InfoExtractor(object): if not formats: raise ExtractorError('No video formats found') + for f in formats: + # Automatically determine tbr when missing based on abr and vbr (improves + # formats sorting in some cases) + if 'tbr' not in f and 'abr' in f and 'vbr' in f: + f['tbr'] = f['abr'] + f['vbr'] + def _formats_key(f): # TODO remove the following workaround from ..utils import determine_ext @@ -1014,6 +1020,18 @@ class InfoExtractor(object): return [] m3u8_doc, urlh = res m3u8_url = urlh.geturl() + # A Media Playlist Tag MUST NOT appear in a Master Playlist + # https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3 + # The EXT-X-TARGETDURATION tag is REQUIRED for every M3U8 Media Playlists + # https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1 + if '#EXT-X-TARGETDURATION' in m3u8_doc: + return [{ + 'url': m3u8_url, + 'format_id': m3u8_id, + 'ext': ext, + 'protocol': entry_protocol, + 'preference': preference, + }] last_info = None last_media = None kv_rex = re.compile( @@ -1164,6 +1182,7 @@ class InfoExtractor(object): formats = [] rtmp_count = 0 http_count = 0 + m3u8_count = 0 videos = smil.findall(self._xpath_ns('.//video', namespace)) for video in videos: @@ -1203,8 +1222,17 @@ class InfoExtractor(object): src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src) if proto == 'm3u8' or src_ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)) + m3u8_formats = self._extract_m3u8_formats( + src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False) + if len(m3u8_formats) == 1: + m3u8_count += 1 + m3u8_formats[0].update({ + 'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate), + 'tbr': bitrate, + 'width': width, + 'height': height, + }) + formats.extend(m3u8_formats) continue if src_ext == 'f4m': diff --git a/youtube_dl/extractor/ndr.py b/youtube_dl/extractor/ndr.py index 894c51399..0cded6b5c 100644 --- a/youtube_dl/extractor/ndr.py +++ b/youtube_dl/extractor/ndr.py @@ -193,7 +193,7 @@ class NDREmbedBaseIE(InfoExtractor): src + '?hdcore=3.7.0&plugin=aasp-3.7.0.39.44', video_id, f4m_id='hds')) elif ext == 'm3u8': formats.extend(self._extract_m3u8_formats( - src, video_id, m3u8_id='hls', entry_protocol='m3u8_native')) + src, video_id, 'mp4', m3u8_id='hls', entry_protocol='m3u8_native')) else: quality = f.get('quality') ff = { diff --git a/youtube_dl/extractor/odnoklassniki.py b/youtube_dl/extractor/odnoklassniki.py index 184c7a323..f9e064a60 100644 --- a/youtube_dl/extractor/odnoklassniki.py +++ b/youtube_dl/extractor/odnoklassniki.py @@ -13,7 +13,7 @@ from ..utils import ( class OdnoklassnikiIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?:odnoklassniki|ok)\.ru/(?:video(?:embed)?|web-api/video/moviePlayer)/(?P<id>[\d-]+)' + _VALID_URL = r'https?://(?:(?:www|m|mobile)\.)?(?:odnoklassniki|ok)\.ru/(?:video(?:embed)?|web-api/video/moviePlayer)/(?P<id>[\d-]+)' _TESTS = [{ # metadata in JSON 'url': 'http://ok.ru/video/20079905452', @@ -69,6 +69,12 @@ class OdnoklassnikiIE(InfoExtractor): }, { 'url': 'http://www.ok.ru/videoembed/20648036891', 'only_matching': True, + }, { + 'url': 'http://m.ok.ru/video/20079905452', + 'only_matching': True, + }, { + 'url': 'http://mobile.ok.ru/video/20079905452', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/spankbang.py b/youtube_dl/extractor/spankbang.py index 7f060b15b..3cfa671ed 100644 --- a/youtube_dl/extractor/spankbang.py +++ b/youtube_dl/extractor/spankbang.py @@ -34,11 +34,11 @@ class SpankBangIE(InfoExtractor): 'ext': 'mp4', 'format_id': '%sp' % height, 'height': int(height), - } for height in re.findall(r'<span[^>]+q_(\d+)p', webpage)] + } for height in re.findall(r'<(?:span|li)[^>]+q_(\d+)p', webpage)] self._sort_formats(formats) title = self._html_search_regex( - r'(?s)<h1>(.+?)</h1>', webpage, 'title') + r'(?s)<h1[^>]*>(.+?)</h1>', webpage, 'title') description = self._search_regex( r'class="desc"[^>]*>([^<]+)', webpage, 'description', default=None) |