aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--README.md4
-rwxr-xr-xyoutube_dl/YoutubeDL.py4
-rw-r--r--youtube_dl/extractor/cbs.py47
-rw-r--r--youtube_dl/extractor/cbsnews.py9
-rw-r--r--youtube_dl/extractor/cbssports.py3
-rw-r--r--youtube_dl/extractor/cnn.py41
-rw-r--r--youtube_dl/extractor/cultureunplugged.py9
-rw-r--r--youtube_dl/extractor/dotsub.py20
-rw-r--r--youtube_dl/extractor/imdb.py7
-rw-r--r--youtube_dl/extractor/kaltura.py46
10 files changed, 133 insertions, 57 deletions
diff --git a/README.md b/README.md
index 952db7abb..a10aaf35c 100644
--- a/README.md
+++ b/README.md
@@ -645,7 +645,11 @@ $ youtube-dl -f 'best[filesize<50M]'
# Download best format available via direct link over HTTP/HTTPS protocol
$ youtube-dl -f '(bestvideo+bestaudio/best)[protocol^=http]'
+
+# Download the best video format and the best audio format without merging them
+$ youtube-dl -f 'bestvideo,bestaudio' -o '%(title)s.f%(format_id)s.%(ext)s'
```
+Note that in the last example, an output template is recommended as bestvideo and bestaudio may have the same file name.
# VIDEO SELECTION
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py
index e844dc98a..0b3e3da82 100755
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@@ -1299,7 +1299,7 @@ class YoutubeDL(object):
for subtitle_format in subtitle:
if subtitle_format.get('url'):
subtitle_format['url'] = sanitize_url(subtitle_format['url'])
- if 'ext' not in subtitle_format:
+ if subtitle_format.get('ext') is None:
subtitle_format['ext'] = determine_ext(subtitle_format['url']).lower()
if self.params.get('listsubtitles', False):
@@ -1354,7 +1354,7 @@ class YoutubeDL(object):
note=' ({0})'.format(format['format_note']) if format.get('format_note') is not None else '',
)
# Automatically determine file extension if missing
- if 'ext' not in format:
+ if format.get('ext') is None:
format['ext'] = determine_ext(format['url']).lower()
# Automatically determine protocol if missing (useful for format
# selection purposes)
diff --git a/youtube_dl/extractor/cbs.py b/youtube_dl/extractor/cbs.py
index a23173d6f..c72ed2dbb 100644
--- a/youtube_dl/extractor/cbs.py
+++ b/youtube_dl/extractor/cbs.py
@@ -4,6 +4,7 @@ from .theplatform import ThePlatformFeedIE
from ..utils import (
int_or_none,
find_xpath_attr,
+ ExtractorError,
)
@@ -17,19 +18,6 @@ class CBSBaseIE(ThePlatformFeedIE):
}]
} if closed_caption_e is not None and closed_caption_e.attrib.get('value') else []
- def _extract_video_info(self, filter_query, video_id):
- return self._extract_feed_info(
- 'dJ5BDC', 'VxxJg8Ymh8sE', filter_query, video_id, lambda entry: {
- 'series': entry.get('cbs$SeriesTitle'),
- 'season_number': int_or_none(entry.get('cbs$SeasonNumber')),
- 'episode': entry.get('cbs$EpisodeTitle'),
- 'episode_number': int_or_none(entry.get('cbs$EpisodeNumber')),
- }, {
- 'StreamPack': {
- 'manifest': 'm3u',
- }
- })
-
class CBSIE(CBSBaseIE):
_VALID_URL = r'(?:cbs:|https?://(?:www\.)?(?:cbs\.com/shows/[^/]+/video|colbertlateshow\.com/(?:video|podcasts))/)(?P<id>[\w-]+)'
@@ -38,7 +26,6 @@ class CBSIE(CBSBaseIE):
'url': 'http://www.cbs.com/shows/garth-brooks/video/_u7W953k6la293J7EPTd9oHkSPs6Xn6_/connect-chat-feat-garth-brooks/',
'info_dict': {
'id': '_u7W953k6la293J7EPTd9oHkSPs6Xn6_',
- 'display_id': 'connect-chat-feat-garth-brooks',
'ext': 'mp4',
'title': 'Connect Chat feat. Garth Brooks',
'description': 'Connect with country music singer Garth Brooks, as he chats with fans on Wednesday November 27, 2013. Be sure to tune in to Garth Brooks: Live from Las Vegas, Friday November 29, at 9/8c on CBS!',
@@ -47,7 +34,10 @@ class CBSIE(CBSBaseIE):
'upload_date': '20131127',
'uploader': 'CBSI-NEW',
},
- 'expected_warnings': ['Failed to download m3u8 information'],
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
'_skip': 'Blocked outside the US',
}, {
'url': 'http://colbertlateshow.com/video/8GmB0oY0McANFvp2aEffk9jZZZ2YyXxy/the-colbeard/',
@@ -56,8 +46,31 @@ class CBSIE(CBSBaseIE):
'url': 'http://www.colbertlateshow.com/podcasts/dYSwjqPs_X1tvbV_P2FcPWRa_qT6akTC/in-the-bad-room-with-stephen/',
'only_matching': True,
}]
- TP_RELEASE_URL_TEMPLATE = 'http://link.theplatform.com/s/dJ5BDC/%s?mbr=true'
+
+ def _extract_video_info(self, guid):
+ path = 'dJ5BDC/media/guid/2198311517/' + guid
+ smil_url = 'http://link.theplatform.com/s/%s?mbr=true' % path
+ formats, subtitles = self._extract_theplatform_smil(smil_url + '&manifest=m3u', guid)
+ for r in ('HLS&formats=M3U', 'RTMP', 'WIFI', '3G'):
+ try:
+ tp_formats, _ = self._extract_theplatform_smil(smil_url + '&assetTypes=' + r, guid, 'Downloading %s SMIL data' % r.split('&')[0])
+ formats.extend(tp_formats)
+ except ExtractorError:
+ continue
+ self._sort_formats(formats)
+ metadata = self._download_theplatform_metadata(path, guid)
+ info = self._parse_theplatform_metadata(metadata)
+ info.update({
+ 'id': guid,
+ 'formats': formats,
+ 'subtitles': subtitles,
+ 'series': metadata.get('cbs$SeriesTitle'),
+ 'season_number': int_or_none(metadata.get('cbs$SeasonNumber')),
+ 'episode': metadata.get('cbs$EpisodeTitle'),
+ 'episode_number': int_or_none(metadata.get('cbs$EpisodeNumber')),
+ })
+ return info
def _real_extract(self, url):
content_id = self._match_id(url)
- return self._extract_video_info('byGuid=%s' % content_id, content_id)
+ return self._extract_video_info(content_id)
diff --git a/youtube_dl/extractor/cbsnews.py b/youtube_dl/extractor/cbsnews.py
index 9d3b75526..4aa6917a0 100644
--- a/youtube_dl/extractor/cbsnews.py
+++ b/youtube_dl/extractor/cbsnews.py
@@ -2,13 +2,13 @@
from __future__ import unicode_literals
from .common import InfoExtractor
-from .cbs import CBSBaseIE
+from .cbs import CBSIE
from ..utils import (
parse_duration,
)
-class CBSNewsIE(CBSBaseIE):
+class CBSNewsIE(CBSIE):
IE_DESC = 'CBS News'
_VALID_URL = r'https?://(?:www\.)?cbsnews\.com/(?:news|videos)/(?P<id>[\da-z_-]+)'
@@ -35,7 +35,8 @@ class CBSNewsIE(CBSBaseIE):
'ext': 'mp4',
'title': 'Fort Hood shooting: Army downplays mental illness as cause of attack',
'description': 'md5:4a6983e480542d8b333a947bfc64ddc7',
- 'upload_date': '19700101',
+ 'upload_date': '20140404',
+ 'timestamp': 1396650660,
'uploader': 'CBSI-NEW',
'thumbnail': 're:^https?://.*\.jpg$',
'duration': 205,
@@ -63,7 +64,7 @@ class CBSNewsIE(CBSBaseIE):
item = video_info['item'] if 'item' in video_info else video_info
guid = item['mpxRefId']
- return self._extract_video_info('byGuid=%s' % guid, guid)
+ return self._extract_video_info(guid)
class CBSNewsLiveVideoIE(InfoExtractor):
diff --git a/youtube_dl/extractor/cbssports.py b/youtube_dl/extractor/cbssports.py
index 78ca44b02..bf7915626 100644
--- a/youtube_dl/extractor/cbssports.py
+++ b/youtube_dl/extractor/cbssports.py
@@ -23,6 +23,9 @@ class CBSSportsIE(CBSBaseIE):
}
}]
+ def _extract_video_info(self, filter_query, video_id):
+ return self._extract_feed_info('dJ5BDC', 'VxxJg8Ymh8sE', filter_query, video_id)
+
def _real_extract(self, url):
video_id = self._match_id(url)
return self._extract_video_info('byId=%s' % video_id, video_id)
diff --git a/youtube_dl/extractor/cnn.py b/youtube_dl/extractor/cnn.py
index 53489a14e..220bb55e8 100644
--- a/youtube_dl/extractor/cnn.py
+++ b/youtube_dl/extractor/cnn.py
@@ -11,7 +11,7 @@ from ..utils import (
class CNNIE(InfoExtractor):
- _VALID_URL = r'''(?x)https?://(?:(?:edition|www)\.)?cnn\.com/video/(?:data/.+?|\?)/
+ _VALID_URL = r'''(?x)https?://(?:(?P<sub_domain>edition|www|money)\.)?cnn\.com/(?:video/(?:data/.+?|\?)/)?videos?/
(?P<path>.+?/(?P<title>[^/]+?)(?:\.(?:[a-z\-]+)|(?=&)))'''
_TESTS = [{
@@ -46,18 +46,45 @@ class CNNIE(InfoExtractor):
'upload_date': '20141222',
}
}, {
+ 'url': 'http://money.cnn.com/video/news/2016/08/19/netflix-stunning-stats.cnnmoney/index.html',
+ 'md5': '52a515dc1b0f001cd82e4ceda32be9d1',
+ 'info_dict': {
+ 'id': '/video/news/2016/08/19/netflix-stunning-stats.cnnmoney',
+ 'ext': 'mp4',
+ 'title': '5 stunning stats about Netflix',
+ 'description': 'Did you know that Netflix has more than 80 million members? Here are five facts about the online video distributor that you probably didn\'t know.',
+ 'upload_date': '20160819',
+ }
+ }, {
'url': 'http://cnn.com/video/?/video/politics/2015/03/27/pkg-arizona-senator-church-attendance-mandatory.ktvk',
'only_matching': True,
}, {
'url': 'http://cnn.com/video/?/video/us/2015/04/06/dnt-baker-refuses-anti-gay-order.wkmg',
'only_matching': True,
+ }, {
+ 'url': 'http://edition.cnn.com/videos/arts/2016/04/21/olympic-games-cultural-a-z-brazil.cnn',
+ 'only_matching': True,
}]
+ _CONFIG = {
+ # http://edition.cnn.com/.element/apps/cvp/3.0/cfg/spider/cnn/expansion/config.xml
+ 'edition': {
+ 'data_src': 'http://edition.cnn.com/video/data/3.0/video/%s/index.xml',
+ 'media_src': 'http://pmd.cdn.turner.com/cnn/big',
+ },
+ # http://money.cnn.com/.element/apps/cvp2/cfg/config.xml
+ 'money': {
+ 'data_src': 'http://money.cnn.com/video/data/4.0/video/%s.xml',
+ 'media_src': 'http://ht3.cdn.turner.com/money/big',
+ },
+ }
+
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- path = mobj.group('path')
- page_title = mobj.group('title')
- info_url = 'http://edition.cnn.com/video/data/3.0/%s/index.xml' % path
+ sub_domain, path, page_title = re.match(self._VALID_URL, url).groups()
+ if sub_domain not in ('money', 'edition'):
+ sub_domain = 'edition'
+ config = self._CONFIG[sub_domain]
+ info_url = config['data_src'] % path
info = self._download_xml(info_url, page_title)
formats = []
@@ -66,7 +93,7 @@ class CNNIE(InfoExtractor):
(?:_(?P<bitrate>[0-9]+)k)?
''')
for f in info.findall('files/file'):
- video_url = 'http://ht.cdn.turner.com/cnn/big%s' % (f.text.strip())
+ video_url = config['media_src'] + f.text.strip()
fdct = {
'format_id': f.attrib['bitrate'],
'url': video_url,
@@ -146,7 +173,7 @@ class CNNBlogsIE(InfoExtractor):
class CNNArticleIE(InfoExtractor):
- _VALID_URL = r'https?://(?:(?:edition|www)\.)?cnn\.com/(?!video/)'
+ _VALID_URL = r'https?://(?:(?:edition|www)\.)?cnn\.com/(?!videos?/)'
_TEST = {
'url': 'http://www.cnn.com/2014/12/21/politics/obama-north-koreas-hack-not-war-but-cyber-vandalism/',
'md5': '689034c2a3d9c6dc4aa72d65a81efd01',
diff --git a/youtube_dl/extractor/cultureunplugged.py b/youtube_dl/extractor/cultureunplugged.py
index 9c764fe68..9f26fa587 100644
--- a/youtube_dl/extractor/cultureunplugged.py
+++ b/youtube_dl/extractor/cultureunplugged.py
@@ -1,9 +1,13 @@
from __future__ import unicode_literals
import re
+import time
from .common import InfoExtractor
-from ..utils import int_or_none
+from ..utils import (
+ int_or_none,
+ HEADRequest,
+)
class CultureUnpluggedIE(InfoExtractor):
@@ -32,6 +36,9 @@ class CultureUnpluggedIE(InfoExtractor):
video_id = mobj.group('id')
display_id = mobj.group('display_id') or video_id
+ # request setClientTimezone.php to get the PHPSESSID cookie, which is needed to get valid JSON data in the next request
+ self._request_webpage(HEADRequest(
+ 'http://www.cultureunplugged.com/setClientTimezone.php?timeOffset=%d' % -(time.timezone / 3600)), display_id)
movie_data = self._download_json(
'http://www.cultureunplugged.com/movie-data/cu-%s.json' % video_id, display_id)
diff --git a/youtube_dl/extractor/dotsub.py b/youtube_dl/extractor/dotsub.py
index e9ca236d4..fd64d1a7f 100644
--- a/youtube_dl/extractor/dotsub.py
+++ b/youtube_dl/extractor/dotsub.py
@@ -10,18 +10,18 @@ from ..utils import (
class DotsubIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?dotsub\.com/view/(?P<id>[^/]+)'
_TEST = {
- 'url': 'http://dotsub.com/view/aed3b8b2-1889-4df5-ae63-ad85f5572f27',
- 'md5': '0914d4d69605090f623b7ac329fea66e',
+ 'url': 'https://dotsub.com/view/9c63db2a-fa95-4838-8e6e-13deafe47f09',
+ 'md5': '21c7ff600f545358134fea762a6d42b6',
'info_dict': {
- 'id': 'aed3b8b2-1889-4df5-ae63-ad85f5572f27',
+ 'id': '9c63db2a-fa95-4838-8e6e-13deafe47f09',
'ext': 'flv',
- 'title': 'Pyramids of Waste (2010), AKA The Lightbulb Conspiracy - Planned obsolescence documentary',
- 'description': 'md5:699a0f7f50aeec6042cb3b1db2d0d074',
- 'thumbnail': 're:^https?://dotsub.com/media/aed3b8b2-1889-4df5-ae63-ad85f5572f27/p',
- 'duration': 3169,
- 'uploader': '4v4l0n42',
- 'timestamp': 1292248482.625,
- 'upload_date': '20101213',
+ 'title': 'MOTIVATION - "It\'s Possible" Best Inspirational Video Ever',
+ 'description': 'md5:41af1e273edbbdfe4e216a78b9d34ac6',
+ 'thumbnail': 're:^https?://dotsub.com/media/9c63db2a-fa95-4838-8e6e-13deafe47f09/p',
+ 'duration': 198,
+ 'uploader': 'liuxt',
+ 'timestamp': 1385778501.104,
+ 'upload_date': '20131130',
'view_count': int,
}
}
diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py
index 0acce9f4c..3a6a6f5ad 100644
--- a/youtube_dl/extractor/imdb.py
+++ b/youtube_dl/extractor/imdb.py
@@ -6,6 +6,7 @@ from .common import InfoExtractor
from ..utils import (
mimetype2ext,
qualities,
+ remove_end,
)
@@ -19,7 +20,7 @@ class ImdbIE(InfoExtractor):
'info_dict': {
'id': '2524815897',
'ext': 'mp4',
- 'title': 'Ice Age: Continental Drift Trailer (No. 2) - IMDb',
+ 'title': 'Ice Age: Continental Drift Trailer (No. 2)',
'description': 'md5:9061c2219254e5d14e03c25c98e96a81',
}
}, {
@@ -83,10 +84,10 @@ class ImdbIE(InfoExtractor):
return {
'id': video_id,
- 'title': self._og_search_title(webpage),
+ 'title': remove_end(self._og_search_title(webpage), ' - IMDb'),
'formats': formats,
'description': descr,
- 'thumbnail': format_info['slate'],
+ 'thumbnail': format_info.get('slate'),
}
diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py
index ddf1165ff..e0f7366c2 100644
--- a/youtube_dl/extractor/kaltura.py
+++ b/youtube_dl/extractor/kaltura.py
@@ -67,6 +67,27 @@ class KalturaIE(InfoExtractor):
# video with subtitles
'url': 'kaltura:111032:1_cw786r8q',
'only_matching': True,
+ },
+ {
+ # video with ttml subtitles (no fileExt)
+ 'url': 'kaltura:1926081:0_l5ye1133',
+ 'info_dict': {
+ 'id': '0_l5ye1133',
+ 'ext': 'mp4',
+ 'title': 'What Can You Do With Python?',
+ 'upload_date': '20160221',
+ 'uploader_id': 'stork',
+ 'thumbnail': 're:^https?://.*/thumbnail/.*',
+ 'timestamp': int,
+ 'subtitles': {
+ 'en': [{
+ 'ext': 'ttml',
+ }],
+ },
+ },
+ 'params': {
+ 'skip_download': True,
+ },
}
]
@@ -122,18 +143,6 @@ class KalturaIE(InfoExtractor):
return data
- def _get_kaltura_signature(self, video_id, partner_id, service_url=None):
- actions = [{
- 'apiVersion': '3.1',
- 'expiry': 86400,
- 'format': 1,
- 'service': 'session',
- 'action': 'startWidgetSession',
- 'widgetId': '_%s' % partner_id,
- }]
- return self._kaltura_api_call(
- video_id, actions, service_url, note='Downloading Kaltura signature')['ks']
-
def _get_video_info(self, video_id, partner_id, service_url=None):
actions = [
{
@@ -208,6 +217,17 @@ class KalturaIE(InfoExtractor):
reference_id)['entryResult']
info, flavor_assets = entry_data['meta'], entry_data['contextData']['flavorAssets']
entry_id = info['id']
+ # Unfortunately, the data returned in kalturaIframePackageData lacks
+ # captions, so we try requesting the complete data using the
+ # regular approach now that we know the entry_id
+ try:
+ _, info, flavor_assets, captions = self._get_video_info(
+ entry_id, partner_id)
+ except ExtractorError:
+ # The regular request failed, but we already have everything
+ # extracted apart from captions and can at least proceed
+ # with that
+ pass
else:
raise ExtractorError('Invalid URL', expected=True)
ks = params.get('flashvars[ks]', [None])[0]
@@ -267,7 +287,7 @@ class KalturaIE(InfoExtractor):
continue
subtitles.setdefault(caption.get('languageCode') or caption.get('language'), []).append({
'url': '%s/api_v3/service/caption_captionasset/action/serve/captionAssetId/%s' % (self._SERVICE_URL, caption['id']),
- 'ext': caption.get('fileExt'),
+ 'ext': caption.get('fileExt', 'ttml'),
})
return {