about | summary | refs | log | tree | commit | diff
path: root/youtube_dl
diff options
context:
space:
mode:
Diffstat (limited to 'youtube_dl')
-rwxr-xr-x youtube_dl/YoutubeDL.py 4
-rw-r--r-- youtube_dl/downloader/external.py 17
-rw-r--r-- youtube_dl/downloader/fragment.py 4
-rw-r--r-- youtube_dl/extractor/adobepass.py 11
-rw-r--r-- youtube_dl/extractor/adultswim.py 281
-rw-r--r-- youtube_dl/extractor/aenetworks.py 14
-rw-r--r-- youtube_dl/extractor/afreecatv.py 3
-rw-r--r-- youtube_dl/extractor/aljazeera.py 9
-rw-r--r-- youtube_dl/extractor/amp.py 18
-rw-r--r-- youtube_dl/extractor/anvato.py 66
-rw-r--r-- youtube_dl/extractor/appleconnect.py 4
-rw-r--r-- youtube_dl/extractor/appletrailers.py 5
-rw-r--r-- youtube_dl/extractor/archiveorg.py 4
-rw-r--r-- youtube_dl/extractor/atresplayer.py 2
-rw-r--r-- youtube_dl/extractor/audioboom.py 2
-rw-r--r-- youtube_dl/extractor/bandcamp.py 12
-rw-r--r-- youtube_dl/extractor/beeg.py 2
-rw-r--r-- youtube_dl/extractor/bilibili.py 5
-rw-r--r-- youtube_dl/extractor/bleacherreport.py 10
-rw-r--r-- youtube_dl/extractor/br.py 2
-rw-r--r-- youtube_dl/extractor/brightcove.py 22
-rw-r--r-- youtube_dl/extractor/canalc2.py 5
-rw-r--r-- youtube_dl/extractor/cbc.py 6
-rw-r--r-- youtube_dl/extractor/cbslocal.py 4
-rwxr-xr-x youtube_dl/extractor/cda.py 52
-rw-r--r-- youtube_dl/extractor/clipfish.py 2
-rw-r--r-- youtube_dl/extractor/collegerama.py 3
-rw-r--r-- youtube_dl/extractor/common.py 32
-rw-r--r-- youtube_dl/extractor/condenast.py 81
-rw-r--r-- youtube_dl/extractor/coub.py 5
-rw-r--r-- youtube_dl/extractor/crackle.py 7
-rw-r--r-- youtube_dl/extractor/crunchyroll.py 4
-rw-r--r-- youtube_dl/extractor/cspan.py 15
-rw-r--r-- youtube_dl/extractor/dailymail.py 12
-rw-r--r-- youtube_dl/extractor/dailymotion.py 126
-rw-r--r-- youtube_dl/extractor/democracynow.py 3
-rw-r--r-- youtube_dl/extractor/dotsub.py 2
-rw-r--r-- youtube_dl/extractor/douyutv.py 86
-rw-r--r-- youtube_dl/extractor/drtv.py 34
-rw-r--r-- youtube_dl/extractor/extractors.py 21
-rw-r--r-- youtube_dl/extractor/foxsports.py 9
-rw-r--r-- youtube_dl/extractor/francetv.py 218
-rw-r--r-- youtube_dl/extractor/funimation.py 66
-rw-r--r-- youtube_dl/extractor/funnyordie.py 3
-rw-r--r-- youtube_dl/extractor/gamespot.py 3
-rw-r--r-- youtube_dl/extractor/gdcvault.py 15
-rw-r--r-- youtube_dl/extractor/generic.py 140
-rw-r--r-- youtube_dl/extractor/go.py 49
-rw-r--r-- youtube_dl/extractor/hitbox.py 52
-rw-r--r-- youtube_dl/extractor/imdb.py 5
-rw-r--r-- youtube_dl/extractor/infoq.py 4
-rw-r--r-- youtube_dl/extractor/laola1tv.py 97
-rw-r--r-- youtube_dl/extractor/leeco.py 111
-rw-r--r-- youtube_dl/extractor/lego.py 2
-rw-r--r-- youtube_dl/extractor/liveleak.py 83
-rw-r--r-- youtube_dl/extractor/mediaset.py 118
-rw-r--r-- youtube_dl/extractor/mitele.py 6
-rw-r--r-- youtube_dl/extractor/myspace.py 100
-rw-r--r-- youtube_dl/extractor/nbc.py 98
-rw-r--r-- youtube_dl/extractor/njpwworld.py 22
-rw-r--r-- youtube_dl/extractor/nonktube.py 33
-rw-r--r-- youtube_dl/extractor/noovo.py 97
-rw-r--r-- youtube_dl/extractor/nrk.py 25
-rw-r--r-- youtube_dl/extractor/nuevo.py 5
-rw-r--r-- youtube_dl/extractor/orf.py 109
-rw-r--r-- youtube_dl/extractor/packtpub.py 39
-rw-r--r-- youtube_dl/extractor/pbs.py 31
-rw-r--r-- youtube_dl/extractor/pornhub.py 5
-rw-r--r-- youtube_dl/extractor/r7.py 3
-rw-r--r-- youtube_dl/extractor/rmcdecouverte.py 26
-rw-r--r-- youtube_dl/extractor/streamcz.py 13
-rw-r--r-- youtube_dl/extractor/ted.py 2
-rw-r--r-- youtube_dl/extractor/theplatform.py 21
-rw-r--r-- youtube_dl/extractor/thescene.py 36
-rw-r--r-- youtube_dl/extractor/toggle.py 8
-rw-r--r-- youtube_dl/extractor/toypics.py 41
-rw-r--r-- youtube_dl/extractor/turner.py 9
-rw-r--r-- youtube_dl/extractor/tvp.py 3
-rw-r--r-- youtube_dl/extractor/tvplayer.py 35
-rw-r--r-- youtube_dl/extractor/upskill.py 176
-rw-r--r-- youtube_dl/extractor/vevo.py 17
-rw-r--r-- youtube_dl/extractor/vice.py 154
-rw-r--r-- youtube_dl/extractor/viceland.py 11
-rw-r--r-- youtube_dl/extractor/videopress.py 9
-rw-r--r-- youtube_dl/extractor/vier.py 116
-rw-r--r-- youtube_dl/extractor/viewster.py 3
-rw-r--r-- youtube_dl/extractor/vrv.py 47
-rw-r--r-- youtube_dl/extractor/washingtonpost.py 6
-rw-r--r-- youtube_dl/extractor/wistia.py 22
-rw-r--r-- youtube_dl/extractor/xtube.py 23
-rw-r--r-- youtube_dl/extractor/xvideos.py 11
-rw-r--r-- youtube_dl/extractor/yandexmusic.py 3
-rw-r--r-- youtube_dl/extractor/youtube.py 253
-rw-r--r-- youtube_dl/extractor/zaq1.py 101
-rw-r--r-- youtube_dl/jsinterp.py 38
-rw-r--r-- youtube_dl/options.py 9
-rw-r--r-- youtube_dl/postprocessor/ffmpeg.py 31
-rw-r--r-- youtube_dl/postprocessor/metadatafromtitle.py 4
-rw-r--r-- youtube_dl/utils.py 78
-rw-r--r-- youtube_dl/version.py 2
100 files changed, 2591 insertions, 1262 deletions
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py
index eb465c425..4c33d494a 100755
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@@ -370,10 +370,10 @@ class YoutubeDL(object):
else:
raise
- if (sys.version_info >= (3,) and sys.platform != 'win32' and
+ if (sys.platform != 'win32' and
sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968'] and
not params.get('restrictfilenames', False)):
- # On Python 3, the Unicode filesystem API will throw errors (#1474)
+ # Unicode filesystem API will throw errors (#1474, #13027)
self.report_warning(
'Assuming --restrict-filenames since file system encoding '
'cannot encode all characters. '
diff --git a/youtube_dl/downloader/external.py b/youtube_dl/downloader/external.py
index e13cf547d..db018fa89 100644
--- a/youtube_dl/downloader/external.py
+++ b/youtube_dl/downloader/external.py
@@ -29,7 +29,17 @@ class ExternalFD(FileDownloader):
self.report_destination(filename)
tmpfilename = self.temp_name(filename)
- retval = self._call_downloader(tmpfilename, info_dict)
+ try:
+ retval = self._call_downloader(tmpfilename, info_dict)
+ except KeyboardInterrupt:
+ if not info_dict.get('is_live'):
+ raise
+ # Live stream downloading cancellation should be considered as
+ # correct and expected termination thus all postprocessing
+ # should take place
+ retval = 0
+ self.to_screen('[%s] Interrupted by user' % self.get_basename())
+
if retval == 0:
fsize = os.path.getsize(encodeFilename(tmpfilename))
self.to_screen('\r[%s] Downloaded %s bytes' % (self.get_basename(), fsize))
@@ -202,6 +212,11 @@ class FFmpegFD(ExternalFD):
args = [ffpp.executable, '-y']
+ for log_level in ('quiet', 'verbose'):
+ if self.params.get(log_level, False):
+ args += ['-loglevel', log_level]
+ break
+
seekable = info_dict.get('_seekable')
if seekable is not None:
# setting -seekable prevents ffmpeg from guessing if the server
diff --git a/youtube_dl/downloader/fragment.py b/youtube_dl/downloader/fragment.py
index bb9e82578..bccc8ecc1 100644
--- a/youtube_dl/downloader/fragment.py
+++ b/youtube_dl/downloader/fragment.py
@@ -49,7 +49,7 @@ class FragmentFD(FileDownloader):
index: 0-based index of current fragment among all fragments
fragment_count:
Total count of fragments
-
+
This feature is experimental and file format may change in future.
"""
@@ -155,8 +155,6 @@ class FragmentFD(FileDownloader):
self._write_ytdl_file(ctx)
if ctx['fragment_index'] > 0:
assert resume_len > 0
- else:
- assert resume_len == 0
dest_stream, tmpfilename = sanitize_open(tmpfilename, open_mode)
diff --git a/youtube_dl/extractor/adobepass.py b/youtube_dl/extractor/adobepass.py
index 100cf997f..d57ad85c2 100644
--- a/youtube_dl/extractor/adobepass.py
+++ b/youtube_dl/extractor/adobepass.py
@@ -36,6 +36,11 @@ MSO_INFO = {
'username_field': 'Ecom_User_ID',
'password_field': 'Ecom_Password',
},
+ 'Brighthouse': {
+ 'name': 'Bright House Networks | Spectrum',
+ 'username_field': 'j_username',
+ 'password_field': 'j_password',
+ },
'Charter_Direct': {
'name': 'Charter Spectrum',
'username_field': 'IDToken1',
@@ -1308,6 +1313,12 @@ class AdobePassIE(InfoExtractor):
_USER_AGENT = 'Mozilla/5.0 (X11; Linux i686; rv:47.0) Gecko/20100101 Firefox/47.0'
_MVPD_CACHE = 'ap-mvpd'
+ def _download_webpage_handle(self, *args, **kwargs):
+ headers = kwargs.get('headers', {})
+ headers.update(self.geo_verification_headers())
+ kwargs['headers'] = headers
+ return super(AdobePassIE, self)._download_webpage_handle(*args, **kwargs)
+
@staticmethod
def _get_mvpd_resource(provider_id, title, guid, rating):
channel = etree.Element('channel')
diff --git a/youtube_dl/extractor/adultswim.py b/youtube_dl/extractor/adultswim.py
index 989505c82..acc4ce38d 100644
--- a/youtube_dl/extractor/adultswim.py
+++ b/youtube_dl/extractor/adultswim.py
@@ -5,91 +5,52 @@ import re
from .turner import TurnerBaseIE
from ..utils import (
- ExtractorError,
int_or_none,
+ strip_or_none,
)
class AdultSwimIE(TurnerBaseIE):
- _VALID_URL = r'https?://(?:www\.)?adultswim\.com/videos/(?P<is_playlist>playlists/)?(?P<show_path>[^/]+)/(?P<episode_path>[^/?#]+)/?'
+ _VALID_URL = r'https?://(?:www\.)?adultswim\.com/videos/(?P<show_path>[^/?#]+)(?:/(?P<episode_path>[^/?#]+))?'
_TESTS = [{
'url': 'http://adultswim.com/videos/rick-and-morty/pilot',
- 'playlist': [
- {
- 'md5': '247572debc75c7652f253c8daa51a14d',
- 'info_dict': {
- 'id': 'rQxZvXQ4ROaSOqq-or2Mow-0',
- 'ext': 'flv',
- 'title': 'Rick and Morty - Pilot Part 1',
- 'description': "Rick moves in with his daughter's family and establishes himself as a bad influence on his grandson, Morty. "
- },
- },
- {
- 'md5': '77b0e037a4b20ec6b98671c4c379f48d',
- 'info_dict': {
- 'id': 'rQxZvXQ4ROaSOqq-or2Mow-3',
- 'ext': 'flv',
- 'title': 'Rick and Morty - Pilot Part 4',
- 'description': "Rick moves in with his daughter's family and establishes himself as a bad influence on his grandson, Morty. "
- },
- },
- ],
'info_dict': {
'id': 'rQxZvXQ4ROaSOqq-or2Mow',
+ 'ext': 'mp4',
'title': 'Rick and Morty - Pilot',
- 'description': "Rick moves in with his daughter's family and establishes himself as a bad influence on his grandson, Morty. "
+ 'description': 'Rick moves in with his daughter\'s family and establishes himself as a bad influence on his grandson, Morty.',
+ 'timestamp': 1493267400,
+ 'upload_date': '20170427',
},
- 'skip': 'This video is only available for registered users',
- }, {
- 'url': 'http://www.adultswim.com/videos/playlists/american-parenting/putting-francine-out-of-business/',
- 'playlist': [
- {
- 'md5': '2eb5c06d0f9a1539da3718d897f13ec5',
- 'info_dict': {
- 'id': '-t8CamQlQ2aYZ49ItZCFog-0',
- 'ext': 'flv',
- 'title': 'American Dad - Putting Francine Out of Business',
- 'description': 'Stan hatches a plan to get Francine out of the real estate business.Watch more American Dad on [adult swim].'
- },
- }
- ],
- 'info_dict': {
- 'id': '-t8CamQlQ2aYZ49ItZCFog',
- 'title': 'American Dad - Putting Francine Out of Business',
- 'description': 'Stan hatches a plan to get Francine out of the real estate business.Watch more American Dad on [adult swim].'
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
},
+ 'expected_warnings': ['Unable to download f4m manifest'],
}, {
'url': 'http://www.adultswim.com/videos/tim-and-eric-awesome-show-great-job/dr-steve-brule-for-your-wine/',
- 'playlist': [
- {
- 'md5': '3e346a2ab0087d687a05e1e7f3b3e529',
- 'info_dict': {
- 'id': 'sY3cMUR_TbuE4YmdjzbIcQ-0',
- 'ext': 'mp4',
- 'title': 'Tim and Eric Awesome Show Great Job! - Dr. Steve Brule, For Your Wine',
- 'description': 'Dr. Brule reports live from Wine Country with a special report on wines. \r\nWatch Tim and Eric Awesome Show Great Job! episode #20, "Embarrassed" on Adult Swim.\r\n\r\n',
- },
- }
- ],
'info_dict': {
'id': 'sY3cMUR_TbuE4YmdjzbIcQ',
+ 'ext': 'mp4',
'title': 'Tim and Eric Awesome Show Great Job! - Dr. Steve Brule, For Your Wine',
- 'description': 'Dr. Brule reports live from Wine Country with a special report on wines. \r\nWatch Tim and Eric Awesome Show Great Job! episode #20, "Embarrassed" on Adult Swim.\r\n\r\n',
+ 'description': 'Dr. Brule reports live from Wine Country with a special report on wines. \nWatch Tim and Eric Awesome Show Great Job! episode #20, "Embarrassed" on Adult Swim.',
+ 'upload_date': '20080124',
+ 'timestamp': 1201150800,
},
'params': {
# m3u8 download
'skip_download': True,
- }
+ },
}, {
- # heroMetadata.trailer
'url': 'http://www.adultswim.com/videos/decker/inside-decker-a-new-hero/',
'info_dict': {
'id': 'I0LQFQkaSUaFp8PnAWHhoQ',
'ext': 'mp4',
'title': 'Decker - Inside Decker: A New Hero',
- 'description': 'md5:c916df071d425d62d70c86d4399d3ee0',
- 'duration': 249.008,
+ 'description': 'The guys recap the conclusion of the season. They announce a new hero, take a peek into the Victorville Film Archive and welcome back the talented James Dean.',
+ 'timestamp': 1469480460,
+ 'upload_date': '20160725',
},
'params': {
# m3u8 download
@@ -97,136 +58,102 @@ class AdultSwimIE(TurnerBaseIE):
},
'expected_warnings': ['Unable to download f4m manifest'],
}, {
- 'url': 'http://www.adultswim.com/videos/toonami/friday-october-14th-2016/',
+ 'url': 'http://www.adultswim.com/videos/attack-on-titan',
+ 'info_dict': {
+ 'id': 'b7A69dzfRzuaXIECdxW8XQ',
+ 'title': 'Attack on Titan',
+ 'description': 'md5:6c8e003ea0777b47013e894767f5e114',
+ },
+ 'playlist_mincount': 12,
+ }, {
+ 'url': 'http://www.adultswim.com/videos/streams/williams-stream',
'info_dict': {
- 'id': 'eYiLsKVgQ6qTC6agD67Sig',
- 'title': 'Toonami - Friday, October 14th, 2016',
- 'description': 'md5:99892c96ffc85e159a428de85c30acde',
+ 'id': 'd8DEBj7QRfetLsRgFnGEyg',
+ 'ext': 'mp4',
+ 'title': r're:^Williams Stream \d{4}-\d{2}-\d{2} \d{2}:\d{2}$',
+ 'description': 'original programming',
},
- 'playlist': [{
- 'md5': '',
- 'info_dict': {
- 'id': 'eYiLsKVgQ6qTC6agD67Sig',
- 'ext': 'mp4',
- 'title': 'Toonami - Friday, October 14th, 2016',
- 'description': 'md5:99892c96ffc85e159a428de85c30acde',
- },
- }],
'params': {
# m3u8 download
'skip_download': True,
},
- 'expected_warnings': ['Unable to download f4m manifest'],
}]
- @staticmethod
- def find_video_info(collection, slug):
- for video in collection.get('videos'):
- if video.get('slug') == slug:
- return video
-
- @staticmethod
- def find_collection_by_linkURL(collections, linkURL):
- for collection in collections:
- if collection.get('linkURL') == linkURL:
- return collection
-
- @staticmethod
- def find_collection_containing_video(collections, slug):
- for collection in collections:
- for video in collection.get('videos'):
- if video.get('slug') == slug:
- return collection, video
- return None, None
-
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- show_path = mobj.group('show_path')
- episode_path = mobj.group('episode_path')
- is_playlist = True if mobj.group('is_playlist') else False
-
- webpage = self._download_webpage(url, episode_path)
-
- # Extract the value of `bootstrappedData` from the Javascript in the page.
- bootstrapped_data = self._parse_json(self._search_regex(
- r'var bootstrappedData = ({.*});', webpage, 'bootstraped data'), episode_path)
-
- # Downloading videos from a /videos/playlist/ URL needs to be handled differently.
- # NOTE: We are only downloading one video (the current one) not the playlist
- if is_playlist:
- collections = bootstrapped_data['playlists']['collections']
- collection = self.find_collection_by_linkURL(collections, show_path)
- video_info = self.find_video_info(collection, episode_path)
-
- show_title = video_info['showTitle']
- segment_ids = [video_info['videoPlaybackID']]
+ show_path, episode_path = re.match(self._VALID_URL, url).groups()
+ display_id = episode_path or show_path
+ webpage = self._download_webpage(url, display_id)
+ initial_data = self._parse_json(self._search_regex(
+ r'AS_INITIAL_DATA(?:__)?\s*=\s*({.+?});',
+ webpage, 'initial data'), display_id)
+
+ is_stream = show_path == 'streams'
+ if is_stream:
+ if not episode_path:
+ episode_path = 'live-stream'
+
+ video_data = next(stream for stream_path, stream in initial_data['streams'].items() if stream_path == episode_path)
+ video_id = video_data.get('stream')
+
+ if not video_id:
+ entries = []
+ for episode in video_data.get('archiveEpisodes', []):
+ episode_url = episode.get('url')
+ if not episode_url:
+ continue
+ entries.append(self.url_result(
+ episode_url, 'AdultSwim', episode.get('id')))
+ return self.playlist_result(
+ entries, video_data.get('id'), video_data.get('title'),
+ strip_or_none(video_data.get('description')))
else:
- collections = bootstrapped_data['show']['collections']
- collection, video_info = self.find_collection_containing_video(collections, episode_path)
- # Video wasn't found in the collections, let's try `slugged_video`.
- if video_info is None:
- if bootstrapped_data.get('slugged_video', {}).get('slug') == episode_path:
- video_info = bootstrapped_data['slugged_video']
- if not video_info:
- video_info = bootstrapped_data.get(
- 'heroMetadata', {}).get('trailer', {}).get('video')
- if not video_info:
- video_info = bootstrapped_data.get('onlineOriginals', [None])[0]
- if not video_info:
- raise ExtractorError('Unable to find video info')
-
- show = bootstrapped_data['show']
- show_title = show['title']
- stream = video_info.get('stream')
- if stream and stream.get('videoPlaybackID'):
- segment_ids = [stream['videoPlaybackID']]
- elif video_info.get('clips'):
- segment_ids = [clip['videoPlaybackID'] for clip in video_info['clips']]
- elif video_info.get('videoPlaybackID'):
- segment_ids = [video_info['videoPlaybackID']]
- elif video_info.get('id'):
- segment_ids = [video_info['id']]
- else:
- if video_info.get('auth') is True:
- raise ExtractorError(
- 'This video is only available via cable service provider subscription that'
- ' is not currently supported. You may want to use --cookies.', expected=True)
- else:
- raise ExtractorError('Unable to find stream or clips')
-
- episode_id = video_info['id']
- episode_title = video_info['title']
- episode_description = video_info.get('description')
- episode_duration = int_or_none(video_info.get('duration'))
- view_count = int_or_none(video_info.get('views'))
+ show_data = initial_data['show']
+
+ if not episode_path:
+ entries = []
+ for video in show_data.get('videos', []):
+ slug = video.get('slug')
+ if not slug:
+ continue
+ entries.append(self.url_result(
+ 'http://adultswim.com/videos/%s/%s' % (show_path, slug),
+ 'AdultSwim', video.get('id')))
+ return self.playlist_result(
+ entries, show_data.get('id'), show_data.get('title'),
+ strip_or_none(show_data.get('metadata', {}).get('description')))
+
+ video_data = show_data['sluggedVideo']
+ video_id = video_data['id']
+
+ info = self._extract_cvp_info(
+ 'http://www.adultswim.com/videos/api/v0/assets?platform=desktop&id=' + video_id,
+ video_id, {
+ 'secure': {
+ 'media_src': 'http://androidhls-secure.cdn.turner.com/adultswim/big',
+ 'tokenizer_src': 'http://www.adultswim.com/astv/mvpd/processors/services/token_ipadAdobe.do',
+ },
+ }, {
+ 'url': url,
+ 'site_name': 'AdultSwim',
+ 'auth_required': video_data.get('auth'),
+ })
- entries = []
- for part_num, segment_id in enumerate(segment_ids):
- segement_info = self._extract_cvp_info(
- 'http://www.adultswim.com/videos/api/v0/assets?id=%s&platform=desktop' % segment_id,
- segment_id, {
- 'secure': {
- 'media_src': 'http://androidhls-secure.cdn.turner.com/adultswim/big',
- 'tokenizer_src': 'http://www.adultswim.com/astv/mvpd/processors/services/token_ipadAdobe.do',
- },
- })
- segment_title = '%s - %s' % (show_title, episode_title)
- if len(segment_ids) > 1:
- segment_title += ' Part %d' % (part_num + 1)
- segement_info.update({
- 'id': segment_id,
- 'title': segment_title,
- 'description': episode_description,
+ info.update({
+ 'id': video_id,
+ 'display_id': display_id,
+ 'description': info.get('description') or strip_or_none(video_data.get('description')),
+ })
+ if not is_stream:
+ info.update({
+ 'duration': info.get('duration') or int_or_none(video_data.get('duration')),
+ 'timestamp': info.get('timestamp') or int_or_none(video_data.get('launch_date')),
+ 'season_number': info.get('season_number') or int_or_none(video_data.get('season_number')),
+ 'episode': info['title'],
+ 'episode_number': info.get('episode_number') or int_or_none(video_data.get('episode_number')),
})
- entries.append(segement_info)
- return {
- '_type': 'playlist',
- 'id': episode_id,
- 'display_id': episode_path,
- 'entries': entries,
- 'title': '%s - %s' % (show_title, episode_title),
- 'description': episode_description,
- 'duration': episode_duration,
- 'view_count': view_count,
- }
+ info['series'] = video_data.get('collection_title') or info.get('series')
+ if info['series'] and info['series'] != info['title']:
+ info['title'] = '%s - %s' % (info['series'], info['title'])
+
+ return info
diff --git a/youtube_dl/extractor/aenetworks.py b/youtube_dl/extractor/aenetworks.py
index c01c67303..2dcdba9d2 100644
--- a/youtube_dl/extractor/aenetworks.py
+++ b/youtube_dl/extractor/aenetworks.py
@@ -101,10 +101,14 @@ class AENetworksIE(AENetworksBaseIE):
for season_url_path in re.findall(r'(?s)<li[^>]+data-href="(/shows/%s/season-\d+)"' % url_parts[0], webpage):
entries.append(self.url_result(
compat_urlparse.urljoin(url, season_url_path), 'AENetworks'))
- return self.playlist_result(
- entries, self._html_search_meta('aetn:SeriesId', webpage),
- self._html_search_meta('aetn:SeriesTitle', webpage))
- elif url_parts_len == 2:
+ if entries:
+ return self.playlist_result(
+ entries, self._html_search_meta('aetn:SeriesId', webpage),
+ self._html_search_meta('aetn:SeriesTitle', webpage))
+ else:
+ # single season
+ url_parts_len = 2
+ if url_parts_len == 2:
entries = []
for episode_item in re.findall(r'(?s)<[^>]+class="[^"]*(?:episode|program)-item[^"]*"[^>]*>', webpage):
episode_attributes = extract_attributes(episode_item)
@@ -112,7 +116,7 @@ class AENetworksIE(AENetworksBaseIE):
url, episode_attributes['data-canonical'])
entries.append(self.url_result(
episode_url, 'AENetworks',
- episode_attributes['data-videoid']))
+ episode_attributes.get('data-videoid') or episode_attributes.get('data-video-id')))
return self.playlist_result(
entries, self._html_search_meta('aetn:SeasonId', webpage))
diff --git a/youtube_dl/extractor/afreecatv.py b/youtube_dl/extractor/afreecatv.py
index 78d29c861..c8cb91dcb 100644
--- a/youtube_dl/extractor/afreecatv.py
+++ b/youtube_dl/extractor/afreecatv.py
@@ -207,11 +207,10 @@ class AfreecaTVIE(InfoExtractor):
file_url, video_id, 'mp4', entry_protocol='m3u8_native',
m3u8_id='hls',
note='Downloading part %d m3u8 information' % file_num)
- title = title if one else '%s (part %d)' % (title, file_num)
file_info = common_entry.copy()
file_info.update({
'id': format_id,
- 'title': title,
+ 'title': title if one else '%s (part %d)' % (title, file_num),
'upload_date': upload_date,
'duration': file_duration,
'formats': formats,
diff --git a/youtube_dl/extractor/aljazeera.py b/youtube_dl/extractor/aljazeera.py
index 388e578d5..c68be3134 100644
--- a/youtube_dl/extractor/aljazeera.py
+++ b/youtube_dl/extractor/aljazeera.py
@@ -4,9 +4,9 @@ from .common import InfoExtractor
class AlJazeeraIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?aljazeera\.com/programmes/.*?/(?P<id>[^/]+)\.html'
+ _VALID_URL = r'https?://(?:www\.)?aljazeera\.com/(?:programmes|video)/.*?/(?P<id>[^/]+)\.html'
- _TEST = {
+ _TESTS = [{
'url': 'http://www.aljazeera.com/programmes/the-slum/2014/08/deliverance-201482883754237240.html',
'info_dict': {
'id': '3792260579001',
@@ -19,7 +19,10 @@ class AlJazeeraIE(InfoExtractor):
},
'add_ie': ['BrightcoveNew'],
'skip': 'Not accessible from Travis CI server',
- }
+ }, {
+ 'url': 'http://www.aljazeera.com/video/news/2017/05/sierra-leone-709-carat-diamond-auctioned-170511100111930.html',
+ 'only_matching': True,
+ }]
BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/665003303001/default_default/index.html?videoId=%s'
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/amp.py b/youtube_dl/extractor/amp.py
index 98f8e69cd..fde1a8ff7 100644
--- a/youtube_dl/extractor/amp.py
+++ b/youtube_dl/extractor/amp.py
@@ -34,9 +34,12 @@ class AMPIE(InfoExtractor):
if isinstance(media_thumbnail, dict):
media_thumbnail = [media_thumbnail]
for thumbnail_data in media_thumbnail:
- thumbnail = thumbnail_data['@attributes']
+ thumbnail = thumbnail_data.get('@attributes', {})
+ thumbnail_url = thumbnail.get('url')
+ if not thumbnail_url:
+ continue
thumbnails.append({
- 'url': self._proto_relative_url(thumbnail['url'], 'http:'),
+ 'url': self._proto_relative_url(thumbnail_url, 'http:'),
'width': int_or_none(thumbnail.get('width')),
'height': int_or_none(thumbnail.get('height')),
})
@@ -47,9 +50,14 @@ class AMPIE(InfoExtractor):
if isinstance(media_subtitle, dict):
media_subtitle = [media_subtitle]
for subtitle_data in media_subtitle:
- subtitle = subtitle_data['@attributes']
- lang = subtitle.get('lang') or 'en'
- subtitles[lang] = [{'url': subtitle['href']}]
+ subtitle = subtitle_data.get('@attributes', {})
+ subtitle_href = subtitle.get('href')
+ if not subtitle_href:
+ continue
+ subtitles.setdefault(subtitle.get('lang') or 'en', []).append({
+ 'url': subtitle_href,
+ 'ext': mimetype2ext(subtitle.get('type')) or determine_ext(subtitle_href),
+ })
formats = []
media_content = get_media_node('content')
diff --git a/youtube_dl/extractor/anvato.py b/youtube_dl/extractor/anvato.py
index 623f44dce..8023da702 100644
--- a/youtube_dl/extractor/anvato.py
+++ b/youtube_dl/extractor/anvato.py
@@ -5,6 +5,7 @@ import base64
import hashlib
import json
import random
+import re
import time
from .common import InfoExtractor
@@ -16,6 +17,7 @@ from ..utils import (
intlist_to_bytes,
int_or_none,
strip_jsonp,
+ unescapeHTML,
)
@@ -26,6 +28,8 @@ def md5_text(s):
class AnvatoIE(InfoExtractor):
+ _VALID_URL = r'anvato:(?P<access_key_or_mcp>[^:]+):(?P<id>\d+)'
+
# Copied from anvplayer.min.js
_ANVACK_TABLE = {
'nbcu_nbcd_desktop_web_prod_93d8ead38ce2024f8f544b78306fbd15895ae5e6': 'NNemUkySjxLyPTKvZRiGntBIjEyK8uqicjMakIaQ',
@@ -114,6 +118,22 @@ class AnvatoIE(InfoExtractor):
'nbcu_nbcd_desktop_web_prod_93d8ead38ce2024f8f544b78306fbd15895ae5e6_secure': 'NNemUkySjxLyPTKvZRiGntBIjEyK8uqicjMakIaQ'
}
+ _MCP_TO_ACCESS_KEY_TABLE = {
+ 'qa': 'anvato_mcpqa_demo_web_stage_18b55e00db5a13faa8d03ae6e41f6f5bcb15b922',
+ 'lin': 'anvato_mcp_lin_web_prod_4c36fbfd4d8d8ecae6488656e21ac6d1ac972749',
+ 'univison': 'anvato_mcp_univision_web_prod_37fe34850c99a3b5cdb71dab10a417dd5cdecafa',
+ 'uni': 'anvato_mcp_univision_web_prod_37fe34850c99a3b5cdb71dab10a417dd5cdecafa',
+ 'dev': 'anvato_mcp_fs2go_web_prod_c7b90a93e171469cdca00a931211a2f556370d0a',
+ 'sps': 'anvato_mcp_sps_web_prod_54bdc90dd6ba21710e9f7074338365bba28da336',
+ 'spsstg': 'anvato_mcp_sps_web_prod_54bdc90dd6ba21710e9f7074338365bba28da336',
+ 'anv': 'anvato_mcp_anv_web_prod_791407490f4c1ef2a4bcb21103e0cb1bcb3352b3',
+ 'gray': 'anvato_mcp_gray_web_prod_4c10f067c393ed8fc453d3930f8ab2b159973900',
+ 'hearst': 'anvato_mcp_hearst_web_prod_5356c3de0fc7c90a3727b4863ca7fec3a4524a99',
+ 'cbs': 'anvato_mcp_cbs_web_prod_02f26581ff80e5bda7aad28226a8d369037f2cbe',
+ 'telemundo': 'anvato_mcp_telemundo_web_prod_c5278d51ad46fda4b6ca3d0ea44a7846a054f582'
+ }
+
+ _ANVP_RE = r'<script[^>]+\bdata-anvp\s*=\s*(["\'])(?P<anvp>(?:(?!\1).)+)\1'
_AUTH_KEY = b'\x31\xc2\x42\x84\x9e\x73\xa0\xce'
def __init__(self, *args, **kwargs):
@@ -178,12 +198,7 @@ class AnvatoIE(InfoExtractor):
}
if ext == 'm3u8' or media_format in ('m3u8', 'm3u8-variant'):
- # Not using _extract_m3u8_formats here as individual media
- # playlists are also included in published_urls.
- if tbr is None:
- formats.append(self._m3u8_meta_format(video_url, ext='mp4', m3u8_id='hls'))
- continue
- else:
+ if tbr is not None:
a_format.update({
'format_id': '-'.join(filter(None, ['hls', compat_str(tbr)])),
'ext': 'mp4',
@@ -222,9 +237,42 @@ class AnvatoIE(InfoExtractor):
'subtitles': subtitles,
}
+ @staticmethod
+ def _extract_urls(ie, webpage, video_id):
+ entries = []
+ for mobj in re.finditer(AnvatoIE._ANVP_RE, webpage):
+ anvplayer_data = ie._parse_json(
+ mobj.group('anvp'), video_id, transform_source=unescapeHTML,
+ fatal=False)
+ if not anvplayer_data:
+ continue
+ video = anvplayer_data.get('video')
+ if not isinstance(video, compat_str) or not video.isdigit():
+ continue
+ access_key = anvplayer_data.get('accessKey')
+ if not access_key:
+ mcp = anvplayer_data.get('mcp')
+ if mcp:
+ access_key = AnvatoIE._MCP_TO_ACCESS_KEY_TABLE.get(
+ mcp.lower())
+ if not access_key:
+ continue
+ entries.append(ie.url_result(
+ 'anvato:%s:%s' % (access_key, video), ie=AnvatoIE.ie_key(),
+ video_id=video))
+ return entries
+
def _extract_anvato_videos(self, webpage, video_id):
- anvplayer_data = self._parse_json(self._html_search_regex(
- r'<script[^>]+data-anvp=\'([^\']+)\'', webpage,
- 'Anvato player data'), video_id)
+ anvplayer_data = self._parse_json(
+ self._html_search_regex(
+ self._ANVP_RE, webpage, 'Anvato player data', group='anvp'),
+ video_id)
return self._get_anvato_videos(
anvplayer_data['accessKey'], anvplayer_data['video'])
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ access_key, video_id = mobj.group('access_key_or_mcp', 'id')
+ if access_key not in self._ANVACK_TABLE:
+ access_key = self._MCP_TO_ACCESS_KEY_TABLE[access_key]
+ return self._get_anvato_videos(access_key, video_id)
diff --git a/youtube_dl/extractor/appleconnect.py b/youtube_dl/extractor/appleconnect.py
index ea7a70393..a84b8b1eb 100644
--- a/youtube_dl/extractor/appleconnect.py
+++ b/youtube_dl/extractor/appleconnect.py
@@ -12,13 +12,13 @@ class AppleConnectIE(InfoExtractor):
_VALID_URL = r'https?://itunes\.apple\.com/\w{0,2}/?post/idsa\.(?P<id>[\w-]+)'
_TEST = {
'url': 'https://itunes.apple.com/us/post/idsa.4ab17a39-2720-11e5-96c5-a5b38f6c42d3',
- 'md5': '10d0f2799111df4cb1c924520ca78f98',
+ 'md5': 'e7c38568a01ea45402570e6029206723',
'info_dict': {
'id': '4ab17a39-2720-11e5-96c5-a5b38f6c42d3',
'ext': 'm4v',
'title': 'Energy',
'uploader': 'Drake',
- 'thumbnail': 'http://is5.mzstatic.com/image/thumb/Video5/v4/78/61/c5/7861c5fa-ad6d-294b-1464-cf7605b911d6/source/1920x1080sr.jpg',
+ 'thumbnail': r're:^https?://.*\.jpg$',
'upload_date': '20150710',
'timestamp': 1436545535,
},
diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py
index a6801f3d4..b45b431e1 100644
--- a/youtube_dl/extractor/appletrailers.py
+++ b/youtube_dl/extractor/appletrailers.py
@@ -70,7 +70,8 @@ class AppleTrailersIE(InfoExtractor):
}, {
'url': 'http://trailers.apple.com/trailers/magnolia/blackthorn/',
'info_dict': {
- 'id': 'blackthorn',
+ 'id': '4489',
+ 'title': 'Blackthorn',
},
'playlist_mincount': 2,
'expected_warnings': ['Unable to download JSON metadata'],
@@ -261,7 +262,7 @@ class AppleTrailersSectionIE(InfoExtractor):
'title': 'Most Popular',
'id': 'mostpopular',
},
- 'playlist_mincount': 80,
+ 'playlist_mincount': 30,
}, {
'url': 'http://trailers.apple.com/#section=moviestudios',
'info_dict': {
diff --git a/youtube_dl/extractor/archiveorg.py b/youtube_dl/extractor/archiveorg.py
index e21045bed..3c7d7250b 100644
--- a/youtube_dl/extractor/archiveorg.py
+++ b/youtube_dl/extractor/archiveorg.py
@@ -24,12 +24,12 @@ class ArchiveOrgIE(InfoExtractor):
}
}, {
'url': 'https://archive.org/details/Cops1922',
- 'md5': 'bc73c8ab3838b5a8fc6c6651fa7b58ba',
+ 'md5': '0869000b4ce265e8ca62738b336b268a',
'info_dict': {
'id': 'Cops1922',
'ext': 'mp4',
'title': 'Buster Keaton\'s "Cops" (1922)',
- 'description': 'md5:b4544662605877edd99df22f9620d858',
+ 'description': 'md5:89e7c77bf5d965dd5c0372cfb49470f6',
}
}, {
'url': 'http://archive.org/embed/XD300-23_68HighlightsAResearchCntAugHumanIntellect',
diff --git a/youtube_dl/extractor/atresplayer.py b/youtube_dl/extractor/atresplayer.py
index 99af6dc5a..01fa308ff 100644
--- a/youtube_dl/extractor/atresplayer.py
+++ b/youtube_dl/extractor/atresplayer.py
@@ -36,7 +36,7 @@ class AtresPlayerIE(InfoExtractor):
},
{
'url': 'http://www.atresplayer.com/television/especial/videoencuentros/temporada-1/capitulo-112-david-bustamante_2014121600375.html',
- 'md5': '0d0e918533bbd4b263f2de4d197d4aac',
+ 'md5': '6e52cbb513c405e403dbacb7aacf8747',
'info_dict': {
'id': 'capitulo-112-david-bustamante',
'ext': 'flv',
diff --git a/youtube_dl/extractor/audioboom.py b/youtube_dl/extractor/audioboom.py
index 8fc5f65c6..e48bb8972 100644
--- a/youtube_dl/extractor/audioboom.py
+++ b/youtube_dl/extractor/audioboom.py
@@ -16,7 +16,7 @@ class AudioBoomIE(InfoExtractor):
'title': '3/09/2016 Czaban Hour 3',
'description': 'Guest: Nate Davis - NFL free agency, Guest: Stan Gans',
'duration': 2245.72,
- 'uploader': 'Steve Czaban',
+ 'uploader': 'SB Nation A.M.',
'uploader_url': r're:https?://(?:www\.)?audioboom\.com/channel/steveczabanyahoosportsradio',
}
}, {
diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py
index 056e06376..489d0ba53 100644
--- a/youtube_dl/extractor/bandcamp.py
+++ b/youtube_dl/extractor/bandcamp.py
@@ -34,12 +34,12 @@ class BandcampIE(InfoExtractor):
'_skip': 'There is a limit of 200 free downloads / month for the test song'
}, {
'url': 'http://benprunty.bandcamp.com/track/lanius-battle',
- 'md5': '73d0b3171568232574e45652f8720b5c',
+ 'md5': '0369ace6b939f0927e62c67a1a8d9fa7',
'info_dict': {
'id': '2650410135',
- 'ext': 'mp3',
- 'title': 'Lanius (Battle)',
- 'uploader': 'Ben Prunty Music',
+ 'ext': 'aiff',
+ 'title': 'Ben Prunty - Lanius (Battle)',
+ 'uploader': 'Ben Prunty',
},
}]
@@ -47,6 +47,7 @@ class BandcampIE(InfoExtractor):
mobj = re.match(self._VALID_URL, url)
title = mobj.group('title')
webpage = self._download_webpage(url, title)
+ thumbnail = self._html_search_meta('og:image', webpage, default=None)
m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
if not m_download:
m_trackinfo = re.search(r'trackinfo: (.+),\s*?\n', webpage)
@@ -75,6 +76,7 @@ class BandcampIE(InfoExtractor):
return {
'id': track_id,
'title': data['title'],
+ 'thumbnail': thumbnail,
'formats': formats,
'duration': float_or_none(data.get('duration')),
}
@@ -143,7 +145,7 @@ class BandcampIE(InfoExtractor):
return {
'id': video_id,
'title': title,
- 'thumbnail': info.get('thumb_url'),
+ 'thumbnail': info.get('thumb_url') or thumbnail,
'uploader': info.get('artist'),
'artist': artist,
'track': track,
diff --git a/youtube_dl/extractor/beeg.py b/youtube_dl/extractor/beeg.py
index b0b7914d8..d5c5822f2 100644
--- a/youtube_dl/extractor/beeg.py
+++ b/youtube_dl/extractor/beeg.py
@@ -16,7 +16,7 @@ class BeegIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?beeg\.com/(?P<id>\d+)'
_TEST = {
'url': 'http://beeg.com/5416503',
- 'md5': '46c384def73b33dbc581262e5ee67cef',
+ 'md5': 'a1a1b1a8bc70a89e49ccfd113aed0820',
'info_dict': {
'id': '5416503',
'ext': 'mp4',
diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py
index 80dd8382e..1e3f25515 100644
--- a/youtube_dl/extractor/bilibili.py
+++ b/youtube_dl/extractor/bilibili.py
@@ -122,6 +122,11 @@ class BiliBiliIE(InfoExtractor):
'preference': -2 if 'hd.mp4' in backup_url else -3,
})
+ for a_format in formats:
+ a_format.setdefault('http_headers', {}).update({
+ 'Referer': url,
+ })
+
self._sort_formats(formats)
entries.append({
diff --git a/youtube_dl/extractor/bleacherreport.py b/youtube_dl/extractor/bleacherreport.py
index 7a8e1f60b..e829974ff 100644
--- a/youtube_dl/extractor/bleacherreport.py
+++ b/youtube_dl/extractor/bleacherreport.py
@@ -35,7 +35,7 @@ class BleacherReportIE(InfoExtractor):
'title': 'Aussie Golfers Get Fright of Their Lives After Being Chased by Angry Kangaroo',
'timestamp': 1446839961,
'uploader': 'Sean Fay',
- 'description': 'md5:825e94e0f3521df52fa83b2ed198fa20',
+ 'description': 'md5:b1601e2314c4d8eec23b6eafe086a757',
'uploader_id': 6466954,
'upload_date': '20151011',
},
@@ -90,17 +90,13 @@ class BleacherReportCMSIE(AMPIE):
_VALID_URL = r'https?://(?:www\.)?bleacherreport\.com/video_embed\?id=(?P<id>[0-9a-f-]{36})'
_TESTS = [{
'url': 'http://bleacherreport.com/video_embed?id=8fd44c2f-3dc5-4821-9118-2c825a98c0e1',
- 'md5': '8c2c12e3af7805152675446c905d159b',
+ 'md5': '2e4b0a997f9228ffa31fada5c53d1ed1',
'info_dict': {
'id': '8fd44c2f-3dc5-4821-9118-2c825a98c0e1',
- 'ext': 'mp4',
+ 'ext': 'flv',
'title': 'Cena vs. Rollins Would Expose the Heavyweight Division',
'description': 'md5:984afb4ade2f9c0db35f3267ed88b36e',
},
- 'params': {
- # m3u8 download
- 'skip_download': True,
- },
}]
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/br.py b/youtube_dl/extractor/br.py
index ff0aa11b1..2c32b6ae2 100644
--- a/youtube_dl/extractor/br.py
+++ b/youtube_dl/extractor/br.py
@@ -77,7 +77,7 @@ class BRIE(InfoExtractor):
'description': 'md5:bb659990e9e59905c3d41e369db1fbe3',
'duration': 893,
'uploader': 'Eva Maria Steimle',
- 'upload_date': '20140117',
+ 'upload_date': '20170208',
}
},
]
diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py
index 97602ca30..0ed59bcbc 100644
--- a/youtube_dl/extractor/brightcove.py
+++ b/youtube_dl/extractor/brightcove.py
@@ -5,6 +5,7 @@ import re
import json
from .common import InfoExtractor
+from .adobepass import AdobePassIE
from ..compat import (
compat_etree_fromstring,
compat_parse_qs,
@@ -448,7 +449,7 @@ class BrightcoveLegacyIE(InfoExtractor):
return info
-class BrightcoveNewIE(InfoExtractor):
+class BrightcoveNewIE(AdobePassIE):
IE_NAME = 'brightcove:new'
_VALID_URL = r'https?://players\.brightcove\.net/(?P<account_id>\d+)/(?P<player_id>[^/]+)_(?P<embed>[^/]+)/index\.html\?.*videoId=(?P<video_id>\d+|ref:[^&]+)'
_TESTS = [{
@@ -522,7 +523,7 @@ class BrightcoveNewIE(InfoExtractor):
# [2] looks like:
for video, script_tag, account_id, player_id, embed in re.findall(
r'''(?isx)
- (<video\s+[^>]*data-video-id=['"]?[^>]+>)
+ (<video\s+[^>]*\bdata-video-id\s*=\s*['"]?[^>]+>)
(?:.*?
(<script[^>]+
src=["\'](?:https?:)?//players\.brightcove\.net/
@@ -602,6 +603,20 @@ class BrightcoveNewIE(InfoExtractor):
raise ExtractorError(message, expected=True)
raise
+ errors = json_data.get('errors')
+ if errors and errors[0].get('error_subcode') == 'TVE_AUTH':
+ custom_fields = json_data['custom_fields']
+ tve_token = self._extract_mvpd_auth(
+ smuggled_data['source_url'], video_id,
+ custom_fields['bcadobepassrequestorid'],
+ custom_fields['bcadobepassresourceid'])
+ json_data = self._download_json(
+ api_url, video_id, headers={
+ 'Accept': 'application/json;pk=%s' % policy_key
+ }, query={
+ 'tveToken': tve_token,
+ })
+
title = json_data['name'].strip()
formats = []
@@ -667,7 +682,6 @@ class BrightcoveNewIE(InfoExtractor):
})
formats.append(f)
- errors = json_data.get('errors')
if not formats and errors:
error = errors[0]
raise ExtractorError(
@@ -684,7 +698,7 @@ class BrightcoveNewIE(InfoExtractor):
is_live = False
duration = float_or_none(json_data.get('duration'), 1000)
- if duration and duration < 0:
+ if duration is not None and duration <= 0:
is_live = True
return {
diff --git a/youtube_dl/extractor/canalc2.py b/youtube_dl/extractor/canalc2.py
index f1f128c45..acd87e371 100644
--- a/youtube_dl/extractor/canalc2.py
+++ b/youtube_dl/extractor/canalc2.py
@@ -16,13 +16,10 @@ class Canalc2IE(InfoExtractor):
'md5': '060158428b650f896c542dfbb3d6487f',
'info_dict': {
'id': '12163',
- 'ext': 'flv',
+ 'ext': 'mp4',
'title': 'Terrasses du Numérique',
'duration': 122,
},
- 'params': {
- 'skip_download': True, # Requires rtmpdump
- }
}, {
'url': 'http://archives-canalc2.u-strasbg.fr/video.asp?idVideo=11427&voir=oui',
'only_matching': True,
diff --git a/youtube_dl/extractor/cbc.py b/youtube_dl/extractor/cbc.py
index cf678e7f8..87ad14e91 100644
--- a/youtube_dl/extractor/cbc.py
+++ b/youtube_dl/extractor/cbc.py
@@ -96,6 +96,7 @@ class CBCIE(InfoExtractor):
'info_dict': {
'title': 'Keep Rover active during the deep freeze with doggie pushups and other fun indoor tasks',
'id': 'dog-indoor-exercise-winter-1.3928238',
+ 'description': 'md5:c18552e41726ee95bd75210d1ca9194c',
},
'playlist_mincount': 6,
}]
@@ -165,12 +166,11 @@ class CBCPlayerIE(InfoExtractor):
'uploader': 'CBCC-NEW',
},
}, {
- # available only when we add `formats=MPEG4,FLV,MP3` to theplatform url
'url': 'http://www.cbc.ca/player/play/2164402062',
- 'md5': '17a61eb813539abea40618d6323a7f82',
+ 'md5': '33fcd8f6719b9dd60a5e73adcb83b9f6',
'info_dict': {
'id': '2164402062',
- 'ext': 'flv',
+ 'ext': 'mp4',
'title': 'Cancer survivor four times over',
'description': 'Tim Mayer has beaten three different forms of cancer four times in five years.',
'timestamp': 1320410746,
diff --git a/youtube_dl/extractor/cbslocal.py b/youtube_dl/extractor/cbslocal.py
index 8d5f11dd1..7d78e3aae 100644
--- a/youtube_dl/extractor/cbslocal.py
+++ b/youtube_dl/extractor/cbslocal.py
@@ -60,8 +60,8 @@ class CBSLocalIE(AnvatoIE):
'title': 'A Very Blue Anniversary',
'description': 'CBS2’s Cindy Hsu has more.',
'thumbnail': 're:^https?://.*',
- 'timestamp': 1479962220,
- 'upload_date': '20161124',
+ 'timestamp': int,
+ 'upload_date': r're:^\d{8}$',
'uploader': 'CBS',
'subtitles': {
'en': 'mincount:5',
diff --git a/youtube_dl/extractor/cda.py b/youtube_dl/extractor/cda.py
index 1ee35b501..78b7a923c 100755
--- a/youtube_dl/extractor/cda.py
+++ b/youtube_dl/extractor/cda.py
@@ -9,7 +9,10 @@ from ..utils import (
ExtractorError,
float_or_none,
int_or_none,
+ multipart_encode,
parse_duration,
+ random_birthday,
+ urljoin,
)
@@ -27,7 +30,8 @@ class CDAIE(InfoExtractor):
'description': 'md5:269ccd135d550da90d1662651fcb9772',
'thumbnail': r're:^https?://.*\.jpg$',
'average_rating': float,
- 'duration': 39
+ 'duration': 39,
+ 'age_limit': 0,
}
}, {
'url': 'http://www.cda.pl/video/57413289',
@@ -41,13 +45,41 @@ class CDAIE(InfoExtractor):
'uploader': 'crash404',
'view_count': int,
'average_rating': float,
- 'duration': 137
+ 'duration': 137,
+ 'age_limit': 0,
}
}, {
+ # Age-restricted
+ 'url': 'http://www.cda.pl/video/1273454c4',
+ 'info_dict': {
+ 'id': '1273454c4',
+ 'ext': 'mp4',
+ 'title': 'Bronson (2008) napisy HD 1080p',
+ 'description': 'md5:1b6cb18508daf2dc4e0fa4db77fec24c',
+ 'height': 1080,
+ 'uploader': 'boniek61',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'duration': 5554,
+ 'age_limit': 18,
+ 'view_count': int,
+ 'average_rating': float,
+ },
+ }, {
'url': 'http://ebd.cda.pl/0x0/5749950c',
'only_matching': True,
}]
+ def _download_age_confirm_page(self, url, video_id, *args, **kwargs):
+ form_data = random_birthday('rok', 'miesiac', 'dzien')
+ form_data.update({'return': url, 'module': 'video', 'module_id': video_id})
+ data, content_type = multipart_encode(form_data)
+ return self._download_webpage(
+ urljoin(url, '/a/validatebirth'), video_id, *args,
+ data=data, headers={
+ 'Referer': url,
+ 'Content-Type': content_type,
+ }, **kwargs)
+
def _real_extract(self, url):
video_id = self._match_id(url)
self._set_cookie('cda.pl', 'cda.player', 'html5')
@@ -57,6 +89,13 @@ class CDAIE(InfoExtractor):
if 'Ten film jest dostępny dla użytkowników premium' in webpage:
raise ExtractorError('This video is only available for premium users.', expected=True)
+ need_confirm_age = False
+ if self._html_search_regex(r'(<form[^>]+action="/a/validatebirth")',
+ webpage, 'birthday validate form', default=None):
+ webpage = self._download_age_confirm_page(
+ url, video_id, note='Confirming age')
+ need_confirm_age = True
+
formats = []
uploader = self._search_regex(r'''(?x)
@@ -81,6 +120,7 @@ class CDAIE(InfoExtractor):
'thumbnail': self._og_search_thumbnail(webpage),
'formats': formats,
'duration': None,
+ 'age_limit': 18 if need_confirm_age else 0,
}
def extract_format(page, version):
@@ -121,7 +161,12 @@ class CDAIE(InfoExtractor):
for href, resolution in re.findall(
r'<a[^>]+data-quality="[^"]+"[^>]+href="([^"]+)"[^>]+class="quality-btn"[^>]*>([0-9]+p)',
webpage):
- webpage = self._download_webpage(
+ if need_confirm_age:
+ handler = self._download_age_confirm_page
+ else:
+ handler = self._download_webpage
+
+ webpage = handler(
self._BASE_URL + href, video_id,
'Downloading %s version information' % resolution, fatal=False)
if not webpage:
@@ -129,6 +174,7 @@ class CDAIE(InfoExtractor):
# invalid version is requested.
self.report_warning('Unable to download %s version information' % resolution)
continue
+
extract_format(webpage, resolution)
self._sort_formats(formats)
diff --git a/youtube_dl/extractor/clipfish.py b/youtube_dl/extractor/clipfish.py
index bb52e0c6f..0920f6219 100644
--- a/youtube_dl/extractor/clipfish.py
+++ b/youtube_dl/extractor/clipfish.py
@@ -12,7 +12,7 @@ class ClipfishIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?clipfish\.de/(?:[^/]+/)+video/(?P<id>[0-9]+)'
_TEST = {
'url': 'http://www.clipfish.de/special/ugly-americans/video/4343170/s01-e01-ugly-americans-date-in-der-hoelle/',
- 'md5': '720563e467b86374c194bdead08d207d',
+ 'md5': 'b9a5dc46294154c1193e2d10e0c95693',
'info_dict': {
'id': '4343170',
'ext': 'mp4',
diff --git a/youtube_dl/extractor/collegerama.py b/youtube_dl/extractor/collegerama.py
index 18c734766..6a41db87c 100644
--- a/youtube_dl/extractor/collegerama.py
+++ b/youtube_dl/extractor/collegerama.py
@@ -21,7 +21,7 @@ class CollegeRamaIE(InfoExtractor):
'ext': 'mp4',
'title': 'Een nieuwe wereld: waarden, bewustzijn en techniek van de mensheid 2.0.',
'description': '',
- 'thumbnail': r're:^https?://.*\.jpg$',
+ 'thumbnail': r're:^https?://.*\.jpg(?:\?.*?)?$',
'duration': 7713.088,
'timestamp': 1413309600,
'upload_date': '20141014',
@@ -35,6 +35,7 @@ class CollegeRamaIE(InfoExtractor):
'ext': 'wmv',
'title': '64ste Vakantiecursus: Afvalwater',
'description': 'md5:7fd774865cc69d972f542b157c328305',
+ 'thumbnail': r're:^https?://.*\.jpg(?:\?.*?)?$',
'duration': 10853,
'timestamp': 1326446400,
'upload_date': '20120113',
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index 8b3f04c61..fec39da8b 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -245,6 +245,10 @@ class InfoExtractor(object):
specified in the URL.
end_time: Time in seconds where the reproduction should end, as
specified in the URL.
+ chapters: A list of dictionaries, with the following entries:
+ * "start_time" - The start time of the chapter in seconds
+ * "end_time" - The end time of the chapter in seconds
+ * "title" (optional, string)
The following fields should only be used when the video belongs to some logical
chapter or section:
@@ -990,6 +994,7 @@ class InfoExtractor(object):
'tbr': int_or_none(e.get('bitrate')),
'width': int_or_none(e.get('width')),
'height': int_or_none(e.get('height')),
+ 'view_count': int_or_none(e.get('interactionCount')),
})
for e in json_ld:
@@ -1334,7 +1339,7 @@ class InfoExtractor(object):
if '#EXT-X-FAXS-CM:' in m3u8_doc: # Adobe Flash Access
return []
- formats = [self._m3u8_meta_format(m3u8_url, ext, preference, m3u8_id)]
+ formats = []
format_url = lambda u: (
u
@@ -1386,6 +1391,7 @@ class InfoExtractor(object):
f = {
'format_id': '-'.join(format_id),
'url': format_url(media_url),
+ 'manifest_url': m3u8_url,
'language': media.get('LANGUAGE'),
'ext': ext,
'protocol': entry_protocol,
@@ -1438,7 +1444,7 @@ class InfoExtractor(object):
f = {
'format_id': '-'.join(format_id),
'url': manifest_url,
- 'manifest_url': manifest_url,
+ 'manifest_url': m3u8_url,
'tbr': tbr,
'ext': ext,
'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
@@ -1995,6 +2001,12 @@ class InfoExtractor(object):
compat_etree_fromstring(ism.encode('utf-8')), urlh.geturl(), ism_id)
def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
+ """
+ Parse formats from ISM manifest.
+ References:
+ 1. [MS-SSTR]: Smooth Streaming Protocol,
+ https://msdn.microsoft.com/en-us/library/ff469518.aspx
+ """
if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None:
return []
@@ -2016,8 +2028,11 @@ class InfoExtractor(object):
self.report_warning('%s is not a supported codec' % fourcc)
continue
tbr = int(track.attrib['Bitrate']) // 1000
- width = int_or_none(track.get('MaxWidth'))
- height = int_or_none(track.get('MaxHeight'))
+ # [1] does not mention Width and Height attributes. However,
+ # they're often present while MaxWidth and MaxHeight are
+ # missing, so should be used as fallbacks
+ width = int_or_none(track.get('MaxWidth') or track.get('Width'))
+ height = int_or_none(track.get('MaxHeight') or track.get('Height'))
sampling_rate = int_or_none(track.get('SamplingRate'))
track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
@@ -2168,7 +2183,7 @@ class InfoExtractor(object):
def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
formats = []
hdcore_sign = 'hdcore=3.7.0'
- f4m_url = re.sub(r'(https?://[^/+])/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
+ f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
hds_host = hosts.get('hds')
if hds_host:
f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
@@ -2190,8 +2205,9 @@ class InfoExtractor(object):
def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
- url_base = self._search_regex(r'(?:https?|rtmp|rtsp)(://[^?]+)', url, 'format url')
- http_base_url = 'http' + url_base
+ url_base = self._search_regex(
+ r'(?:(?:https?|rtmp|rtsp):)?(//[^?]+)', url, 'format url')
+ http_base_url = '%s:%s' % ('http', url_base)
formats = []
if 'm3u8' not in skip_protocols:
formats.extend(self._extract_m3u8_formats(
@@ -2225,7 +2241,7 @@ class InfoExtractor(object):
for protocol in ('rtmp', 'rtsp'):
if protocol not in skip_protocols:
formats.append({
- 'url': protocol + url_base,
+ 'url': '%s:%s' % (protocol, url_base),
'format_id': protocol,
'protocol': protocol,
})
diff --git a/youtube_dl/extractor/condenast.py b/youtube_dl/extractor/condenast.py
index d3463b874..0c3f0c0e4 100644
--- a/youtube_dl/extractor/condenast.py
+++ b/youtube_dl/extractor/condenast.py
@@ -16,7 +16,6 @@ from ..utils import (
mimetype2ext,
orderedSet,
parse_iso8601,
- remove_end,
)
@@ -50,10 +49,17 @@ class CondeNastIE(InfoExtractor):
'wmagazine': 'W Magazine',
}
- _VALID_URL = r'https?://(?:video|www|player)\.(?P<site>%s)\.com/(?P<type>watch|series|video|embed(?:js)?)/(?P<id>[^/?#]+)' % '|'.join(_SITES.keys())
+ _VALID_URL = r'''(?x)https?://(?:video|www|player(?:-backend)?)\.(?:%s)\.com/
+ (?:
+ (?:
+ embed(?:js)?|
+ (?:script|inline)/video
+ )/(?P<id>[0-9a-f]{24})(?:/(?P<player_id>[0-9a-f]{24}))?(?:.+?\btarget=(?P<target>[^&]+))?|
+ (?P<type>watch|series|video)/(?P<display_id>[^/?#]+)
+ )''' % '|'.join(_SITES.keys())
IE_DESC = 'Condé Nast media group: %s' % ', '.join(sorted(_SITES.values()))
- EMBED_URL = r'(?:https?:)?//player\.(?P<site>%s)\.com/(?P<type>embed(?:js)?)/.+?' % '|'.join(_SITES.keys())
+ EMBED_URL = r'(?:https?:)?//player(?:-backend)?\.(?:%s)\.com/(?:embed(?:js)?|(?:script|inline)/video)/.+?' % '|'.join(_SITES.keys())
_TESTS = [{
'url': 'http://video.wired.com/watch/3d-printed-speakers-lit-with-led',
@@ -89,6 +95,12 @@ class CondeNastIE(InfoExtractor):
'upload_date': '20150916',
'timestamp': 1442434955,
}
+ }, {
+ 'url': 'https://player.cnevids.com/inline/video/59138decb57ac36b83000005.js?target=js-cne-player',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://player-backend.cnevids.com/script/video/59138decb57ac36b83000005.js',
+ 'only_matching': True,
}]
def _extract_series(self, url, webpage):
@@ -104,7 +116,7 @@ class CondeNastIE(InfoExtractor):
entries = [self.url_result(build_url(path), 'CondeNast') for path in paths]
return self.playlist_result(entries, playlist_title=title)
- def _extract_video(self, webpage, url_type):
+ def _extract_video_params(self, webpage):
query = {}
params = self._search_regex(
r'(?s)var params = {(.+?)}[;,]', webpage, 'player params', default=None)
@@ -123,17 +135,30 @@ class CondeNastIE(InfoExtractor):
'playerId': params['data-player'],
'target': params['id'],
})
- video_id = query['videoId']
+ return query
+
+ def _extract_video(self, params):
+ video_id = params['videoId']
+
video_info = None
- info_page = self._download_json(
- 'http://player.cnevids.com/player/video.js',
- video_id, 'Downloading video info', fatal=False, query=query)
- if info_page:
- video_info = info_page.get('video')
- if not video_info:
+ if params.get('playerId'):
+ info_page = self._download_json(
+ 'http://player.cnevids.com/player/video.js',
+ video_id, 'Downloading video info', fatal=False, query=params)
+ if info_page:
+ video_info = info_page.get('video')
+ if not video_info:
+ info_page = self._download_webpage(
+ 'http://player.cnevids.com/player/loader.js',
+ video_id, 'Downloading loader info', query=params)
+ else:
info_page = self._download_webpage(
- 'http://player.cnevids.com/player/loader.js',
- video_id, 'Downloading loader info', query=query)
+ 'https://player.cnevids.com/inline/video/%s.js' % video_id,
+ video_id, 'Downloading inline info', query={
+ 'target': params.get('target', 'embedplayer')
+ })
+
+ if not video_info:
video_info = self._parse_json(
self._search_regex(
r'(?s)var\s+config\s*=\s*({.+?});', info_page, 'config'),
@@ -161,9 +186,7 @@ class CondeNastIE(InfoExtractor):
})
self._sort_formats(formats)
- info = self._search_json_ld(
- webpage, video_id, fatal=False) if url_type != 'embed' else {}
- info.update({
+ return {
'id': video_id,
'formats': formats,
'title': title,
@@ -174,22 +197,26 @@ class CondeNastIE(InfoExtractor):
'series': video_info.get('series_title'),
'season': video_info.get('season_title'),
'timestamp': parse_iso8601(video_info.get('premiere_date')),
- })
- return info
+ 'categories': video_info.get('categories'),
+ }
def _real_extract(self, url):
- site, url_type, item_id = re.match(self._VALID_URL, url).groups()
+ video_id, player_id, target, url_type, display_id = re.match(self._VALID_URL, url).groups()
- # Convert JS embed to regular embed
- if url_type == 'embedjs':
- parsed_url = compat_urlparse.urlparse(url)
- url = compat_urlparse.urlunparse(parsed_url._replace(
- path=remove_end(parsed_url.path, '.js').replace('/embedjs/', '/embed/')))
- url_type = 'embed'
+ if video_id:
+ return self._extract_video({
+ 'videoId': video_id,
+ 'playerId': player_id,
+ 'target': target,
+ })
- webpage = self._download_webpage(url, item_id)
+ webpage = self._download_webpage(url, display_id)
if url_type == 'series':
return self._extract_series(url, webpage)
else:
- return self._extract_video(webpage, url_type)
+ params = self._extract_video_params(webpage)
+ info = self._search_json_ld(
+ webpage, display_id, fatal=False)
+ info.update(self._extract_video(params))
+ return info
diff --git a/youtube_dl/extractor/coub.py b/youtube_dl/extractor/coub.py
index 5fa1f006b..6ea03e65c 100644
--- a/youtube_dl/extractor/coub.py
+++ b/youtube_dl/extractor/coub.py
@@ -24,12 +24,11 @@ class CoubIE(InfoExtractor):
'duration': 4.6,
'timestamp': 1428527772,
'upload_date': '20150408',
- 'uploader': 'Артём Лоскутников',
+ 'uploader': 'Artyom Loskutnikov',
'uploader_id': 'artyom.loskutnikov',
'view_count': int,
'like_count': int,
'repost_count': int,
- 'comment_count': int,
'age_limit': 0,
},
}, {
@@ -118,7 +117,6 @@ class CoubIE(InfoExtractor):
view_count = int_or_none(coub.get('views_count') or coub.get('views_increase_count'))
like_count = int_or_none(coub.get('likes_count'))
repost_count = int_or_none(coub.get('recoubs_count'))
- comment_count = int_or_none(coub.get('comments_count'))
age_restricted = coub.get('age_restricted', coub.get('age_restricted_by_admin'))
if age_restricted is not None:
@@ -137,7 +135,6 @@ class CoubIE(InfoExtractor):
'view_count': view_count,
'like_count': like_count,
'repost_count': repost_count,
- 'comment_count': comment_count,
'age_limit': age_limit,
'formats': formats,
}
diff --git a/youtube_dl/extractor/crackle.py b/youtube_dl/extractor/crackle.py
index f919ed208..13f425b2b 100644
--- a/youtube_dl/extractor/crackle.py
+++ b/youtube_dl/extractor/crackle.py
@@ -21,9 +21,10 @@ class CrackleIE(InfoExtractor):
'season_number': 8,
'episode_number': 4,
'subtitles': {
- 'en-US': [{
- 'ext': 'ttml',
- }]
+ 'en-US': [
+ {'ext': 'vtt'},
+ {'ext': 'tt'},
+ ]
},
},
'params': {
diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py
index 2ed8b30bb..2ffa4a7f8 100644
--- a/youtube_dl/extractor/crunchyroll.py
+++ b/youtube_dl/extractor/crunchyroll.py
@@ -171,7 +171,7 @@ class CrunchyrollIE(CrunchyrollBaseIE):
'info_dict': {
'id': '727589',
'ext': 'mp4',
- 'title': "KONOSUBA -God's blessing on this wonderful world! 2 Episode 1 – Give Me Deliverance from this Judicial Injustice!",
+ 'title': "KONOSUBA -God's blessing on this wonderful world! 2 Episode 1 – Give Me Deliverance From This Judicial Injustice!",
'description': 'md5:cbcf05e528124b0f3a0a419fc805ea7d',
'thumbnail': r're:^https?://.*\.jpg$',
'uploader': 'Kadokawa Pictures Inc.',
@@ -179,7 +179,7 @@ class CrunchyrollIE(CrunchyrollBaseIE):
'series': "KONOSUBA -God's blessing on this wonderful world!",
'season': "KONOSUBA -God's blessing on this wonderful world! 2",
'season_number': 2,
- 'episode': 'Give Me Deliverance from this Judicial Injustice!',
+ 'episode': 'Give Me Deliverance From This Judicial Injustice!',
'episode_number': 1,
},
'params': {
diff --git a/youtube_dl/extractor/cspan.py b/youtube_dl/extractor/cspan.py
index d4576160b..171820e27 100644
--- a/youtube_dl/extractor/cspan.py
+++ b/youtube_dl/extractor/cspan.py
@@ -10,6 +10,7 @@ from ..utils import (
smuggle_url,
determine_ext,
ExtractorError,
+ extract_attributes,
)
from .senateisvp import SenateISVPIE
from .ustream import UstreamIE
@@ -68,6 +69,7 @@ class CSpanIE(InfoExtractor):
'uploader_id': '12987475',
},
}]
+ BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s'
def _real_extract(self, url):
video_id = self._match_id(url)
@@ -78,6 +80,19 @@ class CSpanIE(InfoExtractor):
if ustream_url:
return self.url_result(ustream_url, UstreamIE.ie_key())
+ if '&vod' not in url:
+ bc = self._search_regex(
+ r"(<[^>]+id='brightcove-player-embed'[^>]+>)",
+ webpage, 'brightcove embed', default=None)
+ if bc:
+ bc_attr = extract_attributes(bc)
+ bc_url = self.BRIGHTCOVE_URL_TEMPLATE % (
+ bc_attr.get('data-bcaccountid', '3162030207001'),
+ bc_attr.get('data-noprebcplayerid', 'SyGGpuJy3g'),
+ bc_attr.get('data-newbcplayerid', 'default'),
+ bc_attr['data-bcid'])
+ return self.url_result(smuggle_url(bc_url, {'source_url': url}))
+
# We first look for clipid, because clipprog always appears before
patterns = [r'id=\'clip(%s)\'\s*value=\'([0-9]+)\'' % t for t in ('id', 'prog')]
results = list(filter(None, (re.search(p, webpage) for p in patterns)))
diff --git a/youtube_dl/extractor/dailymail.py b/youtube_dl/extractor/dailymail.py
index 98c835bf1..538565c66 100644
--- a/youtube_dl/extractor/dailymail.py
+++ b/youtube_dl/extractor/dailymail.py
@@ -2,9 +2,11 @@
from __future__ import unicode_literals
from .common import InfoExtractor
+from ..compat import compat_str
from ..utils import (
int_or_none,
determine_protocol,
+ try_get,
unescapeHTML,
)
@@ -28,8 +30,14 @@ class DailyMailIE(InfoExtractor):
video_data = self._parse_json(self._search_regex(
r"data-opts='({.+?})'", webpage, 'video data'), video_id)
title = unescapeHTML(video_data['title'])
- video_sources = self._download_json(video_data.get(
- 'sources', {}).get('url') or 'http://www.dailymail.co.uk/api/player/%s/video-sources.json' % video_id, video_id)
+
+ sources_url = (try_get(
+ video_data,
+ (lambda x: x['plugins']['sources']['url'],
+ lambda x: x['sources']['url']), compat_str) or
+ 'http://www.dailymail.co.uk/api/player/%s/video-sources.json' % video_id)
+
+ video_sources = self._download_json(sources_url, video_id)
formats = []
for rendition in video_sources['renditions']:
diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py
index 246efde43..f8db76c18 100644
--- a/youtube_dl/extractor/dailymotion.py
+++ b/youtube_dl/extractor/dailymotion.py
@@ -38,7 +38,7 @@ class DailymotionBaseInfoExtractor(InfoExtractor):
class DailymotionIE(DailymotionBaseInfoExtractor):
- _VALID_URL = r'(?i)(?:https?://)?(?:(www|touch)\.)?dailymotion\.[a-z]{2,3}/(?:(?:embed|swf|#)/)?video/(?P<id>[^/?_]+)'
+ _VALID_URL = r'(?i)https?://(?:(www|touch)\.)?dailymotion\.[a-z]{2,3}/(?:(?:(?:embed|swf|#)/)?video|swf)/(?P<id>[^/?_]+)'
IE_NAME = 'dailymotion'
_FORMATS = [
@@ -49,68 +49,82 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
('stream_h264_hd1080_url', 'hd180'),
]
- _TESTS = [
- {
- 'url': 'https://www.dailymotion.com/video/x2iuewm_steam-machine-models-pricing-listed-on-steam-store-ign-news_videogames',
- 'md5': '2137c41a8e78554bb09225b8eb322406',
- 'info_dict': {
- 'id': 'x2iuewm',
- 'ext': 'mp4',
- 'title': 'Steam Machine Models, Pricing Listed on Steam Store - IGN News',
- 'description': 'Several come bundled with the Steam Controller.',
- 'thumbnail': r're:^https?:.*\.(?:jpg|png)$',
- 'duration': 74,
- 'timestamp': 1425657362,
- 'upload_date': '20150306',
- 'uploader': 'IGN',
- 'uploader_id': 'xijv66',
- 'age_limit': 0,
- 'view_count': int,
- }
+ _TESTS = [{
+ 'url': 'http://www.dailymotion.com/video/x5kesuj_office-christmas-party-review-jason-bateman-olivia-munn-t-j-miller_news',
+ 'md5': '074b95bdee76b9e3654137aee9c79dfe',
+ 'info_dict': {
+ 'id': 'x5kesuj',
+ 'ext': 'mp4',
+ 'title': 'Office Christmas Party Review – Jason Bateman, Olivia Munn, T.J. Miller',
+ 'description': 'Office Christmas Party Review - Jason Bateman, Olivia Munn, T.J. Miller',
+ 'thumbnail': r're:^https?:.*\.(?:jpg|png)$',
+ 'duration': 187,
+ 'timestamp': 1493651285,
+ 'upload_date': '20170501',
+ 'uploader': 'Deadline',
+ 'uploader_id': 'x1xm8ri',
+ 'age_limit': 0,
+ 'view_count': int,
+ },
+ }, {
+ 'url': 'https://www.dailymotion.com/video/x2iuewm_steam-machine-models-pricing-listed-on-steam-store-ign-news_videogames',
+ 'md5': '2137c41a8e78554bb09225b8eb322406',
+ 'info_dict': {
+ 'id': 'x2iuewm',
+ 'ext': 'mp4',
+ 'title': 'Steam Machine Models, Pricing Listed on Steam Store - IGN News',
+ 'description': 'Several come bundled with the Steam Controller.',
+ 'thumbnail': r're:^https?:.*\.(?:jpg|png)$',
+ 'duration': 74,
+ 'timestamp': 1425657362,
+ 'upload_date': '20150306',
+ 'uploader': 'IGN',
+ 'uploader_id': 'xijv66',
+ 'age_limit': 0,
+ 'view_count': int,
},
+ 'skip': 'video gone',
+ }, {
# Vevo video
- {
- 'url': 'http://www.dailymotion.com/video/x149uew_katy-perry-roar-official_musi',
- 'info_dict': {
- 'title': 'Roar (Official)',
- 'id': 'USUV71301934',
- 'ext': 'mp4',
- 'uploader': 'Katy Perry',
- 'upload_date': '20130905',
- },
- 'params': {
- 'skip_download': True,
- },
- 'skip': 'VEVO is only available in some countries',
+ 'url': 'http://www.dailymotion.com/video/x149uew_katy-perry-roar-official_musi',
+ 'info_dict': {
+ 'title': 'Roar (Official)',
+ 'id': 'USUV71301934',
+ 'ext': 'mp4',
+ 'uploader': 'Katy Perry',
+ 'upload_date': '20130905',
+ },
+ 'params': {
+ 'skip_download': True,
},
+ 'skip': 'VEVO is only available in some countries',
+ }, {
# age-restricted video
- {
- 'url': 'http://www.dailymotion.com/video/xyh2zz_leanna-decker-cyber-girl-of-the-year-desires-nude-playboy-plus_redband',
- 'md5': '0d667a7b9cebecc3c89ee93099c4159d',
- 'info_dict': {
- 'id': 'xyh2zz',
- 'ext': 'mp4',
- 'title': 'Leanna Decker - Cyber Girl Of The Year Desires Nude [Playboy Plus]',
- 'uploader': 'HotWaves1012',
- 'age_limit': 18,
- },
- 'skip': 'video gone',
+ 'url': 'http://www.dailymotion.com/video/xyh2zz_leanna-decker-cyber-girl-of-the-year-desires-nude-playboy-plus_redband',
+ 'md5': '0d667a7b9cebecc3c89ee93099c4159d',
+ 'info_dict': {
+ 'id': 'xyh2zz',
+ 'ext': 'mp4',
+ 'title': 'Leanna Decker - Cyber Girl Of The Year Desires Nude [Playboy Plus]',
+ 'uploader': 'HotWaves1012',
+ 'age_limit': 18,
},
+ 'skip': 'video gone',
+ }, {
# geo-restricted, player v5
- {
- 'url': 'http://www.dailymotion.com/video/xhza0o',
- 'only_matching': True,
- },
+ 'url': 'http://www.dailymotion.com/video/xhza0o',
+ 'only_matching': True,
+ }, {
# with subtitles
- {
- 'url': 'http://www.dailymotion.com/video/x20su5f_the-power-of-nightmares-1-the-rise-of-the-politics-of-fear-bbc-2004_news',
- 'only_matching': True,
- },
- {
- 'url': 'http://www.dailymotion.com/swf/video/x3n92nf',
- 'only_matching': True,
- }
- ]
+ 'url': 'http://www.dailymotion.com/video/x20su5f_the-power-of-nightmares-1-the-rise-of-the-politics-of-fear-bbc-2004_news',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.dailymotion.com/swf/video/x3n92nf',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.dailymotion.com/swf/x3ss1m_funny-magic-trick-barry-and-stuart_fun',
+ 'only_matching': True,
+ }]
@staticmethod
def _extract_urls(webpage):
diff --git a/youtube_dl/extractor/democracynow.py b/youtube_dl/extractor/democracynow.py
index bdfe638b4..5c9c0ecdc 100644
--- a/youtube_dl/extractor/democracynow.py
+++ b/youtube_dl/extractor/democracynow.py
@@ -21,7 +21,8 @@ class DemocracynowIE(InfoExtractor):
'info_dict': {
'id': '2015-0703-001',
'ext': 'mp4',
- 'title': 'Daily Show',
+ 'title': 'Daily Show for July 03, 2015',
+ 'description': 'md5:80eb927244d6749900de6072c7cc2c86',
},
}, {
'url': 'http://www.democracynow.org/2015/7/3/this_flag_comes_down_today_bree',
diff --git a/youtube_dl/extractor/dotsub.py b/youtube_dl/extractor/dotsub.py
index 1f75352ca..148605c0b 100644
--- a/youtube_dl/extractor/dotsub.py
+++ b/youtube_dl/extractor/dotsub.py
@@ -35,7 +35,7 @@ class DotsubIE(InfoExtractor):
'thumbnail': 're:^https?://dotsub.com/media/747bcf58-bd59-45b7-8c8c-ac312d084ee6/p',
'duration': 290,
'timestamp': 1476767794.2809999,
- 'upload_date': '20160525',
+ 'upload_date': '20161018',
'uploader': 'parthivi001',
'uploader_id': 'user52596202',
'view_count': int,
diff --git a/youtube_dl/extractor/douyutv.py b/youtube_dl/extractor/douyutv.py
index 82d8a042f..9757f4422 100644
--- a/youtube_dl/extractor/douyutv.py
+++ b/youtube_dl/extractor/douyutv.py
@@ -3,11 +3,14 @@ from __future__ import unicode_literals
import time
import hashlib
+import re
from .common import InfoExtractor
from ..utils import (
ExtractorError,
unescapeHTML,
+ unified_strdate,
+ urljoin,
)
@@ -20,7 +23,7 @@ class DouyuTVIE(InfoExtractor):
'id': '17732',
'display_id': 'iseven',
'ext': 'flv',
- 'title': 're:^清晨醒脑!T-ARA根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+ 'title': 're:^清晨醒脑!根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
'description': r're:.*m7show@163\.com.*',
'thumbnail': r're:^https?://.*\.jpg$',
'uploader': '7师傅',
@@ -51,7 +54,7 @@ class DouyuTVIE(InfoExtractor):
'id': '17732',
'display_id': '17732',
'ext': 'flv',
- 'title': 're:^清晨醒脑!T-ARA根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+ 'title': 're:^清晨醒脑!根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
'description': r're:.*m7show@163\.com.*',
'thumbnail': r're:^https?://.*\.jpg$',
'uploader': '7师傅',
@@ -117,3 +120,82 @@ class DouyuTVIE(InfoExtractor):
'uploader': uploader,
'is_live': True,
}
+
+
+class DouyuShowIE(InfoExtractor):
+ _VALID_URL = r'https?://v(?:mobile)?\.douyu\.com/show/(?P<id>[0-9a-zA-Z]+)'
+
+ _TESTS = [{
+ 'url': 'https://v.douyu.com/show/rjNBdvnVXNzvE2yw',
+ 'md5': '0c2cfd068ee2afe657801269b2d86214',
+ 'info_dict': {
+ 'id': 'rjNBdvnVXNzvE2yw',
+ 'ext': 'mp4',
+ 'title': '陈一发儿:砒霜 我有个室友系列!04-01 22点场',
+ 'duration': 7150.08,
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'uploader': '陈一发儿',
+ 'uploader_id': 'XrZwYelr5wbK',
+ 'uploader_url': 'https://v.douyu.com/author/XrZwYelr5wbK',
+ 'upload_date': '20170402',
+ },
+ }, {
+ 'url': 'https://vmobile.douyu.com/show/rjNBdvnVXNzvE2yw',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ url = url.replace('vmobile.', 'v.')
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ room_info = self._parse_json(self._search_regex(
+ r'var\s+\$ROOM\s*=\s*({.+});', webpage, 'room info'), video_id)
+
+ video_info = None
+
+ for trial in range(5):
+ # Sometimes Douyu rejects our request. Let's try it more times
+ try:
+ video_info = self._download_json(
+ 'https://vmobile.douyu.com/video/getInfo', video_id,
+ query={'vid': video_id},
+ headers={
+ 'Referer': url,
+ 'x-requested-with': 'XMLHttpRequest',
+ })
+ break
+ except ExtractorError:
+ self._sleep(1, video_id)
+
+ if not video_info:
+ raise ExtractorError('Can\'t fetch video info')
+
+ formats = self._extract_m3u8_formats(
+ video_info['data']['video_url'], video_id,
+ entry_protocol='m3u8_native', ext='mp4')
+
+ upload_date = unified_strdate(self._html_search_regex(
+ r'<em>上传时间:</em><span>([^<]+)</span>', webpage,
+ 'upload date', fatal=False))
+
+ uploader = uploader_id = uploader_url = None
+ mobj = re.search(
+ r'(?m)<a[^>]+href="/author/([0-9a-zA-Z]+)".+?<strong[^>]+title="([^"]+)"',
+ webpage)
+ if mobj:
+ uploader_id, uploader = mobj.groups()
+ uploader_url = urljoin(url, '/author/' + uploader_id)
+
+ return {
+ 'id': video_id,
+ 'title': room_info['name'],
+ 'formats': formats,
+ 'duration': room_info.get('duration'),
+ 'thumbnail': room_info.get('pic'),
+ 'upload_date': upload_date,
+ 'uploader': uploader,
+ 'uploader_id': uploader_id,
+ 'uploader_url': uploader_url,
+ }
diff --git a/youtube_dl/extractor/drtv.py b/youtube_dl/extractor/drtv.py
index e4917014a..c84624f1e 100644
--- a/youtube_dl/extractor/drtv.py
+++ b/youtube_dl/extractor/drtv.py
@@ -20,7 +20,7 @@ class DRTVIE(InfoExtractor):
IE_NAME = 'drtv'
_TESTS = [{
'url': 'https://www.dr.dk/tv/se/boern/ultra/klassen-ultra/klassen-darlig-taber-10',
- 'md5': '25e659cccc9a2ed956110a299fdf5983',
+ 'md5': '7ae17b4e18eb5d29212f424a7511c184',
'info_dict': {
'id': 'klassen-darlig-taber-10',
'ext': 'mp4',
@@ -30,21 +30,37 @@ class DRTVIE(InfoExtractor):
'upload_date': '20160823',
'duration': 606.84,
},
- 'params': {
- 'skip_download': True,
- },
}, {
+ # embed
'url': 'https://www.dr.dk/nyheder/indland/live-christianias-rydning-af-pusher-street-er-i-gang',
- 'md5': '2c37175c718155930f939ef59952474a',
'info_dict': {
'id': 'christiania-pusher-street-ryddes-drdkrjpo',
'ext': 'mp4',
'title': 'LIVE Christianias rydning af Pusher Street er i gang',
- 'description': '- Det er det fedeste, der er sket i 20 år, fortæller christianit til DR Nyheder.',
+ 'description': 'md5:2a71898b15057e9b97334f61d04e6eb5',
'timestamp': 1472800279,
'upload_date': '20160902',
'duration': 131.4,
},
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ # with SignLanguage formats
+ 'url': 'https://www.dr.dk/tv/se/historien-om-danmark/-/historien-om-danmark-stenalder',
+ 'info_dict': {
+ 'id': 'historien-om-danmark-stenalder',
+ 'ext': 'mp4',
+ 'title': 'Historien om Danmark: Stenalder (1)',
+ 'description': 'md5:8c66dcbc1669bbc6f873879880f37f2a',
+ 'timestamp': 1490401996,
+ 'upload_date': '20170325',
+ 'duration': 3502.04,
+ 'formats': 'mincount:20',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
}]
def _real_extract(self, url):
@@ -88,7 +104,7 @@ class DRTVIE(InfoExtractor):
elif kind in ('VideoResource', 'AudioResource'):
duration = float_or_none(asset.get('DurationInMilliseconds'), 1000)
restricted_to_denmark = asset.get('RestrictedToDenmark')
- spoken_subtitles = asset.get('Target') == 'SpokenSubtitles'
+ asset_target = asset.get('Target')
for link in asset.get('Links', []):
uri = link.get('Uri')
if not uri:
@@ -96,9 +112,9 @@ class DRTVIE(InfoExtractor):
target = link.get('Target')
format_id = target or ''
preference = None
- if spoken_subtitles:
+ if asset_target in ('SpokenSubtitles', 'SignLanguage'):
preference = -1
- format_id += '-spoken-subtitles'
+ format_id += '-%s' % asset_target
if target == 'HDS':
f4m_formats = self._extract_f4m_formats(
uri + '?hdcore=3.3.0&plugin=aasp-3.3.0.99.43',
diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index 355a4e56f..ed603eb29 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -41,6 +41,7 @@ from .alphaporno import AlphaPornoIE
from .amcnetworks import AMCNetworksIE
from .animeondemand import AnimeOnDemandIE
from .anitube import AnitubeIE
+from .anvato import AnvatoIE
from .anysex import AnySexIE
from .aol import AolIE
from .allocine import AllocineIE
@@ -250,7 +251,10 @@ from .democracynow import DemocracynowIE
from .dfb import DFBIE
from .dhm import DHMIE
from .dotsub import DotsubIE
-from .douyutv import DouyuTVIE
+from .douyutv import (
+ DouyuShowIE,
+ DouyuTVIE,
+)
from .dplay import (
DPlayIE,
DPlayItIE,
@@ -349,9 +353,9 @@ from .foxsports import FoxSportsIE
from .franceculture import FranceCultureIE
from .franceinter import FranceInterIE
from .francetv import (
- PluzzIE,
- FranceTvInfoIE,
FranceTVIE,
+ FranceTVEmbedIE,
+ FranceTVInfoIE,
GenerationQuoiIE,
CultureboxIE,
)
@@ -541,6 +545,7 @@ from .mangomolo import (
)
from .matchtv import MatchTVIE
from .mdr import MDRIE
+from .mediaset import MediasetIE
from .medici import MediciIE
from .meipai import MeipaiIE
from .melonvod import MelonVODIE
@@ -662,6 +667,8 @@ from .nintendo import NintendoIE
from .njpwworld import NJPWWorldIE
from .nobelprize import NobelPrizeIE
from .noco import NocoIE
+from .nonktube import NonkTubeIE
+from .noovo import NoovoIE
from .normalboots import NormalbootsIE
from .nosvideo import NosVideoIE
from .nova import NovaIE
@@ -730,8 +737,8 @@ from .openload import OpenloadIE
from .ora import OraTVIE
from .orf import (
ORFTVthekIE,
- ORFOE1IE,
ORFFM4IE,
+ ORFOE1IE,
ORFIPTVIE,
)
from .packtpub import (
@@ -1096,6 +1103,10 @@ from .uplynk import (
UplynkIE,
UplynkPreplayIE,
)
+from .upskill import (
+ UpskillIE,
+ UpskillCourseIE,
+)
from .urort import UrortIE
from .urplay import URPlayIE
from .usanetwork import USANetworkIE
@@ -1123,6 +1134,7 @@ from .vgtv import (
from .vh1 import VH1IE
from .vice import (
ViceIE,
+ ViceArticleIE,
ViceShowIE,
)
from .viceland import VicelandIE
@@ -1298,5 +1310,6 @@ from .youtube import (
YoutubeWatchLaterIE,
)
from .zapiks import ZapiksIE
+from .zaq1 import Zaq1IE
from .zdf import ZDFIE, ZDFChannelIE
from .zingmp3 import ZingMp3IE
diff --git a/youtube_dl/extractor/foxsports.py b/youtube_dl/extractor/foxsports.py
index a3bb98377..985542727 100644
--- a/youtube_dl/extractor/foxsports.py
+++ b/youtube_dl/extractor/foxsports.py
@@ -11,10 +11,10 @@ class FoxSportsIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?foxsports\.com/(?:[^/]+/)*(?P<id>[^/]+)'
_TEST = {
- 'url': 'http://www.foxsports.com/video?vid=432609859715',
+ 'url': 'http://www.foxsports.com/tennessee/video/432609859715',
'md5': 'b49050e955bebe32c301972e4012ac17',
'info_dict': {
- 'id': 'i0qKWsk3qJaM',
+ 'id': 'bwduI3X_TgUB',
'ext': 'mp4',
'title': 'Courtney Lee on going up 2-0 in series vs. Blazers',
'description': 'Courtney Lee talks about Memphis being focused.',
@@ -31,8 +31,9 @@ class FoxSportsIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
config = self._parse_json(
- self._search_regex(
- r"data-player-config='([^']+)'", webpage, 'data player config'),
+ self._html_search_regex(
+ r"""class="[^"]*(?:fs-player|platformPlayer-wrapper)[^"]*".+?data-player-config='([^']+)'""",
+ webpage, 'data player config'),
video_id)
return self.url_result(smuggle_url(update_url_query(
diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py
index 48d43ae58..546d5caa0 100644
--- a/youtube_dl/extractor/francetv.py
+++ b/youtube_dl/extractor/francetv.py
@@ -21,11 +21,13 @@ from .dailymotion import (
class FranceTVBaseInfoExtractor(InfoExtractor):
- def _extract_video(self, video_id, catalogue):
+ def _extract_video(self, video_id, catalogue=None):
info = self._download_json(
- 'http://webservices.francetelevisions.fr/tools/getInfosOeuvre/v2/?idDiffusion=%s&catalogue=%s'
- % (video_id, catalogue),
- video_id, 'Downloading video JSON')
+ 'https://sivideo.webservices.francetelevisions.fr/tools/getInfosOeuvre/v2/',
+ video_id, 'Downloading video JSON', query={
+ 'idDiffusion': video_id,
+ 'catalogue': catalogue or '',
+ })
if info.get('status') == 'NOK':
raise ExtractorError(
@@ -109,27 +111,97 @@ class FranceTVBaseInfoExtractor(InfoExtractor):
}
-class PluzzIE(FranceTVBaseInfoExtractor):
- IE_NAME = 'pluzz.francetv.fr'
- _VALID_URL = r'https?://(?:m\.)?pluzz\.francetv\.fr/videos/(?P<id>.+?)\.html'
+class FranceTVIE(FranceTVBaseInfoExtractor):
+ _VALID_URL = r'https?://(?:(?:www\.)?france\.tv|mobile\.france\.tv)/(?:[^/]+/)+(?P<id>[^/]+)\.html'
- # Can't use tests, videos expire in 7 days
+ _TESTS = [{
+ 'url': 'https://www.france.tv/france-2/13h15-le-dimanche/140921-les-mysteres-de-jesus.html',
+ 'info_dict': {
+ 'id': '157550144',
+ 'ext': 'mp4',
+ 'title': '13h15, le dimanche... - Les mystères de Jésus',
+ 'description': 'md5:75efe8d4c0a8205e5904498ffe1e1a42',
+ 'timestamp': 1494156300,
+ 'upload_date': '20170507',
+ },
+ 'params': {
+ # m3u8 downloads
+ 'skip_download': True,
+ },
+ }, {
+ # france3
+ 'url': 'https://www.france.tv/france-3/des-chiffres-et-des-lettres/139063-emission-du-mardi-9-mai-2017.html',
+ 'only_matching': True,
+ }, {
+ # france4
+ 'url': 'https://www.france.tv/france-4/hero-corp/saison-1/134151-apres-le-calme.html',
+ 'only_matching': True,
+ }, {
+ # france5
+ 'url': 'https://www.france.tv/france-5/c-a-dire/saison-10/137013-c-a-dire.html',
+ 'only_matching': True,
+ }, {
+ # franceo
+ 'url': 'https://www.france.tv/france-o/archipels/132249-mon-ancetre-l-esclave.html',
+ 'only_matching': True,
+ }, {
+ # france2 live
+ 'url': 'https://www.france.tv/france-2/direct.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.france.tv/documentaires/histoire/136517-argentine-les-500-bebes-voles-de-la-dictature.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.france.tv/jeux-et-divertissements/divertissements/133965-le-web-contre-attaque.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://mobile.france.tv/france-5/c-dans-l-air/137347-emission-du-vendredi-12-mai-2017.html',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
- video_id = self._html_search_meta(
- 'id_video', webpage, 'video id', default=None)
+ catalogue = None
+ video_id = self._search_regex(
+ r'data-main-video=(["\'])(?P<id>(?:(?!\1).)+)\1',
+ webpage, 'video id', default=None, group='id')
+
if not video_id:
- video_id = self._search_regex(
- r'data-diffusion=["\'](\d+)', webpage, 'video id')
+ video_id, catalogue = self._html_search_regex(
+ r'(?:href=|player\.setVideo\(\s*)"http://videos?\.francetv\.fr/video/([^@]+@[^"]+)"',
+ webpage, 'video ID').split('@')
+ return self._extract_video(video_id, catalogue)
- return self._extract_video(video_id, 'Pluzz')
+class FranceTVEmbedIE(FranceTVBaseInfoExtractor):
+ _VALID_URL = r'https?://embed\.francetv\.fr/*\?.*?\bue=(?P<id>[^&]+)'
-class FranceTvInfoIE(FranceTVBaseInfoExtractor):
+ _TEST = {
+ 'url': 'http://embed.francetv.fr/?ue=7fd581a2ccf59d2fc5719c5c13cf6961',
+ 'info_dict': {
+ 'id': 'NI_983319',
+ 'ext': 'mp4',
+ 'title': 'Le Pen Reims',
+ 'upload_date': '20170505',
+ 'timestamp': 1493981780,
+ 'duration': 16,
+ },
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ video = self._download_json(
+ 'http://api-embed.webservices.francetelevisions.fr/key/%s' % video_id,
+ video_id)
+
+ return self._extract_video(video['video_id'], video.get('catalog'))
+
+
+class FranceTVInfoIE(FranceTVBaseInfoExtractor):
IE_NAME = 'francetvinfo.fr'
_VALID_URL = r'https?://(?:www|mobile|france3-regions)\.francetvinfo\.fr/(?:[^/]+/)*(?P<title>[^/?#&.]+)'
@@ -233,124 +305,6 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor):
return self._extract_video(video_id, catalogue)
-class FranceTVIE(FranceTVBaseInfoExtractor):
- IE_NAME = 'francetv'
- IE_DESC = 'France 2, 3, 4, 5 and Ô'
- _VALID_URL = r'''(?x)
- https?://
- (?:
- (?:www\.)?france[2345o]\.fr/
- (?:
- emissions/[^/]+/(?:videos|diffusions)|
- emission/[^/]+|
- videos|
- jt
- )
- /|
- embed\.francetv\.fr/\?ue=
- )
- (?P<id>[^/?]+)
- '''
-
- _TESTS = [
- # france2
- {
- 'url': 'http://www.france2.fr/emissions/13h15-le-samedi-le-dimanche/videos/75540104',
- 'md5': 'c03fc87cb85429ffd55df32b9fc05523',
- 'info_dict': {
- 'id': '109169362',
- 'ext': 'flv',
- 'title': '13h15, le dimanche...',
- 'description': 'md5:9a0932bb465f22d377a449be9d1a0ff7',
- 'upload_date': '20140914',
- 'timestamp': 1410693600,
- },
- },
- # france3
- {
- 'url': 'http://www.france3.fr/emissions/pieces-a-conviction/diffusions/13-11-2013_145575',
- 'md5': '679bb8f8921f8623bd658fa2f8364da0',
- 'info_dict': {
- 'id': '000702326_CAPP_PicesconvictionExtrait313022013_120220131722_Au',
- 'ext': 'mp4',
- 'title': 'Le scandale du prix des médicaments',
- 'description': 'md5:1384089fbee2f04fc6c9de025ee2e9ce',
- 'upload_date': '20131113',
- 'timestamp': 1384380000,
- },
- },
- # france4
- {
- 'url': 'http://www.france4.fr/emissions/hero-corp/videos/rhozet_herocorp_bonus_1_20131106_1923_06112013172108_F4',
- 'md5': 'a182bf8d2c43d88d46ec48fbdd260c1c',
- 'info_dict': {
- 'id': 'rhozet_herocorp_bonus_1_20131106_1923_06112013172108_F4',
- 'ext': 'mp4',
- 'title': 'Hero Corp Making of - Extrait 1',
- 'description': 'md5:c87d54871b1790679aec1197e73d650a',
- 'upload_date': '20131106',
- 'timestamp': 1383766500,
- },
- },
- # france5
- {
- 'url': 'http://www.france5.fr/emissions/c-a-dire/videos/quels_sont_les_enjeux_de_cette_rentree_politique__31-08-2015_908948?onglet=tous&page=1',
- 'md5': 'f6c577df3806e26471b3d21631241fd0',
- 'info_dict': {
- 'id': '123327454',
- 'ext': 'flv',
- 'title': 'C à dire ?! - Quels sont les enjeux de cette rentrée politique ?',
- 'description': 'md5:4a0d5cb5dce89d353522a84462bae5a4',
- 'upload_date': '20150831',
- 'timestamp': 1441035120,
- },
- },
- # franceo
- {
- 'url': 'http://www.franceo.fr/jt/info-soir/18-07-2015',
- 'md5': '47d5816d3b24351cdce512ad7ab31da8',
- 'info_dict': {
- 'id': '125377621',
- 'ext': 'flv',
- 'title': 'Infô soir',
- 'description': 'md5:01b8c6915a3d93d8bbbd692651714309',
- 'upload_date': '20150718',
- 'timestamp': 1437241200,
- 'duration': 414,
- },
- },
- {
- # francetv embed
- 'url': 'http://embed.francetv.fr/?ue=8d7d3da1e3047c42ade5a5d7dfd3fc87',
- 'info_dict': {
- 'id': 'EV_30231',
- 'ext': 'flv',
- 'title': 'Alcaline, le concert avec Calogero',
- 'description': 'md5:61f08036dcc8f47e9cfc33aed08ffaff',
- 'upload_date': '20150226',
- 'timestamp': 1424989860,
- 'duration': 5400,
- },
- },
- {
- 'url': 'http://www.france4.fr/emission/highlander/diffusion-du-17-07-2015-04h05',
- 'only_matching': True,
- },
- {
- 'url': 'http://www.franceo.fr/videos/125377617',
- 'only_matching': True,
- }
- ]
-
- def _real_extract(self, url):
- video_id = self._match_id(url)
- webpage = self._download_webpage(url, video_id)
- video_id, catalogue = self._html_search_regex(
- r'(?:href=|player\.setVideo\(\s*)"http://videos?\.francetv\.fr/video/([^@]+@[^"]+)"',
- webpage, 'video ID').split('@')
- return self._extract_video(video_id, catalogue)
-
-
class GenerationQuoiIE(InfoExtractor):
IE_NAME = 'france2.fr:generation-quoi'
_VALID_URL = r'https?://generation-quoi\.france2\.fr/portrait/(?P<id>[^/?#]+)'
diff --git a/youtube_dl/extractor/funimation.py b/youtube_dl/extractor/funimation.py
index e44a2a87f..8c37509ec 100644
--- a/youtube_dl/extractor/funimation.py
+++ b/youtube_dl/extractor/funimation.py
@@ -2,15 +2,11 @@
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..compat import (
- compat_HTTPError,
- compat_urllib_parse_unquote_plus,
-)
+from ..compat import compat_HTTPError
from ..utils import (
determine_ext,
int_or_none,
js_to_json,
- sanitized_Request,
ExtractorError,
urlencode_postdata
)
@@ -20,6 +16,7 @@ class FunimationIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?funimation(?:\.com|now\.uk)/shows/[^/]+/(?P<id>[^/?#&]+)'
_NETRC_MACHINE = 'funimation'
+ _TOKEN = None
_TESTS = [{
'url': 'https://www.funimation.com/shows/hacksign/role-play/',
@@ -38,56 +35,38 @@ class FunimationIE(InfoExtractor):
}, {
'url': 'https://www.funimation.com/shows/attack-on-titan-junior-high/broadcast-dub-preview/',
'info_dict': {
- 'id': '9635',
+ 'id': '210051',
'display_id': 'broadcast-dub-preview',
'ext': 'mp4',
'title': 'Attack on Titan: Junior High - Broadcast Dub Preview',
- 'description': 'md5:f8ec49c0aff702a7832cd81b8a44f803',
'thumbnail': r're:https?://.*\.(?:jpg|png)',
},
- 'skip': 'Access without user interaction is forbidden by CloudFlare',
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
}, {
'url': 'https://www.funimationnow.uk/shows/puzzle-dragons-x/drop-impact/simulcast/',
'only_matching': True,
}]
- _LOGIN_URL = 'http://www.funimation.com/login'
-
- def _extract_cloudflare_session_ua(self, url):
- ci_session_cookie = self._get_cookies(url).get('ci_session')
- if ci_session_cookie:
- ci_session = compat_urllib_parse_unquote_plus(ci_session_cookie.value)
- # ci_session is a string serialized by PHP function serialize()
- # This case is simple enough to use regular expressions only
- return self._search_regex(
- r'"user_agent";s:\d+:"([^"]+)"', ci_session, 'user agent',
- default=None)
-
def _login(self):
(username, password) = self._get_login_info()
if username is None:
return
- data = urlencode_postdata({
- 'email_field': username,
- 'password_field': password,
- })
- user_agent = self._extract_cloudflare_session_ua(self._LOGIN_URL)
- if not user_agent:
- user_agent = 'Mozilla/5.0 (Windows NT 5.2; WOW64; rv:42.0) Gecko/20100101 Firefox/42.0'
- login_request = sanitized_Request(self._LOGIN_URL, data, headers={
- 'User-Agent': user_agent,
- 'Content-Type': 'application/x-www-form-urlencoded'
- })
- login_page = self._download_webpage(
- login_request, None, 'Logging in as %s' % username)
- if any(p in login_page for p in ('funimation.com/logout', '>Log Out<')):
- return
- error = self._html_search_regex(
- r'(?s)<div[^>]+id=["\']errorMessages["\'][^>]*>(.+?)</div>',
- login_page, 'error messages', default=None)
- if error:
- raise ExtractorError('Unable to login: %s' % error, expected=True)
- raise ExtractorError('Unable to log in')
+ try:
+ data = self._download_json(
+ 'https://prod-api-funimationnow.dadcdigital.com/api/auth/login/',
+ None, 'Logging in as %s' % username, data=urlencode_postdata({
+ 'username': username,
+ 'password': password,
+ }))
+ self._TOKEN = data['token']
+ except ExtractorError as e:
+ if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
+ error = self._parse_json(e.cause.read().decode(), None)['error']
+ raise ExtractorError(error, expected=True)
+ raise
def _real_initialize(self):
self._login()
@@ -125,9 +104,12 @@ class FunimationIE(InfoExtractor):
description = self._html_search_meta(['description', 'og:description'], webpage, fatal=True)
try:
+ headers = {}
+ if self._TOKEN:
+ headers['Authorization'] = 'Token %s' % self._TOKEN
sources = self._download_json(
'https://prod-api-funimationnow.dadcdigital.com/api/source/catalog/video/%s/signed/' % video_id,
- video_id)['items']
+ video_id, headers=headers)['items']
except ExtractorError as e:
if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
error = self._parse_json(e.cause.read(), video_id)['errors'][0]
diff --git a/youtube_dl/extractor/funnyordie.py b/youtube_dl/extractor/funnyordie.py
index 81c0ce9a3..49409369c 100644
--- a/youtube_dl/extractor/funnyordie.py
+++ b/youtube_dl/extractor/funnyordie.py
@@ -58,8 +58,7 @@ class FunnyOrDieIE(InfoExtractor):
m3u8_url, video_id, 'mp4', 'm3u8_native',
m3u8_id='hls', fatal=False)
source_formats = list(filter(
- lambda f: f.get('vcodec') != 'none' and f.get('resolution') != 'multiple',
- m3u8_formats))
+ lambda f: f.get('vcodec') != 'none', m3u8_formats))
bitrates = [int(bitrate) for bitrate in re.findall(r'[,/]v(\d+)(?=[,/])', m3u8_url)]
bitrates.sort()
diff --git a/youtube_dl/extractor/gamespot.py b/youtube_dl/extractor/gamespot.py
index 682c49e79..00d311158 100644
--- a/youtube_dl/extractor/gamespot.py
+++ b/youtube_dl/extractor/gamespot.py
@@ -78,8 +78,7 @@ class GameSpotIE(OnceIE):
if m3u8_formats:
self._sort_formats(m3u8_formats)
m3u8_formats = list(filter(
- lambda f: f.get('vcodec') != 'none' and f.get('resolution') != 'multiple',
- m3u8_formats))
+ lambda f: f.get('vcodec') != 'none', m3u8_formats))
if len(qualities) == len(m3u8_formats):
for q, m3u8_format in zip(qualities, m3u8_formats):
f = m3u8_format.copy()
diff --git a/youtube_dl/extractor/gdcvault.py b/youtube_dl/extractor/gdcvault.py
index 3136427db..f71d9092e 100644
--- a/youtube_dl/extractor/gdcvault.py
+++ b/youtube_dl/extractor/gdcvault.py
@@ -75,6 +75,19 @@ class GDCVaultIE(InfoExtractor):
'format': 'jp', # The japanese audio
}
},
+ {
+ # gdc-player.html
+ 'url': 'http://www.gdcvault.com/play/1435/An-American-engine-in-Tokyo',
+ 'info_dict': {
+ 'id': '1435',
+ 'display_id': 'An-American-engine-in-Tokyo',
+ 'ext': 'flv',
+ 'title': 'An American Engine in Tokyo:/nThe collaboration of Epic Games and Square Enix/nFor THE LAST REMINANT',
+ },
+ 'params': {
+ 'skip_download': True, # Requires rtmpdump
+ },
+ },
]
def _login(self, webpage_url, display_id):
@@ -128,7 +141,7 @@ class GDCVaultIE(InfoExtractor):
'title': title,
}
- PLAYER_REGEX = r'<iframe src="(?P<xml_root>.+?)/player.*?\.html.*?".*?</iframe>'
+ PLAYER_REGEX = r'<iframe src="(?P<xml_root>.+?)/(?:gdc-)?player.*?\.html.*?".*?</iframe>'
xml_root = self._html_search_regex(
PLAYER_REGEX, start_page, 'xml root', default=None)
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index 67184bc5d..c108d4a8a 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -86,6 +86,10 @@ from .openload import OpenloadIE
from .videopress import VideoPressIE
from .rutube import RutubeIE
from .limelight import LimelightBaseIE
+from .anvato import AnvatoIE
+from .washingtonpost import WashingtonPostIE
+from .wistia import WistiaIE
+from .mediaset import MediasetIE
class GenericIE(InfoExtractor):
@@ -1427,6 +1431,22 @@ class GenericIE(InfoExtractor):
'skip_download': True,
},
},
+ {
+ # Brightcove embed with whitespace around attribute names
+ 'url': 'http://www.stack.com/video/3167554373001/learn-to-hit-open-three-pointers-with-damian-lillard-s-baseline-drift-drill',
+ 'info_dict': {
+ 'id': '3167554373001',
+ 'ext': 'mp4',
+ 'title': "Learn to Hit Open Three-Pointers With Damian Lillard's Baseline Drift Drill",
+ 'description': 'md5:57bacb0e0f29349de4972bfda3191713',
+ 'uploader_id': '1079349493',
+ 'upload_date': '20140207',
+ 'timestamp': 1391810548,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ },
# Another form of arte.tv embed
{
'url': 'http://www.tv-replay.fr/redirection/09-04-16/arte-reportage-arte-11508975.html',
@@ -1677,6 +1697,42 @@ class GenericIE(InfoExtractor):
},
'playlist_mincount': 5,
},
+ {
+ 'url': 'http://kron4.com/2017/04/28/standoff-with-walnut-creek-murder-suspect-ends-with-arrest/',
+ 'info_dict': {
+ 'id': 'standoff-with-walnut-creek-murder-suspect-ends-with-arrest',
+ 'title': 'Standoff with Walnut Creek murder suspect ends',
+ 'description': 'md5:3ccc48a60fc9441eeccfc9c469ebf788',
+ },
+ 'playlist_mincount': 4,
+ },
+ {
+ # WashingtonPost embed
+ 'url': 'http://www.vanityfair.com/hollywood/2017/04/donald-trump-tv-pitches',
+ 'info_dict': {
+ 'id': '8caf6e88-d0ec-11e5-90d3-34c2c42653ac',
+ 'ext': 'mp4',
+ 'title': "No one has seen the drama series based on Trump's life \u2014 until now",
+ 'description': 'Donald Trump wanted a weekly TV drama based on his life. It never aired. But The Washington Post recently obtained a scene from the pilot script — and enlisted actors.',
+ 'timestamp': 1455216756,
+ 'uploader': 'The Washington Post',
+ 'upload_date': '20160211',
+ },
+ 'add_ie': [WashingtonPostIE.ie_key()],
+ },
+ {
+ # Mediaset embed
+ 'url': 'http://www.tgcom24.mediaset.it/politica/serracchiani-voglio-vivere-in-una-societa-aperta-reazioni-sproporzionate-_3071354-201702a.shtml',
+ 'info_dict': {
+ 'id': '720642',
+ 'ext': 'mp4',
+ 'title': 'Serracchiani: "Voglio vivere in una società aperta, con tutela del patto di fiducia"',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ 'add_ie': [MediasetIE.ie_key()],
+ },
# {
# # TODO: find another test
# # http://schema.org/VideoObject
@@ -2070,57 +2126,20 @@ class GenericIE(InfoExtractor):
playlists, video_id, video_title, lambda p: '//dailymotion.com/playlist/%s' % p)
# Look for embedded Wistia player
- match = re.search(
- r'<(?:meta[^>]+?content|iframe[^>]+?src)=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage)
- if match:
- embed_url = self._proto_relative_url(
- unescapeHTML(match.group('url')))
- return {
- '_type': 'url_transparent',
- 'url': embed_url,
- 'ie_key': 'Wistia',
- 'uploader': video_uploader,
- }
-
- match = re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage)
- if match:
+ wistia_url = WistiaIE._extract_url(webpage)
+ if wistia_url:
return {
'_type': 'url_transparent',
- 'url': 'wistia:%s' % match.group('id'),
- 'ie_key': 'Wistia',
+ 'url': self._proto_relative_url(wistia_url),
+ 'ie_key': WistiaIE.ie_key(),
'uploader': video_uploader,
}
- match = re.search(
- r'''(?sx)
- <script[^>]+src=(["'])(?:https?:)?//fast\.wistia\.com/assets/external/E-v1\.js\1[^>]*>.*?
- <div[^>]+class=(["']).*?\bwistia_async_(?P<id>[a-z0-9]+)\b.*?\2
- ''', webpage)
- if match:
- return self.url_result(self._proto_relative_url(
- 'wistia:%s' % match.group('id')), 'Wistia')
-
# Look for SVT player
svt_url = SVTIE._extract_url(webpage)
if svt_url:
return self.url_result(svt_url, 'SVT')
- # Look for embedded condenast player
- matches = re.findall(
- r'<iframe\s+(?:[a-zA-Z-]+="[^"]+"\s+)*?src="(https?://player\.cnevids\.com/embed/[^"]+")',
- webpage)
- if matches:
- return {
- '_type': 'playlist',
- 'entries': [{
- '_type': 'url',
- 'ie_key': 'CondeNast',
- 'url': ma,
- } for ma in matches],
- 'title': video_title,
- 'id': video_id,
- }
-
# Look for Bandcamp pages with custom domain
mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage)
if mobj is not None:
@@ -2514,28 +2533,11 @@ class GenericIE(InfoExtractor):
return self.playlist_result(
limelight_urls, video_id, video_title, video_description)
- mobj = re.search(r'LimelightPlayer\.doLoad(Media|Channel|ChannelList)\(["\'](?P<id>[a-z0-9]{32})', webpage)
- if mobj:
- lm = {
- 'Media': 'media',
- 'Channel': 'channel',
- 'ChannelList': 'channel_list',
- }
- return self.url_result(smuggle_url('limelight:%s:%s' % (
- lm[mobj.group(1)], mobj.group(2)), {'source_url': url}),
- 'Limelight%s' % mobj.group(1), mobj.group(2))
-
- mobj = re.search(
- r'''(?sx)
- <object[^>]+class=(["\'])LimelightEmbeddedPlayerFlash\1[^>]*>.*?
- <param[^>]+
- name=(["\'])flashVars\2[^>]+
- value=(["\'])(?:(?!\3).)*mediaId=(?P<id>[a-z0-9]{32})
- ''', webpage)
- if mobj:
- return self.url_result(smuggle_url(
- 'limelight:media:%s' % mobj.group('id'),
- {'source_url': url}), 'LimelightMedia', mobj.group('id'))
+ # Look for Anvato embeds
+ anvato_urls = AnvatoIE._extract_urls(self, webpage, video_id)
+ if anvato_urls:
+ return self.playlist_result(
+ anvato_urls, video_id, video_title, video_description)
# Look for AdobeTVVideo embeds
mobj = re.search(
@@ -2654,6 +2656,18 @@ class GenericIE(InfoExtractor):
return self.playlist_from_matches(
rutube_urls, ie=RutubeIE.ie_key())
+ # Look for WashingtonPost embeds
+ wapo_urls = WashingtonPostIE._extract_urls(webpage)
+ if wapo_urls:
+ return self.playlist_from_matches(
+ wapo_urls, video_id, video_title, ie=WashingtonPostIE.ie_key())
+
+ # Look for Mediaset embeds
+ mediaset_urls = MediasetIE._extract_urls(webpage)
+ if mediaset_urls:
+ return self.playlist_from_matches(
+ mediaset_urls, video_id, video_title, ie=MediasetIE.ie_key())
+
# Looking for http://schema.org/VideoObject
json_ld = self._search_json_ld(
webpage, video_id, default={}, expected_type='VideoObject')
diff --git a/youtube_dl/extractor/go.py b/youtube_dl/extractor/go.py
index 4c9be47b4..9c7b1bd37 100644
--- a/youtube_dl/extractor/go.py
+++ b/youtube_dl/extractor/go.py
@@ -36,22 +36,26 @@ class GoIE(AdobePassIE):
'requestor_id': 'DisneyXD',
}
}
- _VALID_URL = r'https?://(?:(?P<sub_domain>%s)\.)?go\.com/(?:[^/]+/)*(?:vdka(?P<id>\w+)|(?:[^/]+/)*(?P<display_id>[^/?#]+))' % '|'.join(_SITE_INFO.keys())
+ _VALID_URL = r'https?://(?:(?P<sub_domain>%s)\.)?go\.com/(?:(?:[^/]+/)*(?P<id>vdka\w+)|(?:[^/]+/)*(?P<display_id>[^/?#]+))' % '|'.join(_SITE_INFO.keys())
_TESTS = [{
- 'url': 'http://abc.go.com/shows/castle/video/most-recent/vdka0_g86w5onx',
+ 'url': 'http://abc.go.com/shows/designated-survivor/video/most-recent/VDKA3807643',
'info_dict': {
- 'id': '0_g86w5onx',
+ 'id': 'VDKA3807643',
'ext': 'mp4',
- 'title': 'Sneak Peek: Language Arts',
- 'description': 'md5:7dcdab3b2d17e5217c953256af964e9c',
+ 'title': 'The Traitor in the White House',
+ 'description': 'md5:05b009d2d145a1e85d25111bd37222e8',
},
'params': {
# m3u8 download
'skip_download': True,
},
}, {
- 'url': 'http://abc.go.com/shows/after-paradise/video/most-recent/vdka3335601',
- 'only_matching': True,
+ 'url': 'http://watchdisneyxd.go.com/doraemon',
+ 'info_dict': {
+ 'title': 'Doraemon',
+ 'id': 'SH55574025',
+ },
+ 'playlist_mincount': 51,
}, {
'url': 'http://abc.go.com/shows/the-catch/episode-guide/season-01/10-the-wedding',
'only_matching': True,
@@ -60,19 +64,36 @@ class GoIE(AdobePassIE):
'only_matching': True,
}]
+ def _extract_videos(self, brand, video_id='-1', show_id='-1'):
+ display_id = video_id if video_id != '-1' else show_id
+ return self._download_json(
+ 'http://api.contents.watchabc.go.com/vp2/ws/contents/3000/videos/%s/001/-1/%s/-1/%s/-1/-1.json' % (brand, show_id, video_id),
+ display_id)['video']
+
def _real_extract(self, url):
sub_domain, video_id, display_id = re.match(self._VALID_URL, url).groups()
+ site_info = self._SITE_INFO[sub_domain]
+ brand = site_info['brand']
if not video_id:
webpage = self._download_webpage(url, display_id)
video_id = self._search_regex(
# There may be inner quotes, e.g. data-video-id="'VDKA3609139'"
# from http://freeform.go.com/shows/shadowhunters/episodes/season-2/1-this-guilty-blood
- r'data-video-id=["\']*VDKA(\w+)', webpage, 'video id')
- site_info = self._SITE_INFO[sub_domain]
- brand = site_info['brand']
- video_data = self._download_json(
- 'http://api.contents.watchabc.go.com/vp2/ws/contents/3000/videos/%s/001/-1/-1/-1/%s/-1/-1.json' % (brand, video_id),
- video_id)['video'][0]
+ r'data-video-id=["\']*(VDKA\w+)', webpage, 'video id', default=None)
+ if not video_id:
+ # show extraction works for Disney, DisneyJunior and DisneyXD
+ # ABC and Freeform has different layout
+ show_id = self._search_regex(r'data-show-id=["\']*(SH\d+)', webpage, 'show id')
+ videos = self._extract_videos(brand, show_id=show_id)
+ show_title = self._search_regex(r'data-show-title="([^"]+)"', webpage, 'show title', fatal=False)
+ entries = []
+ for video in videos:
+ entries.append(self.url_result(
+ video['url'], 'Go', video.get('id'), video.get('title')))
+ entries.reverse()
+ return self.playlist_result(entries, show_id, show_title)
+ video_data = self._extract_videos(brand, video_id)[0]
+ video_id = video_data['id']
title = video_data['title']
formats = []
@@ -105,7 +126,7 @@ class GoIE(AdobePassIE):
self._initialize_geo_bypass(['US'])
entitlement = self._download_json(
'https://api.entitlement.watchabc.go.com/vp2/ws-secure/entitlement/2020/authorize.json',
- video_id, data=urlencode_postdata(data), headers=self.geo_verification_headers())
+ video_id, data=urlencode_postdata(data))
errors = entitlement.get('errors', {}).get('errors', [])
if errors:
for error in errors:
diff --git a/youtube_dl/extractor/hitbox.py b/youtube_dl/extractor/hitbox.py
index e21ebb8fb..1d905dc81 100644
--- a/youtube_dl/extractor/hitbox.py
+++ b/youtube_dl/extractor/hitbox.py
@@ -16,8 +16,8 @@ from ..utils import (
class HitboxIE(InfoExtractor):
IE_NAME = 'hitbox'
- _VALID_URL = r'https?://(?:www\.)?hitbox\.tv/video/(?P<id>[0-9]+)'
- _TEST = {
+ _VALID_URL = r'https?://(?:www\.)?(?:hitbox|smashcast)\.tv/(?:[^/]+/)*videos?/(?P<id>[0-9]+)'
+ _TESTS = [{
'url': 'http://www.hitbox.tv/video/203213',
'info_dict': {
'id': '203213',
@@ -38,13 +38,15 @@ class HitboxIE(InfoExtractor):
# m3u8 download
'skip_download': True,
},
- }
+ }, {
+ 'url': 'https://www.smashcast.tv/hitboxlive/videos/203213',
+ 'only_matching': True,
+ }]
def _extract_metadata(self, url, video_id):
thumb_base = 'https://edge.sf.hitbox.tv'
metadata = self._download_json(
- '%s/%s' % (url, video_id), video_id,
- 'Downloading metadata JSON')
+ '%s/%s' % (url, video_id), video_id, 'Downloading metadata JSON')
date = 'media_live_since'
media_type = 'livestream'
@@ -63,14 +65,15 @@ class HitboxIE(InfoExtractor):
views = int_or_none(video_meta.get('media_views'))
timestamp = parse_iso8601(video_meta.get(date), ' ')
categories = [video_meta.get('category_name')]
- thumbs = [
- {'url': thumb_base + video_meta.get('media_thumbnail'),
- 'width': 320,
- 'height': 180},
- {'url': thumb_base + video_meta.get('media_thumbnail_large'),
- 'width': 768,
- 'height': 432},
- ]
+ thumbs = [{
+ 'url': thumb_base + video_meta.get('media_thumbnail'),
+ 'width': 320,
+ 'height': 180
+ }, {
+ 'url': thumb_base + video_meta.get('media_thumbnail_large'),
+ 'width': 768,
+ 'height': 432
+ }]
return {
'id': video_id,
@@ -90,7 +93,7 @@ class HitboxIE(InfoExtractor):
video_id = self._match_id(url)
player_config = self._download_json(
- 'https://www.hitbox.tv/api/player/config/video/%s' % video_id,
+ 'https://www.smashcast.tv/api/player/config/video/%s' % video_id,
video_id, 'Downloading video JSON')
formats = []
@@ -121,8 +124,7 @@ class HitboxIE(InfoExtractor):
self._sort_formats(formats)
metadata = self._extract_metadata(
- 'https://www.hitbox.tv/api/media/video',
- video_id)
+ 'https://www.smashcast.tv/api/media/video', video_id)
metadata['formats'] = formats
return metadata
@@ -130,8 +132,8 @@ class HitboxIE(InfoExtractor):
class HitboxLiveIE(HitboxIE):
IE_NAME = 'hitbox:live'
- _VALID_URL = r'https?://(?:www\.)?hitbox\.tv/(?!video)(?P<id>.+)'
- _TEST = {
+ _VALID_URL = r'https?://(?:www\.)?(?:hitbox|smashcast)\.tv/(?P<id>[^/?#&]+)'
+ _TESTS = [{
'url': 'http://www.hitbox.tv/dimak',
'info_dict': {
'id': 'dimak',
@@ -146,13 +148,20 @@ class HitboxLiveIE(HitboxIE):
# live
'skip_download': True,
},
- }
+ }, {
+ 'url': 'https://www.smashcast.tv/dimak',
+ 'only_matching': True,
+ }]
+
+ @classmethod
+ def suitable(cls, url):
+ return False if HitboxIE.suitable(url) else super(HitboxLiveIE, cls).suitable(url)
def _real_extract(self, url):
video_id = self._match_id(url)
player_config = self._download_json(
- 'https://www.hitbox.tv/api/player/config/live/%s' % video_id,
+ 'https://www.smashcast.tv/api/player/config/live/%s' % video_id,
video_id)
formats = []
@@ -197,8 +206,7 @@ class HitboxLiveIE(HitboxIE):
self._sort_formats(formats)
metadata = self._extract_metadata(
- 'https://www.hitbox.tv/api/media/live',
- video_id)
+ 'https://www.smashcast.tv/api/media/live', video_id)
metadata['formats'] = formats
metadata['is_live'] = True
metadata['title'] = self._live_title(metadata.get('title'))
diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py
index f95c00c73..3ff672a89 100644
--- a/youtube_dl/extractor/imdb.py
+++ b/youtube_dl/extractor/imdb.py
@@ -13,7 +13,7 @@ from ..utils import (
class ImdbIE(InfoExtractor):
IE_NAME = 'imdb'
IE_DESC = 'Internet Movie Database trailers'
- _VALID_URL = r'https?://(?:www|m)\.imdb\.com/(?:video/[^/]+/|title/tt\d+.*?#lb-|videoplayer/)vi(?P<id>\d+)'
+ _VALID_URL = r'https?://(?:www|m)\.imdb\.com/(?:video|title).+?[/-]vi(?P<id>\d+)'
_TESTS = [{
'url': 'http://www.imdb.com/video/imdb/vi2524815897',
@@ -35,6 +35,9 @@ class ImdbIE(InfoExtractor):
}, {
'url': 'http://www.imdb.com/videoplayer/vi1562949145',
'only_matching': True,
+ }, {
+ 'url': 'http://www.imdb.com/title/tt4218696/videoplayer/vi2608641561',
+ 'only_matching': True,
}]
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/infoq.py b/youtube_dl/extractor/infoq.py
index 9fb71e8ef..fe425e786 100644
--- a/youtube_dl/extractor/infoq.py
+++ b/youtube_dl/extractor/infoq.py
@@ -87,8 +87,8 @@ class InfoQIE(BokeCCBaseIE):
def _extract_http_audio(self, webpage, video_id):
fields = self._hidden_inputs(webpage)
- http_audio_url = fields['filename']
- if http_audio_url is None:
+ http_audio_url = fields.get('filename')
+ if not http_audio_url:
return []
cookies_header = {'Cookie': self._extract_cookies(webpage)}
diff --git a/youtube_dl/extractor/laola1tv.py b/youtube_dl/extractor/laola1tv.py
index 3190b187c..1f91ba017 100644
--- a/youtube_dl/extractor/laola1tv.py
+++ b/youtube_dl/extractor/laola1tv.py
@@ -1,6 +1,8 @@
# coding: utf-8
from __future__ import unicode_literals
+import json
+
from .common import InfoExtractor
from ..utils import (
ExtractorError,
@@ -8,15 +10,15 @@ from ..utils import (
urlencode_postdata,
xpath_element,
xpath_text,
- urljoin,
update_url_query,
+ js_to_json,
)
class Laola1TvEmbedIE(InfoExtractor):
IE_NAME = 'laola1tv:embed'
_VALID_URL = r'https?://(?:www\.)?laola1\.tv/titanplayer\.php\?.*?\bvideoid=(?P<id>\d+)'
- _TEST = {
+ _TESTS = [{
# flashvars.premium = "false";
'url': 'https://www.laola1.tv/titanplayer.php?videoid=708065&type=V&lang=en&portal=int&customer=1024',
'info_dict': {
@@ -26,7 +28,30 @@ class Laola1TvEmbedIE(InfoExtractor):
'uploader': 'ITTF - International Table Tennis Federation',
'upload_date': '20161211',
},
- }
+ }]
+
+ def _extract_token_url(self, stream_access_url, video_id, data):
+ return self._download_json(
+ stream_access_url, video_id, headers={
+ 'Content-Type': 'application/json',
+ }, data=json.dumps(data).encode())['data']['stream-access'][0]
+
+ def _extract_formats(self, token_url, video_id):
+ token_doc = self._download_xml(
+ token_url, video_id, 'Downloading token',
+ headers=self.geo_verification_headers())
+
+ token_attrib = xpath_element(token_doc, './/token').attrib
+
+ if token_attrib['status'] != '0':
+ raise ExtractorError(
+ 'Token error: %s' % token_attrib['comment'], expected=True)
+
+ formats = self._extract_akamai_formats(
+ '%s?hdnea=%s' % (token_attrib['url'], token_attrib['auth']),
+ video_id)
+ self._sort_formats(formats)
+ return formats
def _real_extract(self, url):
video_id = self._match_id(url)
@@ -68,29 +93,16 @@ class Laola1TvEmbedIE(InfoExtractor):
else:
data_abo = urlencode_postdata(
dict((i, v) for i, v in enumerate(_v('req_liga_abos').split(','))))
- token_url = self._download_json(
- 'https://club.laola1.tv/sp/laola1/api/v3/user/session/premium/player/stream-access',
- video_id, query={
+ stream_access_url = update_url_query(
+ 'https://club.laola1.tv/sp/laola1/api/v3/user/session/premium/player/stream-access', {
'videoId': _v('id'),
'target': self._search_regex(r'vs_target = (\d+);', webpage, 'vs target'),
'label': _v('label'),
'area': _v('area'),
- }, data=data_abo)['data']['stream-access'][0]
-
- token_doc = self._download_xml(
- token_url, video_id, 'Downloading token',
- headers=self.geo_verification_headers())
-
- token_attrib = xpath_element(token_doc, './/token').attrib
-
- if token_attrib['status'] != '0':
- raise ExtractorError(
- 'Token error: %s' % token_attrib['comment'], expected=True)
+ })
+ token_url = self._extract_token_url(stream_access_url, video_id, data_abo)
- formats = self._extract_akamai_formats(
- '%s?hdnea=%s' % (token_attrib['url'], token_attrib['auth']),
- video_id)
- self._sort_formats(formats)
+ formats = self._extract_formats(token_url, video_id)
categories_str = _v('meta_sports')
categories = categories_str.split(',') if categories_str else []
@@ -107,7 +119,7 @@ class Laola1TvEmbedIE(InfoExtractor):
}
-class Laola1TvIE(InfoExtractor):
+class Laola1TvIE(Laola1TvEmbedIE):
IE_NAME = 'laola1tv'
_VALID_URL = r'https?://(?:www\.)?laola1\.tv/[a-z]+-[a-z]+/[^/]+/(?P<id>[^/?#&]+)'
_TESTS = [{
@@ -164,13 +176,42 @@ class Laola1TvIE(InfoExtractor):
if 'Dieser Livestream ist bereits beendet.' in webpage:
raise ExtractorError('This live stream has already finished.', expected=True)
- iframe_url = urljoin(url, self._search_regex(
- r'<iframe[^>]*?id="videoplayer"[^>]*?src="([^"]+)"',
- webpage, 'iframe url'))
+ conf = self._parse_json(self._search_regex(
+ r'(?s)conf\s*=\s*({.+?});', webpage, 'conf'),
+ display_id, js_to_json)
+
+ video_id = conf['videoid']
+
+ config = self._download_json(conf['configUrl'], video_id, query={
+ 'videoid': video_id,
+ 'partnerid': conf['partnerid'],
+ 'language': conf.get('language', ''),
+ 'portal': conf.get('portalid', ''),
+ })
+ error = config.get('error')
+ if error:
+ raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True)
+
+ video_data = config['video']
+ title = video_data['title']
+ is_live = video_data.get('isLivestream') and video_data.get('isLive')
+ meta = video_data.get('metaInformation')
+ sports = meta.get('sports')
+ categories = sports.split(',') if sports else []
+
+ token_url = self._extract_token_url(
+ video_data['streamAccess'], video_id,
+ video_data['abo']['required'])
+
+ formats = self._extract_formats(token_url, video_id)
return {
- '_type': 'url',
+ 'id': video_id,
'display_id': display_id,
- 'url': iframe_url,
- 'ie_key': 'Laola1TvEmbed',
+ 'title': self._live_title(title) if is_live else title,
+ 'description': video_data.get('description'),
+ 'thumbnail': video_data.get('image'),
+ 'categories': categories,
+ 'formats': formats,
+ 'is_live': is_live,
}
diff --git a/youtube_dl/extractor/leeco.py b/youtube_dl/extractor/leeco.py
index 9eda956d2..0a07c1320 100644
--- a/youtube_dl/extractor/leeco.py
+++ b/youtube_dl/extractor/leeco.py
@@ -23,7 +23,6 @@ from ..utils import (
str_or_none,
url_basename,
urshift,
- update_url_query,
)
@@ -51,7 +50,7 @@ class LeIE(InfoExtractor):
'id': '1415246',
'ext': 'mp4',
'title': '美人天下01',
- 'description': 'md5:f88573d9d7225ada1359eaf0dbf8bcda',
+ 'description': 'md5:28942e650e82ed4fcc8e4de919ee854d',
},
'params': {
'hls_prefer_native': True,
@@ -69,7 +68,6 @@ class LeIE(InfoExtractor):
'params': {
'hls_prefer_native': True,
},
- 'skip': 'Only available in China',
}, {
'url': 'http://sports.le.com/video/25737697.html',
'only_matching': True,
@@ -81,7 +79,7 @@ class LeIE(InfoExtractor):
'only_matching': True,
}]
- # ror() and calc_time_key() are reversed from a embedded swf file in KLetvPlayer.swf
+ # ror() and calc_time_key() are reversed from a embedded swf file in LetvPlayer.swf
def ror(self, param1, param2):
_loc3_ = 0
while _loc3_ < param2:
@@ -90,15 +88,8 @@ class LeIE(InfoExtractor):
return param1
def calc_time_key(self, param1):
- _loc2_ = 773625421
- _loc3_ = self.ror(param1, _loc2_ % 13)
- _loc3_ = _loc3_ ^ _loc2_
- _loc3_ = self.ror(_loc3_, _loc2_ % 17)
- return _loc3_
-
- # reversed from http://jstatic.letvcdn.com/sdk/player.js
- def get_mms_key(self, time):
- return self.ror(time, 8) ^ 185025305
+ _loc2_ = 185025305
+ return self.ror(param1, _loc2_ % 17) ^ _loc2_
# see M3U8Encryption class in KLetvPlayer.swf
@staticmethod
@@ -122,7 +113,7 @@ class LeIE(InfoExtractor):
def _check_errors(self, play_json):
# Check for errors
- playstatus = play_json['playstatus']
+ playstatus = play_json['msgs']['playstatus']
if playstatus['status'] == 0:
flag = playstatus['flag']
if flag == 1:
@@ -134,58 +125,31 @@ class LeIE(InfoExtractor):
media_id = self._match_id(url)
page = self._download_webpage(url, media_id)
- play_json_h5 = self._download_json(
- 'http://api.le.com/mms/out/video/playJsonH5',
- media_id, 'Downloading html5 playJson data', query={
- 'id': media_id,
- 'platid': 3,
- 'splatid': 304,
- 'format': 1,
- 'tkey': self.get_mms_key(int(time.time())),
- 'domain': 'www.le.com',
- 'tss': 'no',
- },
- headers=self.geo_verification_headers())
- self._check_errors(play_json_h5)
-
play_json_flash = self._download_json(
- 'http://api.le.com/mms/out/video/playJson',
+ 'http://player-pc.le.com/mms/out/video/playJson',
media_id, 'Downloading flash playJson data', query={
'id': media_id,
'platid': 1,
'splatid': 101,
'format': 1,
+ 'source': 1000,
'tkey': self.calc_time_key(int(time.time())),
'domain': 'www.le.com',
+ 'region': 'cn',
},
headers=self.geo_verification_headers())
self._check_errors(play_json_flash)
- def get_h5_urls(media_url, format_id):
- location = self._download_json(
- media_url, media_id,
- 'Download JSON metadata for format %s' % format_id, query={
- 'format': 1,
- 'expect': 3,
- 'tss': 'no',
- })['location']
-
- return {
- 'http': update_url_query(location, {'tss': 'no'}),
- 'hls': update_url_query(location, {'tss': 'ios'}),
- }
-
def get_flash_urls(media_url, format_id):
- media_url += '&' + compat_urllib_parse_urlencode({
- 'm3v': 1,
- 'format': 1,
- 'expect': 3,
- 'rateid': format_id,
- })
-
nodes_data = self._download_json(
media_url, media_id,
- 'Download JSON metadata for format %s' % format_id)
+ 'Download JSON metadata for format %s' % format_id,
+ query={
+ 'm3v': 1,
+ 'format': 1,
+ 'expect': 3,
+ 'tss': 'ios',
+ })
req = self._request_webpage(
nodes_data['nodelist'][0]['location'], media_id,
@@ -199,29 +163,28 @@ class LeIE(InfoExtractor):
extracted_formats = []
formats = []
- for play_json, get_urls in ((play_json_h5, get_h5_urls), (play_json_flash, get_flash_urls)):
- playurl = play_json['playurl']
- play_domain = playurl['domain'][0]
-
- for format_id, format_data in playurl.get('dispatch', []).items():
- if format_id in extracted_formats:
- continue
- extracted_formats.append(format_id)
-
- media_url = play_domain + format_data[0]
- for protocol, format_url in get_urls(media_url, format_id).items():
- f = {
- 'url': format_url,
- 'ext': determine_ext(format_data[1]),
- 'format_id': '%s-%s' % (protocol, format_id),
- 'protocol': 'm3u8_native' if protocol == 'hls' else 'http',
- 'quality': int_or_none(format_id),
- }
-
- if format_id[-1:] == 'p':
- f['height'] = int_or_none(format_id[:-1])
-
- formats.append(f)
+ playurl = play_json_flash['msgs']['playurl']
+ play_domain = playurl['domain'][0]
+
+ for format_id, format_data in playurl.get('dispatch', []).items():
+ if format_id in extracted_formats:
+ continue
+ extracted_formats.append(format_id)
+
+ media_url = play_domain + format_data[0]
+ for protocol, format_url in get_flash_urls(media_url, format_id).items():
+ f = {
+ 'url': format_url,
+ 'ext': determine_ext(format_data[1]),
+ 'format_id': '%s-%s' % (protocol, format_id),
+ 'protocol': 'm3u8_native' if protocol == 'hls' else 'http',
+ 'quality': int_or_none(format_id),
+ }
+
+ if format_id[-1:] == 'p':
+ f['height'] = int_or_none(format_id[:-1])
+
+ formats.append(f)
self._sort_formats(formats, ('height', 'quality', 'format_id'))
publish_time = parse_iso8601(self._html_search_regex(
diff --git a/youtube_dl/extractor/lego.py b/youtube_dl/extractor/lego.py
index d3bca6435..b312e77f1 100644
--- a/youtube_dl/extractor/lego.py
+++ b/youtube_dl/extractor/lego.py
@@ -86,7 +86,7 @@ class LEGOIE(InfoExtractor):
formats = self._extract_akamai_formats(
'%si/s/public/%s_,%s,.mp4.csmil/master.m3u8' % (streaming_base, path, streaming_path), video_id)
m3u8_formats = list(filter(
- lambda f: f.get('protocol') == 'm3u8_native' and f.get('vcodec') != 'none' and f.get('resolution') != 'multiple',
+ lambda f: f.get('protocol') == 'm3u8_native' and f.get('vcodec') != 'none',
formats))
if len(m3u8_formats) == len(self._BITRATES):
self._sort_formats(m3u8_formats)
diff --git a/youtube_dl/extractor/liveleak.py b/youtube_dl/extractor/liveleak.py
index c7de65353..c54519636 100644
--- a/youtube_dl/extractor/liveleak.py
+++ b/youtube_dl/extractor/liveleak.py
@@ -1,6 +1,5 @@
from __future__ import unicode_literals
-import json
import re
from .common import InfoExtractor
@@ -11,10 +10,10 @@ class LiveLeakIE(InfoExtractor):
_VALID_URL = r'https?://(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<id>[\w_]+)(?:.*)'
_TESTS = [{
'url': 'http://www.liveleak.com/view?i=757_1364311680',
- 'md5': '50f79e05ba149149c1b4ea961223d5b3',
+ 'md5': '0813c2430bea7a46bf13acf3406992f4',
'info_dict': {
'id': '757_1364311680',
- 'ext': 'flv',
+ 'ext': 'mp4',
'description': 'extremely bad day for this guy..!',
'uploader': 'ljfriel2',
'title': 'Most unlucky car accident',
@@ -22,7 +21,7 @@ class LiveLeakIE(InfoExtractor):
}
}, {
'url': 'http://www.liveleak.com/view?i=f93_1390833151',
- 'md5': 'b13a29626183c9d33944e6a04f41aafc',
+ 'md5': 'd3f1367d14cc3c15bf24fbfbe04b9abf',
'info_dict': {
'id': 'f93_1390833151',
'ext': 'mp4',
@@ -32,6 +31,7 @@ class LiveLeakIE(InfoExtractor):
'thumbnail': r're:^https?://.*\.jpg$'
}
}, {
+ # Prochan embed
'url': 'http://www.liveleak.com/view?i=4f7_1392687779',
'md5': '42c6d97d54f1db107958760788c5f48f',
'info_dict': {
@@ -41,11 +41,13 @@ class LiveLeakIE(InfoExtractor):
'uploader': 'CapObveus',
'title': 'Man is Fatally Struck by Reckless Car While Packing up a Moving Truck',
'age_limit': 18,
- }
+ },
+ 'skip': 'Video is dead',
}, {
# Covers https://github.com/rg3/youtube-dl/pull/5983
+ # Multiple resolutions
'url': 'http://www.liveleak.com/view?i=801_1409392012',
- 'md5': '0b3bec2d888c20728ca2ad3642f0ef15',
+ 'md5': 'c3a449dbaca5c0d1825caecd52a57d7b',
'info_dict': {
'id': '801_1409392012',
'ext': 'mp4',
@@ -93,57 +95,38 @@ class LiveLeakIE(InfoExtractor):
webpage, 'age limit', default=None))
video_thumbnail = self._og_search_thumbnail(webpage)
- sources_raw = self._search_regex(
- r'(?s)sources:\s*(\[.*?\]),', webpage, 'video URLs', default=None)
- if sources_raw is None:
- alt_source = self._search_regex(
- r'(file: ".*?"),', webpage, 'video URL', default=None)
- if alt_source:
- sources_raw = '[{ %s}]' % alt_source
- else:
- # Maybe an embed?
- embed_url = self._search_regex(
- r'<iframe[^>]+src="(https?://(?:www\.)?(?:prochan|youtube)\.com/embed[^"]+)"',
- webpage, 'embed URL')
- return {
- '_type': 'url_transparent',
- 'url': embed_url,
- 'id': video_id,
- 'title': video_title,
- 'description': video_description,
- 'uploader': video_uploader,
- 'age_limit': age_limit,
- }
+ entries = self._parse_html5_media_entries(url, webpage, video_id)
+ if not entries:
+ # Maybe an embed?
+ embed_url = self._search_regex(
+ r'<iframe[^>]+src="((?:https?:)?//(?:www\.)?(?:prochan|youtube)\.com/embed[^"]+)"',
+ webpage, 'embed URL')
+ return {
+ '_type': 'url_transparent',
+ 'url': embed_url,
+ 'id': video_id,
+ 'title': video_title,
+ 'description': video_description,
+ 'uploader': video_uploader,
+ 'age_limit': age_limit,
+ }
- sources_json = re.sub(r'\s([a-z]+):\s', r'"\1": ', sources_raw)
- sources = json.loads(sources_json)
+ info_dict = entries[0]
- formats = [{
- 'format_id': '%s' % i,
- 'format_note': s.get('label'),
- 'url': s['file'],
- } for i, s in enumerate(sources)]
+ for a_format in info_dict['formats']:
+ if not a_format.get('height'):
+ a_format['height'] = self._search_regex(
+ r'([0-9]+)p\.mp4', a_format['url'], 'height label', default=None)
- for i, s in enumerate(sources):
- # Removing '.h264_*.mp4' gives the raw video, which is essentially
- # the same video without the LiveLeak logo at the top (see
- # https://github.com/rg3/youtube-dl/pull/4768)
- orig_url = re.sub(r'\.h264_.+?\.mp4', '', s['file'])
- if s['file'] != orig_url:
- formats.append({
- 'format_id': 'original-%s' % i,
- 'format_note': s.get('label'),
- 'url': orig_url,
- 'preference': 1,
- })
- self._sort_formats(formats)
+ self._sort_formats(info_dict['formats'])
- return {
+ info_dict.update({
'id': video_id,
'title': video_title,
'description': video_description,
'uploader': video_uploader,
- 'formats': formats,
'age_limit': age_limit,
'thumbnail': video_thumbnail,
- }
+ })
+
+ return info_dict
diff --git a/youtube_dl/extractor/mediaset.py b/youtube_dl/extractor/mediaset.py
new file mode 100644
index 000000000..9760eafd5
--- /dev/null
+++ b/youtube_dl/extractor/mediaset.py
@@ -0,0 +1,118 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+ determine_ext,
+ parse_duration,
+ try_get,
+ unified_strdate,
+)
+
+
+class MediasetIE(InfoExtractor):
+ _VALID_URL = r'''(?x)
+ (?:
+ mediaset:|
+ https?://
+ (?:www\.)?video\.mediaset\.it/
+ (?:
+ (?:video|on-demand)/(?:[^/]+/)+[^/]+_|
+ player/playerIFrame(?:Twitter)?\.shtml\?.*?\bid=
+ )
+ )(?P<id>[0-9]+)
+ '''
+ _TESTS = [{
+ # full episode
+ 'url': 'http://www.video.mediaset.it/video/hello_goodbye/full/quarta-puntata_661824.html',
+ 'md5': '9b75534d42c44ecef7bf1ffeacb7f85d',
+ 'info_dict': {
+ 'id': '661824',
+ 'ext': 'mp4',
+ 'title': 'Quarta puntata',
+ 'description': 'md5:7183696d6df570e3412a5ef74b27c5e2',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'duration': 1414,
+ 'creator': 'mediaset',
+ 'upload_date': '20161107',
+ 'series': 'Hello Goodbye',
+ 'categories': ['reality'],
+ },
+ 'expected_warnings': ['is not a supported codec'],
+ }, {
+ # clip
+ 'url': 'http://www.video.mediaset.it/video/gogglebox/clip/un-grande-classico-della-commedia-sexy_661680.html',
+ 'only_matching': True,
+ }, {
+ # iframe simple
+ 'url': 'http://www.video.mediaset.it/player/playerIFrame.shtml?id=665924&autoplay=true',
+ 'only_matching': True,
+ }, {
+ # iframe twitter (from http://www.wittytv.it/se-prima-mi-fidavo-zero/)
+ 'url': 'https://www.video.mediaset.it/player/playerIFrameTwitter.shtml?id=665104&playrelated=false&autoplay=false&related=true&hidesocial=true',
+ 'only_matching': True,
+ }, {
+ 'url': 'mediaset:661824',
+ 'only_matching': True,
+ }]
+
+ @staticmethod
+ def _extract_urls(webpage):
+ return [
+ mobj.group('url')
+ for mobj in re.finditer(
+ r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>https?://(?:www\.)?video\.mediaset\.it/player/playerIFrame(?:Twitter)?\.shtml\?.*?\bid=\d+.*?)\1',
+ webpage)]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ video_list = self._download_json(
+ 'http://cdnsel01.mediaset.net/GetCdn.aspx',
+ video_id, 'Downloading video CDN JSON', query={
+ 'streamid': video_id,
+ 'format': 'json',
+ })['videoList']
+
+ formats = []
+ for format_url in video_list:
+ if '.ism' in format_url:
+ formats.extend(self._extract_ism_formats(
+ format_url, video_id, ism_id='mss', fatal=False))
+ else:
+ formats.append({
+ 'url': format_url,
+ 'format_id': determine_ext(format_url),
+ })
+ self._sort_formats(formats)
+
+ mediainfo = self._download_json(
+ 'http://plr.video.mediaset.it/html/metainfo.sjson',
+ video_id, 'Downloading video info JSON', query={
+ 'id': video_id,
+ })['video']
+
+ title = mediainfo['title']
+
+ creator = try_get(
+ mediainfo, lambda x: x['brand-info']['publisher'], compat_str)
+ category = try_get(
+ mediainfo, lambda x: x['brand-info']['category'], compat_str)
+ categories = [category] if category else None
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': mediainfo.get('short-description'),
+ 'thumbnail': mediainfo.get('thumbnail'),
+ 'duration': parse_duration(mediainfo.get('duration')),
+ 'creator': creator,
+ 'upload_date': unified_strdate(mediainfo.get('production-date')),
+ 'webpage_url': mediainfo.get('url'),
+ 'series': mediainfo.get('brand-value'),
+ 'categories': categories,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/mitele.py b/youtube_dl/extractor/mitele.py
index 28b743cca..964dc542c 100644
--- a/youtube_dl/extractor/mitele.py
+++ b/youtube_dl/extractor/mitele.py
@@ -136,11 +136,9 @@ class MiTeleIE(InfoExtractor):
video_id, 'Downloading gigya script')
# Get a appKey/uuid for getting the session key
- appKey_var = self._search_regex(
- r'value\s*\(\s*["\']appGridApplicationKey["\']\s*,\s*([0-9a-f]+)',
- gigya_sc, 'appKey variable')
appKey = self._search_regex(
- r'var\s+%s\s*=\s*["\']([0-9a-f]+)' % appKey_var, gigya_sc, 'appKey')
+ r'constant\s*\(\s*["\']_appGridApplicationKey["\']\s*,\s*["\']([0-9a-f]+)',
+ gigya_sc, 'appKey')
session_json = self._download_json(
'https://appgrid-api.cloud.accedo.tv/session',
diff --git a/youtube_dl/extractor/myspace.py b/youtube_dl/extractor/myspace.py
index f281238c9..e164d5940 100644
--- a/youtube_dl/extractor/myspace.py
+++ b/youtube_dl/extractor/myspace.py
@@ -12,64 +12,62 @@ from ..utils import (
class MySpaceIE(InfoExtractor):
- _VALID_URL = r'https?://myspace\.com/([^/]+)/(?P<mediatype>video/[^/]+/|music/song/.*?)(?P<id>\d+)'
+ _VALID_URL = r'''(?x)
+ https?://
+ myspace\.com/[^/]+/
+ (?P<mediatype>
+ video/[^/]+/(?P<video_id>\d+)|
+ music/song/[^/?#&]+-(?P<song_id>\d+)-\d+(?:[/?#&]|$)
+ )
+ '''
- _TESTS = [
- {
- 'url': 'https://myspace.com/fiveminutestothestage/video/little-big-town/109594919',
- 'md5': '9c1483c106f4a695c47d2911feed50a7',
- 'info_dict': {
- 'id': '109594919',
- 'ext': 'mp4',
- 'title': 'Little Big Town',
- 'description': 'This country quartet was all smiles while playing a sold out show at the Pacific Amphitheatre in Orange County, California.',
- 'uploader': 'Five Minutes to the Stage',
- 'uploader_id': 'fiveminutestothestage',
- 'timestamp': 1414108751,
- 'upload_date': '20141023',
- },
+ _TESTS = [{
+ 'url': 'https://myspace.com/fiveminutestothestage/video/little-big-town/109594919',
+ 'md5': '9c1483c106f4a695c47d2911feed50a7',
+ 'info_dict': {
+ 'id': '109594919',
+ 'ext': 'mp4',
+ 'title': 'Little Big Town',
+ 'description': 'This country quartet was all smiles while playing a sold out show at the Pacific Amphitheatre in Orange County, California.',
+ 'uploader': 'Five Minutes to the Stage',
+ 'uploader_id': 'fiveminutestothestage',
+ 'timestamp': 1414108751,
+ 'upload_date': '20141023',
},
+ }, {
# songs
- {
- 'url': 'https://myspace.com/killsorrow/music/song/of-weakened-soul...-93388656-103880681',
- 'md5': '1d7ee4604a3da226dd69a123f748b262',
- 'info_dict': {
- 'id': '93388656',
- 'ext': 'm4a',
- 'title': 'Of weakened soul...',
- 'uploader': 'Killsorrow',
- 'uploader_id': 'killsorrow',
- },
- }, {
- 'add_ie': ['Youtube'],
- 'url': 'https://myspace.com/threedaysgrace/music/song/animal-i-have-become-28400208-28218041',
- 'info_dict': {
- 'id': 'xqds0B_meys',
- 'ext': 'webm',
- 'title': 'Three Days Grace - Animal I Have Become',
- 'description': 'md5:8bd86b3693e72a077cf863a8530c54bb',
- 'uploader': 'ThreeDaysGraceVEVO',
- 'uploader_id': 'ThreeDaysGraceVEVO',
- 'upload_date': '20091002',
- },
- }, {
- 'add_ie': ['Youtube'],
- 'url': 'https://myspace.com/starset2/music/song/first-light-95799905-106964426',
- 'info_dict': {
- 'id': 'ypWvQgnJrSU',
- 'ext': 'mp4',
- 'title': 'Starset - First Light',
- 'description': 'md5:2d5db6c9d11d527683bcda818d332414',
- 'uploader': 'Yumi K',
- 'uploader_id': 'SorenPromotions',
- 'upload_date': '20140725',
- }
+ 'url': 'https://myspace.com/killsorrow/music/song/of-weakened-soul...-93388656-103880681',
+ 'md5': '1d7ee4604a3da226dd69a123f748b262',
+ 'info_dict': {
+ 'id': '93388656',
+ 'ext': 'm4a',
+ 'title': 'Of weakened soul...',
+ 'uploader': 'Killsorrow',
+ 'uploader_id': 'killsorrow',
},
- ]
+ }, {
+ 'add_ie': ['Youtube'],
+ 'url': 'https://myspace.com/threedaysgrace/music/song/animal-i-have-become-28400208-28218041',
+ 'info_dict': {
+ 'id': 'xqds0B_meys',
+ 'ext': 'webm',
+ 'title': 'Three Days Grace - Animal I Have Become',
+ 'description': 'md5:8bd86b3693e72a077cf863a8530c54bb',
+ 'uploader': 'ThreeDaysGraceVEVO',
+ 'uploader_id': 'ThreeDaysGraceVEVO',
+ 'upload_date': '20091002',
+ },
+ }, {
+ 'url': 'https://myspace.com/starset2/music/song/first-light-95799905-106964426',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://myspace.com/thelargemouthbassband/music/song/02-pure-eyes.mp3-94422330-105113388',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = mobj.group('video_id') or mobj.group('song_id')
is_song = mobj.group('mediatype').startswith('music/song')
webpage = self._download_webpage(url, video_id)
player_url = self._search_regex(
diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py
index d2a44d05d..62db70b43 100644
--- a/youtube_dl/extractor/nbc.py
+++ b/youtube_dl/extractor/nbc.py
@@ -5,10 +5,8 @@ import re
from .common import InfoExtractor
from .theplatform import ThePlatformIE
from .adobepass import AdobePassIE
-from ..compat import compat_urllib_parse_urlparse
from ..utils import (
find_xpath_attr,
- lowercase_escape,
smuggle_url,
unescapeHTML,
update_url_query,
@@ -17,7 +15,7 @@ from ..utils import (
class NBCIE(AdobePassIE):
- _VALID_URL = r'https?://(?:www\.)?nbc\.com/(?:[^/]+/)+(?P<id>n?\d+)'
+ _VALID_URL = r'(?P<permalink>https?://(?:www\.)?nbc\.com/[^/]+/video/[^/]+/(?P<id>n?\d+))'
_TESTS = [
{
@@ -37,16 +35,6 @@ class NBCIE(AdobePassIE):
},
},
{
- 'url': 'http://www.nbc.com/the-tonight-show/episodes/176',
- 'info_dict': {
- 'id': '176',
- 'ext': 'flv',
- 'title': 'Ricky Gervais, Steven Van Zandt, ILoveMakonnen',
- 'description': 'A brand new episode of The Tonight Show welcomes Ricky Gervais, Steven Van Zandt and ILoveMakonnen.',
- },
- 'skip': '404 Not Found',
- },
- {
'url': 'http://www.nbc.com/saturday-night-live/video/star-wars-teaser/2832821',
'info_dict': {
'id': '2832821',
@@ -64,11 +52,6 @@ class NBCIE(AdobePassIE):
'skip': 'Only works from US',
},
{
- # This video has expired but with an escaped embedURL
- 'url': 'http://www.nbc.com/parenthood/episode-guide/season-5/just-like-at-home/515',
- 'only_matching': True,
- },
- {
# HLS streams requires the 'hdnea3' cookie
'url': 'http://www.nbc.com/Kings/video/goliath/n1806',
'info_dict': {
@@ -88,59 +71,38 @@ class NBCIE(AdobePassIE):
]
def _real_extract(self, url):
- video_id = self._match_id(url)
- webpage = self._download_webpage(url, video_id)
- info = {
+ permalink, video_id = re.match(self._VALID_URL, url).groups()
+ video_data = self._download_json(
+ 'https://api.nbc.com/v3/videos', video_id, query={
+ 'filter[permalink]': permalink,
+ })['data'][0]['attributes']
+ query = {
+ 'mbr': 'true',
+ 'manifest': 'm3u',
+ }
+ video_id = video_data['guid']
+ title = video_data['title']
+ if video_data.get('entitlement') == 'auth':
+ resource = self._get_mvpd_resource(
+ 'nbcentertainment', title, video_id,
+ video_data.get('vChipRating'))
+ query['auth'] = self._extract_mvpd_auth(
+ url, video_id, 'nbcentertainment', resource)
+ theplatform_url = smuggle_url(update_url_query(
+ 'http://link.theplatform.com/s/NnzsPC/media/guid/2410887629/' + video_id,
+ query), {'force_smil_url': True})
+ return {
'_type': 'url_transparent',
- 'ie_key': 'ThePlatform',
'id': video_id,
+ 'title': title,
+ 'url': theplatform_url,
+ 'description': video_data.get('description'),
+ 'keywords': video_data.get('keywords'),
+ 'season_number': int_or_none(video_data.get('seasonNumber')),
+ 'episode_number': int_or_none(video_data.get('episodeNumber')),
+ 'series': video_data.get('showName'),
+ 'ie_key': 'ThePlatform',
}
- video_data = None
- preload = self._search_regex(
- r'PRELOAD\s*=\s*({.+})', webpage, 'preload data', default=None)
- if preload:
- preload_data = self._parse_json(preload, video_id)
- path = compat_urllib_parse_urlparse(url).path.rstrip('/')
- entity_id = preload_data.get('xref', {}).get(path)
- video_data = preload_data.get('entities', {}).get(entity_id)
- if video_data:
- query = {
- 'mbr': 'true',
- 'manifest': 'm3u',
- }
- video_id = video_data['guid']
- title = video_data['title']
- if video_data.get('entitlement') == 'auth':
- resource = self._get_mvpd_resource(
- 'nbcentertainment', title, video_id,
- video_data.get('vChipRating'))
- query['auth'] = self._extract_mvpd_auth(
- url, video_id, 'nbcentertainment', resource)
- theplatform_url = smuggle_url(update_url_query(
- 'http://link.theplatform.com/s/NnzsPC/media/guid/2410887629/' + video_id,
- query), {'force_smil_url': True})
- info.update({
- 'id': video_id,
- 'title': title,
- 'url': theplatform_url,
- 'description': video_data.get('description'),
- 'keywords': video_data.get('keywords'),
- 'season_number': int_or_none(video_data.get('seasonNumber')),
- 'episode_number': int_or_none(video_data.get('episodeNumber')),
- 'series': video_data.get('showName'),
- })
- else:
- theplatform_url = unescapeHTML(lowercase_escape(self._html_search_regex(
- [
- r'(?:class="video-player video-player-full" data-mpx-url|class="player" src)="(.*?)"',
- r'<iframe[^>]+src="((?:https?:)?//player\.theplatform\.com/[^"]+)"',
- r'"embedURL"\s*:\s*"([^"]+)"'
- ],
- webpage, 'theplatform url').replace('_no_endcard', '').replace('\\/', '/')))
- if theplatform_url.startswith('//'):
- theplatform_url = 'http:' + theplatform_url
- info['url'] = smuggle_url(theplatform_url, {'source_url': url})
- return info
class NBCSportsVPlayerIE(InfoExtractor):
diff --git a/youtube_dl/extractor/njpwworld.py b/youtube_dl/extractor/njpwworld.py
index f5e3f6815..9b5ad5a9f 100644
--- a/youtube_dl/extractor/njpwworld.py
+++ b/youtube_dl/extractor/njpwworld.py
@@ -6,6 +6,7 @@ import re
from .common import InfoExtractor
from ..compat import compat_urlparse
from ..utils import (
+ extract_attributes,
get_element_by_class,
urlencode_postdata,
)
@@ -56,17 +57,24 @@ class NJPWWorldIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
formats = []
- for player_url, kind in re.findall(r'<a[^>]+href="(/player[^"]+)".+?<img[^>]+src="[^"]+qf_btn_([^".]+)', webpage):
- player_url = compat_urlparse.urljoin(url, player_url)
-
+ for mobj in re.finditer(r'<a[^>]+\bhref=(["\'])/player.+?[^>]*>', webpage):
+ player = extract_attributes(mobj.group(0))
+ player_path = player.get('href')
+ if not player_path:
+ continue
+ kind = self._search_regex(
+ r'(low|high)$', player.get('class') or '', 'kind',
+ default='low')
+ player_url = compat_urlparse.urljoin(url, player_path)
player_page = self._download_webpage(
player_url, video_id, note='Downloading player page')
-
entries = self._parse_html5_media_entries(
player_url, player_page, video_id, m3u8_id='hls-%s' % kind,
- m3u8_entry_protocol='m3u8_native',
- preference=2 if 'hq' in kind else 1)
- formats.extend(entries[0]['formats'])
+ m3u8_entry_protocol='m3u8_native')
+ kind_formats = entries[0]['formats']
+ for f in kind_formats:
+ f['quality'] = 2 if kind == 'high' else 1
+ formats.extend(kind_formats)
self._sort_formats(formats)
diff --git a/youtube_dl/extractor/nonktube.py b/youtube_dl/extractor/nonktube.py
new file mode 100644
index 000000000..63e58aae2
--- /dev/null
+++ b/youtube_dl/extractor/nonktube.py
@@ -0,0 +1,33 @@
+from __future__ import unicode_literals
+
+from .nuevo import NuevoBaseIE
+
+
+class NonkTubeIE(NuevoBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?nonktube\.com/(?:(?:video|embed)/|media/nuevo/embed\.php\?.*?\bid=)(?P<id>\d+)'
+ _TESTS = [{
+ 'url': 'https://www.nonktube.com/video/118636/sensual-wife-uncensored-fucked-in-hairy-pussy-and-facialized',
+ 'info_dict': {
+ 'id': '118636',
+ 'ext': 'mp4',
+ 'title': 'Sensual Wife Uncensored Fucked In Hairy Pussy And Facialized',
+ 'age_limit': 18,
+ 'duration': 1150.98,
+ },
+ 'params': {
+ 'skip_download': True,
+ }
+ }, {
+ 'url': 'https://www.nonktube.com/embed/118636',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ info = self._extract_nuevo(
+ 'https://www.nonktube.com/media/nuevo/econfig.php?key=%s'
+ % video_id, video_id)
+
+ info['age_limit'] = 18
+ return info
diff --git a/youtube_dl/extractor/noovo.py b/youtube_dl/extractor/noovo.py
new file mode 100644
index 000000000..f7fa098a5
--- /dev/null
+++ b/youtube_dl/extractor/noovo.py
@@ -0,0 +1,97 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .brightcove import BrightcoveNewIE
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+ int_or_none,
+ smuggle_url,
+ try_get,
+)
+
+
+class NoovoIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:[^/]+\.)?noovo\.ca/videos/(?P<id>[^/]+/[^/?#&]+)'
+ _TESTS = [{
+ # clip
+ 'url': 'http://noovo.ca/videos/rpm-plus/chrysler-imperial',
+ 'info_dict': {
+ 'id': '5386045029001',
+ 'ext': 'mp4',
+ 'title': 'Chrysler Imperial',
+ 'description': 'md5:de3c898d1eb810f3e6243e08c8b4a056',
+ 'timestamp': 1491399228,
+ 'upload_date': '20170405',
+ 'uploader_id': '618566855001',
+ 'creator': 'vtele',
+ 'view_count': int,
+ 'series': 'RPM+',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ # episode
+ 'url': 'http://noovo.ca/videos/l-amour-est-dans-le-pre/episode-13-8',
+ 'info_dict': {
+ 'id': '5395865725001',
+ 'title': 'Épisode 13 : Les retrouvailles',
+ 'description': 'md5:336d5ebc5436534e61d16e63ddfca327',
+ 'ext': 'mp4',
+ 'timestamp': 1492019320,
+ 'upload_date': '20170412',
+ 'uploader_id': '618566855001',
+ 'creator': 'vtele',
+ 'view_count': int,
+ 'series': "L'amour est dans le pré",
+ 'season_number': 5,
+ 'episode': 'Épisode 13',
+ 'episode_number': 13,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }]
+ BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/618566855001/default_default/index.html?videoId=%s'
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ data = self._download_json(
+ 'http://api.noovo.ca/api/v1/pages/single-episode/%s' % video_id,
+ video_id)['data']
+
+ content = try_get(data, lambda x: x['contents'][0])
+
+ brightcove_id = data.get('brightcoveId') or content['brightcoveId']
+
+ series = try_get(
+ data, (
+ lambda x: x['show']['title'],
+ lambda x: x['season']['show']['title']),
+ compat_str)
+
+ episode = None
+ og = data.get('og')
+ if isinstance(og, dict) and og.get('type') == 'video.episode':
+ episode = og.get('title')
+
+ video = content or data
+
+ return {
+ '_type': 'url_transparent',
+ 'ie_key': BrightcoveNewIE.ie_key(),
+ 'url': smuggle_url(
+ self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id,
+ {'geo_countries': ['CA']}),
+ 'id': brightcove_id,
+ 'title': video.get('title'),
+ 'creator': video.get('source'),
+ 'view_count': int_or_none(video.get('viewsCount')),
+ 'series': series,
+ 'season_number': int_or_none(try_get(
+ data, lambda x: x['season']['seasonNumber'])),
+ 'episode': episode,
+ 'episode_number': int_or_none(data.get('episodeNumber')),
+ }
diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py
index 7fe79cb53..3b4f51f61 100644
--- a/youtube_dl/extractor/nrk.py
+++ b/youtube_dl/extractor/nrk.py
@@ -148,13 +148,34 @@ class NRKBaseIE(InfoExtractor):
vcodec = 'none' if data.get('mediaType') == 'Audio' else None
- # TODO: extract chapters when https://github.com/rg3/youtube-dl/pull/9409 is merged
-
for entry in entries:
entry.update(common_info)
for f in entry['formats']:
f['vcodec'] = vcodec
+ points = data.get('shortIndexPoints')
+ if isinstance(points, list):
+ chapters = []
+ for next_num, point in enumerate(points, start=1):
+ if not isinstance(point, dict):
+ continue
+ start_time = parse_duration(point.get('startPoint'))
+ if start_time is None:
+ continue
+ end_time = parse_duration(
+ data.get('duration')
+ if next_num == len(points)
+ else points[next_num].get('startPoint'))
+ if end_time is None:
+ continue
+ chapters.append({
+ 'start_time': start_time,
+ 'end_time': end_time,
+ 'title': point.get('title'),
+ })
+ if chapters and len(entries) == 1:
+ entries[0]['chapters'] = chapters
+
return self.playlist_result(entries, video_id, title, description)
diff --git a/youtube_dl/extractor/nuevo.py b/youtube_dl/extractor/nuevo.py
index 87fb94d1f..be1e09d37 100644
--- a/youtube_dl/extractor/nuevo.py
+++ b/youtube_dl/extractor/nuevo.py
@@ -10,9 +10,10 @@ from ..utils import (
class NuevoBaseIE(InfoExtractor):
- def _extract_nuevo(self, config_url, video_id):
+ def _extract_nuevo(self, config_url, video_id, headers={}):
config = self._download_xml(
- config_url, video_id, transform_source=lambda s: s.strip())
+ config_url, video_id, transform_source=lambda s: s.strip(),
+ headers=headers)
title = xpath_text(config, './title', 'title', fatal=True).strip()
video_id = xpath_text(config, './mediaid', default=video_id)
diff --git a/youtube_dl/extractor/orf.py b/youtube_dl/extractor/orf.py
index 1e2c54e68..cc296eabd 100644
--- a/youtube_dl/extractor/orf.py
+++ b/youtube_dl/extractor/orf.py
@@ -2,8 +2,6 @@
from __future__ import unicode_literals
import re
-import calendar
-import datetime
from .common import InfoExtractor
from ..compat import compat_str
@@ -144,77 +142,25 @@ class ORFTVthekIE(InfoExtractor):
}
-class ORFOE1IE(InfoExtractor):
- IE_NAME = 'orf:oe1'
- IE_DESC = 'Radio Österreich 1'
- _VALID_URL = r'https?://oe1\.orf\.at/(?:programm/|konsole\?.*?\btrack_id=)(?P<id>[0-9]+)'
-
- # Audios on ORF radio are only available for 7 days, so we can't add tests.
- _TESTS = [{
- 'url': 'http://oe1.orf.at/konsole?show=on_demand#?track_id=394211',
- 'only_matching': True,
- }, {
- 'url': 'http://oe1.orf.at/konsole?show=ondemand&track_id=443608&load_day=/programm/konsole/tag/20160726',
- 'only_matching': True,
- }]
-
- def _real_extract(self, url):
- show_id = self._match_id(url)
- data = self._download_json(
- 'http://oe1.orf.at/programm/%s/konsole' % show_id,
- show_id
- )
-
- timestamp = datetime.datetime.strptime('%s %s' % (
- data['item']['day_label'],
- data['item']['time']
- ), '%d.%m.%Y %H:%M')
- unix_timestamp = calendar.timegm(timestamp.utctimetuple())
-
- return {
- 'id': show_id,
- 'title': data['item']['title'],
- 'url': data['item']['url_stream'],
- 'ext': 'mp3',
- 'description': data['item'].get('info'),
- 'timestamp': unix_timestamp
- }
-
-
-class ORFFM4IE(InfoExtractor):
- IE_NAME = 'orf:fm4'
- IE_DESC = 'radio FM4'
- _VALID_URL = r'https?://fm4\.orf\.at/(?:7tage/?#|player/)(?P<date>[0-9]+)/(?P<show>\w+)'
-
- _TEST = {
- 'url': 'http://fm4.orf.at/player/20160110/IS/',
- 'md5': '01e736e8f1cef7e13246e880a59ad298',
- 'info_dict': {
- 'id': '2016-01-10_2100_tl_54_7DaysSun13_11244',
- 'ext': 'mp3',
- 'title': 'Im Sumpf',
- 'description': 'md5:384c543f866c4e422a55f66a62d669cd',
- 'duration': 7173,
- 'timestamp': 1452456073,
- 'upload_date': '20160110',
- },
- 'skip': 'Live streams on FM4 got deleted soon',
- }
-
+class ORFRadioIE(InfoExtractor):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
+ station = mobj.group('station')
show_date = mobj.group('date')
show_id = mobj.group('show')
+ if station == 'fm4':
+ show_id = '4%s' % show_id
+
data = self._download_json(
- 'http://audioapi.orf.at/fm4/json/2.0/broadcasts/%s/4%s' % (show_date, show_id),
+ 'http://audioapi.orf.at/%s/api/json/current/broadcast/%s/%s' % (station, show_id, show_date),
show_id
)
def extract_entry_dict(info, title, subtitle):
return {
'id': info['loopStreamId'].replace('.mp3', ''),
- 'url': 'http://loopstream01.apa.at/?channel=fm4&id=%s' % info['loopStreamId'],
+ 'url': 'http://loopstream01.apa.at/?channel=%s&id=%s' % (station, info['loopStreamId']),
'title': title,
'description': subtitle,
'duration': (info['end'] - info['start']) / 1000,
@@ -233,6 +179,47 @@ class ORFFM4IE(InfoExtractor):
}
+class ORFFM4IE(ORFRadioIE):
+ IE_NAME = 'orf:fm4'
+ IE_DESC = 'radio FM4'
+ _VALID_URL = r'https?://(?P<station>fm4)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)'
+
+ _TEST = {
+ 'url': 'http://fm4.orf.at/player/20170107/CC',
+ 'md5': '2b0be47375432a7ef104453432a19212',
+ 'info_dict': {
+ 'id': '2017-01-07_2100_tl_54_7DaysSat18_31295',
+ 'ext': 'mp3',
+ 'title': 'Solid Steel Radioshow',
+ 'description': 'Die Mixshow von Coldcut und Ninja Tune.',
+ 'duration': 3599,
+ 'timestamp': 1483819257,
+ 'upload_date': '20170107',
+ },
+ 'skip': 'Shows from ORF radios are only available for 7 days.'
+ }
+
+
+class ORFOE1IE(ORFRadioIE):
+ IE_NAME = 'orf:oe1'
+ IE_DESC = 'Radio Österreich 1'
+ _VALID_URL = r'https?://(?P<station>oe1)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)'
+
+ _TEST = {
+ 'url': 'http://oe1.orf.at/player/20170108/456544',
+ 'md5': '34d8a6e67ea888293741c86a099b745b',
+ 'info_dict': {
+ 'id': '2017-01-08_0759_tl_51_7DaysSun6_256141',
+ 'ext': 'mp3',
+ 'title': 'Morgenjournal',
+ 'duration': 609,
+ 'timestamp': 1483858796,
+ 'upload_date': '20170108',
+ },
+ 'skip': 'Shows from ORF radios are only available for 7 days.'
+ }
+
+
class ORFIPTVIE(InfoExtractor):
IE_NAME = 'orf:iptv'
IE_DESC = 'iptv.ORF.at'
diff --git a/youtube_dl/extractor/packtpub.py b/youtube_dl/extractor/packtpub.py
index 881f3bcc7..bb668c999 100644
--- a/youtube_dl/extractor/packtpub.py
+++ b/youtube_dl/extractor/packtpub.py
@@ -3,7 +3,10 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..compat import compat_str
+from ..compat import (
+ compat_str,
+ compat_HTTPError,
+)
from ..utils import (
clean_html,
ExtractorError,
@@ -11,6 +14,7 @@ from ..utils import (
strip_or_none,
unified_timestamp,
urljoin,
+ urlencode_postdata,
)
@@ -34,6 +38,32 @@ class PacktPubIE(PacktPubBaseIE):
'upload_date': '20170331',
},
}
+ _NETRC_MACHINE = 'packtpub'
+ _TOKEN = None
+
+ def _real_initialize(self):
+ (username, password) = self._get_login_info()
+ if username is None:
+ return
+ webpage = self._download_webpage(self._PACKT_BASE, None)
+ login_form = self._form_hidden_inputs(
+ 'packt-user-login-form', webpage)
+ login_form.update({
+ 'email': username,
+ 'password': password,
+ })
+ self._download_webpage(
+ self._PACKT_BASE, None, 'Logging in as %s' % username,
+ data=urlencode_postdata(login_form))
+ try:
+ self._TOKEN = self._download_json(
+ '%s/users/tokens/sessions' % self._MAPT_REST, None,
+ 'Downloading Authorization Token')['data']['token']
+ except ExtractorError as e:
+ if isinstance(e.cause, compat_HTTPError) and e.cause.code in (401, 404):
+ message = self._parse_json(e.cause.read().decode(), None)['message']
+ raise ExtractorError(message, expected=True)
+ raise
def _handle_error(self, response):
if response.get('status') != 'success':
@@ -51,14 +81,17 @@ class PacktPubIE(PacktPubBaseIE):
course_id, chapter_id, video_id = mobj.group(
'course_id', 'chapter_id', 'id')
+ headers = {}
+ if self._TOKEN:
+ headers['Authorization'] = self._TOKEN
video = self._download_json(
'%s/users/me/products/%s/chapters/%s/sections/%s'
% (self._MAPT_REST, course_id, chapter_id, video_id), video_id,
- 'Downloading JSON video')['data']
+ 'Downloading JSON video', headers=headers)['data']
content = video.get('content')
if not content:
- raise ExtractorError('This video is locked', expected=True)
+ self.raise_login_required('This video is locked')
video_url = content['file']
diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py
index 3e51b4dd7..16cc667d0 100644
--- a/youtube_dl/extractor/pbs.py
+++ b/youtube_dl/extractor/pbs.py
@@ -8,7 +8,9 @@ from ..utils import (
ExtractorError,
determine_ext,
int_or_none,
+ float_or_none,
js_to_json,
+ orderedSet,
strip_jsonp,
strip_or_none,
unified_strdate,
@@ -264,6 +266,13 @@ class PBSIE(InfoExtractor):
'playlist_count': 2,
},
{
+ 'url': 'http://www.pbs.org/wgbh/americanexperience/films/great-war/',
+ 'info_dict': {
+ 'id': 'great-war',
+ },
+ 'playlist_count': 3,
+ },
+ {
'url': 'http://www.pbs.org/wgbh/americanexperience/films/death/player/',
'info_dict': {
'id': '2276541483',
@@ -381,10 +390,10 @@ class PBSIE(InfoExtractor):
# tabbed frontline videos
MULTI_PART_REGEXES = (
r'<div[^>]+class="videotab[^"]*"[^>]+vid="(\d+)"',
- r'<a[^>]+href=["\']#video-\d+["\'][^>]+data-coveid=["\'](\d+)',
+ r'<a[^>]+href=["\']#(?:video-|part)\d+["\'][^>]+data-cove[Ii]d=["\'](\d+)',
)
for p in MULTI_PART_REGEXES:
- tabbed_videos = re.findall(p, webpage)
+ tabbed_videos = orderedSet(re.findall(p, webpage))
if tabbed_videos:
return tabbed_videos, presumptive_id, upload_date, description
@@ -464,6 +473,7 @@ class PBSIE(InfoExtractor):
redirects.append(redirect)
redirect_urls.add(redirect_url)
+ chapters = []
# Player pages may also serve different qualities
for page in ('widget/partnerplayer', 'portalplayer'):
player = self._download_webpage(
@@ -479,6 +489,20 @@ class PBSIE(InfoExtractor):
extract_redirect_urls(video_info)
if not info:
info = video_info
+ if not chapters:
+ for chapter_data in re.findall(r'(?s)chapters\.push\(({.*?})\)', player):
+ chapter = self._parse_json(chapter_data, video_id, js_to_json, fatal=False)
+ if not chapter:
+ continue
+ start_time = float_or_none(chapter.get('start_time'), 1000)
+ duration = float_or_none(chapter.get('duration'), 1000)
+ if start_time is None or duration is None:
+ continue
+ chapters.append({
+ 'start_time': start_time,
+ 'end_time': start_time + duration,
+ 'title': chapter.get('title'),
+ })
formats = []
http_url = None
@@ -515,7 +539,7 @@ class PBSIE(InfoExtractor):
http_url = format_url
self._remove_duplicate_formats(formats)
m3u8_formats = list(filter(
- lambda f: f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none' and f.get('resolution') != 'multiple',
+ lambda f: f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none',
formats))
if http_url:
for m3u8_format in m3u8_formats:
@@ -588,4 +612,5 @@ class PBSIE(InfoExtractor):
'upload_date': upload_date,
'formats': formats,
'subtitles': subtitles,
+ 'chapters': chapters,
}
diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py
index b25f1f193..1dcc8df00 100644
--- a/youtube_dl/extractor/pornhub.py
+++ b/youtube_dl/extractor/pornhub.py
@@ -33,7 +33,7 @@ class PornHubIE(InfoExtractor):
_VALID_URL = r'''(?x)
https?://
(?:
- (?:[a-z]+\.)?pornhub\.com/(?:view_video\.php\?viewkey=|embed/)|
+ (?:[a-z]+\.)?pornhub\.com/(?:(?:view_video\.php|video/show)\?viewkey=|embed/)|
(?:www\.)?thumbzilla\.com/video/
)
(?P<id>[\da-z]+)
@@ -97,6 +97,9 @@ class PornHubIE(InfoExtractor):
}, {
'url': 'https://www.thumbzilla.com/video/ph56c6114abd99a/horny-girlfriend-sex',
'only_matching': True,
+ }, {
+ 'url': 'http://www.pornhub.com/video/show?viewkey=648719015',
+ 'only_matching': True,
}]
@staticmethod
diff --git a/youtube_dl/extractor/r7.py b/youtube_dl/extractor/r7.py
index ed38c77eb..e2202d603 100644
--- a/youtube_dl/extractor/r7.py
+++ b/youtube_dl/extractor/r7.py
@@ -62,8 +62,7 @@ class R7IE(InfoExtractor):
# m3u8 format always matches the http format, let's copy metadata from
# one to another
m3u8_formats = list(filter(
- lambda f: f.get('vcodec') != 'none' and f.get('resolution') != 'multiple',
- formats))
+ lambda f: f.get('vcodec') != 'none', formats))
if len(m3u8_formats) == 1:
f_copy = m3u8_formats[0].copy()
f_copy.update(f)
diff --git a/youtube_dl/extractor/rmcdecouverte.py b/youtube_dl/extractor/rmcdecouverte.py
index 2340dae53..e921ca3e6 100644
--- a/youtube_dl/extractor/rmcdecouverte.py
+++ b/youtube_dl/extractor/rmcdecouverte.py
@@ -13,21 +13,20 @@ class RMCDecouverteIE(InfoExtractor):
_VALID_URL = r'https?://rmcdecouverte\.bfmtv\.com/mediaplayer-replay.*?\bid=(?P<id>\d+)'
_TEST = {
- 'url': 'http://rmcdecouverte.bfmtv.com/mediaplayer-replay/?id=1430&title=LES%20HEROS%20DU%2088e%20ETAGE',
+ 'url': 'http://rmcdecouverte.bfmtv.com/mediaplayer-replay/?id=13502&title=AQUAMEN:LES%20ROIS%20DES%20AQUARIUMS%20:UN%20DELICIEUX%20PROJET',
'info_dict': {
- 'id': '5111223049001',
+ 'id': '5419055995001',
'ext': 'mp4',
- 'title': ': LES HEROS DU 88e ETAGE',
- 'description': 'Découvrez comment la bravoure de deux hommes dans la Tour Nord du World Trade Center a sauvé la vie d\'innombrables personnes le 11 septembre 2001.',
+ 'title': 'UN DELICIEUX PROJET',
+ 'description': 'md5:63610df7c8b1fc1698acd4d0d90ba8b5',
'uploader_id': '1969646226001',
- 'upload_date': '20160904',
- 'timestamp': 1472951103,
+ 'upload_date': '20170502',
+ 'timestamp': 1493745308,
},
'params': {
- # rtmp download
'skip_download': True,
},
- 'skip': 'Only works from France',
+ 'skip': 'only available for a week',
}
BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1969646226001/default_default/index.html?videoId=%s'
@@ -35,5 +34,12 @@ class RMCDecouverteIE(InfoExtractor):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
brightcove_legacy_url = BrightcoveLegacyIE._extract_brightcove_url(webpage)
- brightcove_id = compat_parse_qs(compat_urlparse.urlparse(brightcove_legacy_url).query)['@videoPlayer'][0]
- return self.url_result(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', brightcove_id)
+ if brightcove_legacy_url:
+ brightcove_id = compat_parse_qs(compat_urlparse.urlparse(
+ brightcove_legacy_url).query)['@videoPlayer'][0]
+ else:
+ brightcove_id = self._search_regex(
+ r'data-video-id=["\'](\d+)', webpage, 'brightcove id')
+ return self.url_result(
+ self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew',
+ brightcove_id)
diff --git a/youtube_dl/extractor/streamcz.py b/youtube_dl/extractor/streamcz.py
index 9e533103c..58e0b4c80 100644
--- a/youtube_dl/extractor/streamcz.py
+++ b/youtube_dl/extractor/streamcz.py
@@ -26,7 +26,7 @@ class StreamCZIE(InfoExtractor):
_TESTS = [{
'url': 'http://www.stream.cz/peklonataliri/765767-ecka-pro-deti',
- 'md5': '6d3ca61a8d0633c9c542b92fcb936b0c',
+ 'md5': '934bb6a6d220d99c010783c9719960d5',
'info_dict': {
'id': '765767',
'ext': 'mp4',
@@ -37,7 +37,7 @@ class StreamCZIE(InfoExtractor):
},
}, {
'url': 'http://www.stream.cz/blanik/10002447-tri-roky-pro-mazanka',
- 'md5': 'e54a254fb8b871968fd8403255f28589',
+ 'md5': '849a88c1e1ca47d41403c2ba5e59e261',
'info_dict': {
'id': '10002447',
'ext': 'mp4',
@@ -85,6 +85,14 @@ class StreamCZIE(InfoExtractor):
else:
title = data['name']
+ subtitles = {}
+ srt_url = data.get('subtitles_srt')
+ if srt_url:
+ subtitles['cs'] = [{
+ 'ext': 'srt',
+ 'url': srt_url,
+ }]
+
return {
'id': video_id,
'title': title,
@@ -93,4 +101,5 @@ class StreamCZIE(InfoExtractor):
'description': data.get('web_site_text'),
'duration': int_or_none(data.get('duration')),
'view_count': int_or_none(data.get('views')),
+ 'subtitles': subtitles,
}
diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py
index 1b1afab32..3f3c681ae 100644
--- a/youtube_dl/extractor/ted.py
+++ b/youtube_dl/extractor/ted.py
@@ -210,7 +210,7 @@ class TEDIE(InfoExtractor):
resources.get('stream'), video_name, 'mp4', m3u8_id=format_id, fatal=False))
m3u8_formats = list(filter(
- lambda f: f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none' and f.get('resolution') != 'multiple',
+ lambda f: f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none',
formats))
if http_url:
for m3u8_format in m3u8_formats:
diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py
index 9a424b1c6..de236bbba 100644
--- a/youtube_dl/extractor/theplatform.py
+++ b/youtube_dl/extractor/theplatform.py
@@ -80,14 +80,33 @@ class ThePlatformBaseIE(OnceIE):
'url': src,
})
+ duration = info.get('duration')
+ tp_chapters = info.get('chapters', [])
+ chapters = []
+ if tp_chapters:
+ def _add_chapter(start_time, end_time):
+ start_time = float_or_none(start_time, 1000)
+ end_time = float_or_none(end_time, 1000)
+ if start_time is None or end_time is None:
+ return
+ chapters.append({
+ 'start_time': start_time,
+ 'end_time': end_time,
+ })
+
+ for chapter in tp_chapters[:-1]:
+ _add_chapter(chapter.get('startTime'), chapter.get('endTime'))
+ _add_chapter(tp_chapters[-1].get('startTime'), tp_chapters[-1].get('endTime') or duration)
+
return {
'title': info['title'],
'subtitles': subtitles,
'description': info['description'],
'thumbnail': info['defaultThumbnailUrl'],
- 'duration': int_or_none(info.get('duration'), 1000),
+ 'duration': float_or_none(duration, 1000),
'timestamp': int_or_none(info.get('pubDate'), 1000) or None,
'uploader': info.get('billingCode'),
+ 'chapters': chapters,
}
def _extract_theplatform_metadata(self, path, video_id):
diff --git a/youtube_dl/extractor/thescene.py b/youtube_dl/extractor/thescene.py
index b8504f0eb..cd642355c 100644
--- a/youtube_dl/extractor/thescene.py
+++ b/youtube_dl/extractor/thescene.py
@@ -3,10 +3,6 @@ from __future__ import unicode_literals
from .common import InfoExtractor
from ..compat import compat_urlparse
-from ..utils import (
- int_or_none,
- qualities,
-)
class TheSceneIE(InfoExtractor):
@@ -24,6 +20,9 @@ class TheSceneIE(InfoExtractor):
'season': 'Ready To Wear Spring 2013',
'tags': list,
'categories': list,
+ 'upload_date': '20120913',
+ 'timestamp': 1347512400,
+ 'uploader': 'vogue',
},
}
@@ -37,32 +36,9 @@ class TheSceneIE(InfoExtractor):
self._html_search_regex(
r'id=\'js-player-script\'[^>]+src=\'(.+?)\'', webpage, 'player url'))
- player = self._download_webpage(player_url, display_id)
- info = self._parse_json(
- self._search_regex(
- r'(?m)video\s*:\s*({.+?}),$', player, 'info json'),
- display_id)
-
- video_id = info['id']
- title = info['title']
-
- qualities_order = qualities(('low', 'high'))
- formats = [{
- 'format_id': '{0}-{1}'.format(f['type'].split('/')[0], f['quality']),
- 'url': f['src'],
- 'quality': qualities_order(f['quality']),
- } for f in info['sources']]
- self._sort_formats(formats)
-
return {
- 'id': video_id,
+ '_type': 'url_transparent',
'display_id': display_id,
- 'title': title,
- 'formats': formats,
- 'thumbnail': info.get('poster_frame'),
- 'duration': int_or_none(info.get('duration')),
- 'series': info.get('series_title'),
- 'season': info.get('season_title'),
- 'tags': info.get('tags'),
- 'categories': info.get('categories'),
+ 'url': player_url,
+ 'ie_key': 'CondeNast',
}
diff --git a/youtube_dl/extractor/toggle.py b/youtube_dl/extractor/toggle.py
index c54b876d3..348d6ecdf 100644
--- a/youtube_dl/extractor/toggle.py
+++ b/youtube_dl/extractor/toggle.py
@@ -17,7 +17,7 @@ from ..utils import (
class ToggleIE(InfoExtractor):
IE_NAME = 'toggle'
- _VALID_URL = r'https?://video\.toggle\.sg/(?:en|zh)/(?:series|clips|movies)/(?:[^/]+/)+(?P<id>[0-9]+)'
+ _VALID_URL = r'https?://video\.toggle\.sg/(?:en|zh)/(?:[^/]+/){2,}(?P<id>[0-9]+)'
_TESTS = [{
'url': 'http://video.toggle.sg/en/series/lion-moms-tif/trailers/lion-moms-premier/343115',
'info_dict': {
@@ -73,6 +73,12 @@ class ToggleIE(InfoExtractor):
}, {
'url': 'http://video.toggle.sg/en/movies/seven-days/321936',
'only_matching': True,
+ }, {
+ 'url': 'https://video.toggle.sg/en/tv-show/news/may-2017-cna-singapore-tonight/fri-19-may-2017/512456',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://video.toggle.sg/en/channels/eleven-plus/401585',
+ 'only_matching': True,
}]
_FORMAT_PREFERENCES = {
diff --git a/youtube_dl/extractor/toypics.py b/youtube_dl/extractor/toypics.py
index 938e05076..f705a06c9 100644
--- a/youtube_dl/extractor/toypics.py
+++ b/youtube_dl/extractor/toypics.py
@@ -6,42 +6,48 @@ import re
class ToypicsIE(InfoExtractor):
- IE_DESC = 'Toypics user profile'
- _VALID_URL = r'https?://videos\.toypics\.net/view/(?P<id>[0-9]+)/.*'
+ IE_DESC = 'Toypics video'
+ _VALID_URL = r'https?://videos\.toypics\.net/view/(?P<id>[0-9]+)'
_TEST = {
'url': 'http://videos.toypics.net/view/514/chancebulged,-2-1/',
'md5': '16e806ad6d6f58079d210fe30985e08b',
'info_dict': {
'id': '514',
'ext': 'mp4',
- 'title': 'Chance-Bulge\'d, 2',
+ 'title': "Chance-Bulge'd, 2",
'age_limit': 18,
'uploader': 'kidsune',
}
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
- page = self._download_webpage(url, video_id)
- video_url = self._html_search_regex(
- r'src:\s+"(http://static[0-9]+\.toypics\.net/flvideo/[^"]+)"', page, 'video URL')
- title = self._html_search_regex(
- r'<title>Toypics - ([^<]+)</title>', page, 'title')
- username = self._html_search_regex(
- r'toypics.net/([^/"]+)" class="user-name">', page, 'username')
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ formats = self._parse_html5_media_entries(
+ url, webpage, video_id)[0]['formats']
+ title = self._html_search_regex([
+ r'<h1[^>]+class=["\']view-video-title[^>]+>([^<]+)</h',
+ r'<title>([^<]+) - Toypics</title>',
+ ], webpage, 'title')
+
+ uploader = self._html_search_regex(
+ r'More videos from <strong>([^<]+)</strong>', webpage, 'uploader',
+ fatal=False)
+
return {
'id': video_id,
- 'url': video_url,
+ 'formats': formats,
'title': title,
- 'uploader': username,
+ 'uploader': uploader,
'age_limit': 18,
}
class ToypicsUserIE(InfoExtractor):
IE_DESC = 'Toypics user profile'
- _VALID_URL = r'https?://videos\.toypics\.net/(?P<username>[^/?]+)(?:$|[?#])'
+ _VALID_URL = r'https?://videos\.toypics\.net/(?!view)(?P<id>[^/?#&]+)'
_TEST = {
'url': 'http://videos.toypics.net/Mikey',
'info_dict': {
@@ -51,8 +57,7 @@ class ToypicsUserIE(InfoExtractor):
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- username = mobj.group('username')
+ username = self._match_id(url)
profile_page = self._download_webpage(
url, username, note='Retrieving profile page')
@@ -71,7 +76,7 @@ class ToypicsUserIE(InfoExtractor):
note='Downloading page %d/%d' % (n, page_count))
urls.extend(
re.findall(
- r'<p class="video-entry-title">\s+<a href="(https?://videos.toypics.net/view/[^"]+)">',
+ r'<div[^>]+class=["\']preview[^>]+>\s*<a[^>]+href="(https?://videos\.toypics\.net/view/[^"]+)"',
lpage))
return {
diff --git a/youtube_dl/extractor/turner.py b/youtube_dl/extractor/turner.py
index 1c0be9fc6..efeb677ee 100644
--- a/youtube_dl/extractor/turner.py
+++ b/youtube_dl/extractor/turner.py
@@ -13,6 +13,7 @@ from ..utils import (
xpath_attr,
update_url_query,
ExtractorError,
+ strip_or_none,
)
@@ -163,17 +164,21 @@ class TurnerBaseIE(AdobePassIE):
'height': int_or_none(image.get('height')),
} for image in video_data.findall('images/image')]
+ is_live = xpath_text(video_data, 'isLive') == 'true'
+
return {
'id': video_id,
- 'title': title,
+ 'title': self._live_title(title) if is_live else title,
'formats': formats,
'subtitles': subtitles,
'thumbnails': thumbnails,
- 'description': xpath_text(video_data, 'description'),
+ 'thumbnail': xpath_text(video_data, 'poster'),
+ 'description': strip_or_none(xpath_text(video_data, 'description')),
'duration': parse_duration(xpath_text(video_data, 'length') or xpath_text(video_data, 'trt')),
'timestamp': self._extract_timestamp(video_data),
'upload_date': xpath_attr(video_data, 'metas', 'version'),
'series': xpath_text(video_data, 'showTitle'),
'season_number': int_or_none(xpath_text(video_data, 'seasonNumber')),
'episode_number': int_or_none(xpath_text(video_data, 'episodeNumber')),
+ 'is_live': is_live,
}
diff --git a/youtube_dl/extractor/tvp.py b/youtube_dl/extractor/tvp.py
index 06ea2b40a..c5b3288ad 100644
--- a/youtube_dl/extractor/tvp.py
+++ b/youtube_dl/extractor/tvp.py
@@ -150,8 +150,7 @@ class TVPEmbedIE(InfoExtractor):
'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)
self._sort_formats(m3u8_formats)
m3u8_formats = list(filter(
- lambda f: f.get('vcodec') != 'none' and f.get('resolution') != 'multiple',
- m3u8_formats))
+ lambda f: f.get('vcodec') != 'none', m3u8_formats))
formats.extend(m3u8_formats)
for i, m3u8_format in enumerate(m3u8_formats, 2):
http_url = '%s-%d.mp4' % (video_url_base, i)
diff --git a/youtube_dl/extractor/tvplayer.py b/youtube_dl/extractor/tvplayer.py
index b6537141a..ebde6053f 100644
--- a/youtube_dl/extractor/tvplayer.py
+++ b/youtube_dl/extractor/tvplayer.py
@@ -2,9 +2,13 @@
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..compat import compat_HTTPError
+from ..compat import (
+ compat_HTTPError,
+ compat_str,
+)
from ..utils import (
extract_attributes,
+ try_get,
urlencode_postdata,
ExtractorError,
)
@@ -34,25 +38,32 @@ class TVPlayerIE(InfoExtractor):
webpage, 'channel element'))
title = current_channel['data-name']
- resource_id = self._search_regex(
- r'resourceId\s*=\s*"(\d+)"', webpage, 'resource id')
- platform = self._search_regex(
- r'platform\s*=\s*"([^"]+)"', webpage, 'platform')
+ resource_id = current_channel['data-id']
+
token = self._search_regex(
- r'token\s*=\s*"([^"]+)"', webpage, 'token', default='null')
- validate = self._search_regex(
- r'validate\s*=\s*"([^"]+)"', webpage, 'validate', default='null')
+ r'data-token=(["\'])(?P<token>(?!\1).+)\1', webpage,
+ 'token', group='token')
+
+ context = self._download_json(
+ 'https://tvplayer.com/watch/context', display_id,
+ 'Downloading JSON context', query={
+ 'resource': resource_id,
+ 'nonce': token,
+ })
+
+ validate = context['validate']
+ platform = try_get(
+ context, lambda x: x['platform']['key'], compat_str) or 'firefox'
try:
response = self._download_json(
'http://api.tvplayer.com/api/v2/stream/live',
- resource_id, headers={
+ display_id, 'Downloading JSON stream', headers={
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
}, data=urlencode_postdata({
+ 'id': resource_id,
'service': 1,
'platform': platform,
- 'id': resource_id,
- 'token': token,
'validate': validate,
}))['tvplayer']['response']
except ExtractorError as e:
@@ -63,7 +74,7 @@ class TVPlayerIE(InfoExtractor):
'%s said: %s' % (self.IE_NAME, response['error']), expected=True)
raise
- formats = self._extract_m3u8_formats(response['stream'], resource_id, 'mp4')
+ formats = self._extract_m3u8_formats(response['stream'], display_id, 'mp4')
self._sort_formats(formats)
return {
diff --git a/youtube_dl/extractor/upskill.py b/youtube_dl/extractor/upskill.py
new file mode 100644
index 000000000..30297b4dd
--- /dev/null
+++ b/youtube_dl/extractor/upskill.py
@@ -0,0 +1,176 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from .wistia import WistiaIE
+from ..compat import compat_str
+from ..utils import (
+ clean_html,
+ ExtractorError,
+ get_element_by_class,
+ urlencode_postdata,
+ urljoin,
+)
+
+
+class UpskillBaseIE(InfoExtractor):
+ _LOGIN_URL = 'http://upskillcourses.com/sign_in'
+ _NETRC_MACHINE = 'upskill'
+
+ def _real_initialize(self):
+ self._login()
+
+ def _login(self):
+ username, password = self._get_login_info()
+ if username is None:
+ return
+
+ login_page, urlh = self._download_webpage_handle(
+ self._LOGIN_URL, None, 'Downloading login page')
+
+ login_url = compat_str(urlh.geturl())
+
+ login_form = self._hidden_inputs(login_page)
+
+ login_form.update({
+ 'user[email]': username,
+ 'user[password]': password,
+ })
+
+ post_url = self._search_regex(
+ r'<form[^>]+action=(["\'])(?P<url>(?:(?!\1).)+)\1', login_page,
+ 'post url', default=login_url, group='url')
+
+ if not post_url.startswith('http'):
+ post_url = urljoin(login_url, post_url)
+
+ response = self._download_webpage(
+ post_url, None, 'Logging in',
+ data=urlencode_postdata(login_form),
+ headers={
+ 'Content-Type': 'application/x-www-form-urlencoded',
+ 'Referer': login_url,
+ })
+
+ # Successful login
+ if any(re.search(p, response) for p in (
+ r'class=["\']user-signout',
+ r'<a[^>]+\bhref=["\']/sign_out',
+ r'>\s*Log out\s*<')):
+ return
+
+ message = get_element_by_class('alert', response)
+ if message is not None:
+ raise ExtractorError(
+ 'Unable to login: %s' % clean_html(message), expected=True)
+
+ raise ExtractorError('Unable to log in')
+
+
+class UpskillIE(UpskillBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?upskillcourses\.com/courses/[^/]+/lectures/(?P<id>\d+)'
+
+ _TESTS = [{
+ 'url': 'http://upskillcourses.com/courses/essential-web-developer-course/lectures/1747100',
+ 'info_dict': {
+ 'id': 'uzw6zw58or',
+ 'ext': 'mp4',
+ 'title': 'Welcome to the Course!',
+ 'description': 'md5:8d66c13403783370af62ca97a7357bdd',
+ 'duration': 138.763,
+ 'timestamp': 1479846621,
+ 'upload_date': '20161122',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'http://upskillcourses.com/courses/119763/lectures/1747100',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ wistia_url = WistiaIE._extract_url(webpage)
+ if not wistia_url:
+ if any(re.search(p, webpage) for p in (
+ r'class=["\']lecture-contents-locked',
+ r'>\s*Lecture contents locked',
+ r'id=["\']lecture-locked')):
+ self.raise_login_required('Lecture contents locked')
+
+ title = self._og_search_title(webpage, default=None)
+
+ return {
+ '_type': 'url_transparent',
+ 'url': wistia_url,
+ 'ie_key': WistiaIE.ie_key(),
+ 'title': title,
+ }
+
+
+class UpskillCourseIE(UpskillBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?upskillcourses\.com/courses/(?:enrolled/)?(?P<id>[^/?#&]+)'
+ _TESTS = [{
+ 'url': 'http://upskillcourses.com/courses/essential-web-developer-course/',
+ 'info_dict': {
+ 'id': '119763',
+ 'title': 'The Essential Web Developer Course (Free)',
+ },
+ 'playlist_count': 192,
+ }, {
+ 'url': 'http://upskillcourses.com/courses/119763/',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://upskillcourses.com/courses/enrolled/119763',
+ 'only_matching': True,
+ }]
+
+ @classmethod
+ def suitable(cls, url):
+ return False if UpskillIE.suitable(url) else super(
+ UpskillCourseIE, cls).suitable(url)
+
+ def _real_extract(self, url):
+ course_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, course_id)
+
+ course_id = self._search_regex(
+ r'data-course-id=["\'](\d+)', webpage, 'course id',
+ default=course_id)
+
+ entries = []
+
+ for mobj in re.finditer(
+ r'(?s)(?P<li><li[^>]+class=(["\'])(?:(?!\2).)*?section-item[^>]+>.+?</li>)',
+ webpage):
+ li = mobj.group('li')
+ if 'fa-youtube-play' not in li:
+ continue
+ lecture_url = self._search_regex(
+ r'<a[^>]+href=(["\'])(?P<url>(?:(?!\1).)+)\1', li,
+ 'lecture url', default=None, group='url')
+ if not lecture_url:
+ continue
+ lecture_id = self._search_regex(
+ r'/lectures/(\d+)', lecture_url, 'lecture id', default=None)
+ title = self._html_search_regex(
+ r'<span[^>]+class=["\']lecture-name[^>]+>([^<]+)', li,
+ 'title', default=None)
+ entries.append(
+ self.url_result(
+ urljoin('http://upskillcourses.com/', lecture_url),
+ ie=UpskillIE.ie_key(), video_id=lecture_id,
+ video_title=clean_html(title)))
+
+ course_title = self._html_search_regex(
+ (r'(?s)<img[^>]+class=["\']course-image[^>]+>\s*<h\d>(.+?)</h',
+ r'(?s)<h\d[^>]+class=["\']course-title[^>]+>(.+?)</h'),
+ webpage, 'course title', fatal=False)
+
+ return self.playlist_result(entries, course_id, course_title)
diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py
index 9aa38bc5a..890a149ea 100644
--- a/youtube_dl/extractor/vevo.py
+++ b/youtube_dl/extractor/vevo.py
@@ -1,6 +1,7 @@
from __future__ import unicode_literals
import re
+import json
from .common import InfoExtractor
from ..compat import (
@@ -11,7 +12,6 @@ from ..compat import (
from ..utils import (
ExtractorError,
int_or_none,
- sanitized_Request,
parse_iso8601,
)
@@ -154,19 +154,24 @@ class VevoIE(VevoBaseIE):
}
def _initialize_api(self, video_id):
- req = sanitized_Request(
- 'http://www.vevo.com/auth', data=b'')
webpage = self._download_webpage(
- req, None,
+ 'https://accounts.vevo.com/token', None,
note='Retrieving oauth token',
- errnote='Unable to retrieve oauth token')
+ errnote='Unable to retrieve oauth token',
+ data=json.dumps({
+ 'client_id': 'SPupX1tvqFEopQ1YS6SS',
+ 'grant_type': 'urn:vevo:params:oauth:grant-type:anonymous',
+ }).encode('utf-8'),
+ headers={
+ 'Content-Type': 'application/json',
+ })
if re.search(r'(?i)THIS PAGE IS CURRENTLY UNAVAILABLE IN YOUR REGION', webpage):
self.raise_geo_restricted(
'%s said: This page is currently unavailable in your region' % self.IE_NAME)
auth_info = self._parse_json(webpage, video_id)
- self._api_url_template = self.http_scheme() + '//apiv2.vevo.com/%s?token=' + auth_info['access_token']
+ self._api_url_template = self.http_scheme() + '//apiv2.vevo.com/%s?token=' + auth_info['legacy_token']
def _call_api(self, path, *args, **kwargs):
try:
diff --git a/youtube_dl/extractor/vice.py b/youtube_dl/extractor/vice.py
index f0a7fd739..54e207b39 100644
--- a/youtube_dl/extractor/vice.py
+++ b/youtube_dl/extractor/vice.py
@@ -20,7 +20,7 @@ from ..utils import (
class ViceBaseIE(AdobePassIE):
- def _extract_preplay_video(self, url, webpage):
+ def _extract_preplay_video(self, url, locale, webpage):
watch_hub_data = extract_attributes(self._search_regex(
r'(?s)(<watch-hub\s*.+?</watch-hub>)', webpage, 'watch hub'))
video_id = watch_hub_data['vms-id']
@@ -32,7 +32,8 @@ class ViceBaseIE(AdobePassIE):
resource = self._get_mvpd_resource(
'VICELAND', title, video_id,
watch_hub_data.get('video-rating'))
- query['tvetoken'] = self._extract_mvpd_auth(url, video_id, 'VICELAND', resource)
+ query['tvetoken'] = self._extract_mvpd_auth(
+ url, video_id, 'VICELAND', resource)
# signature generation algorithm is reverse engineered from signatureGenerator in
# webpack:///../shared/~/vice-player/dist/js/vice-player.js in
@@ -45,11 +46,14 @@ class ViceBaseIE(AdobePassIE):
try:
host = 'www.viceland' if is_locked else self._PREPLAY_HOST
- preplay = self._download_json('https://%s.com/en_us/preplay/%s' % (host, video_id), video_id, query=query)
+ preplay = self._download_json(
+ 'https://%s.com/%s/preplay/%s' % (host, locale, video_id),
+ video_id, query=query)
except ExtractorError as e:
if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400:
error = json.loads(e.cause.read().decode())
- raise ExtractorError('%s said: %s' % (self.IE_NAME, error['details']), expected=True)
+ raise ExtractorError('%s said: %s' % (
+ self.IE_NAME, error['details']), expected=True)
raise
video_data = preplay['video']
@@ -88,41 +92,30 @@ class ViceBaseIE(AdobePassIE):
class ViceIE(ViceBaseIE):
- _VALID_URL = r'https?://(?:.+?\.)?vice\.com/(?:[^/]+/)?videos?/(?P<id>[^/?#&]+)'
+ IE_NAME = 'vice'
+ _VALID_URL = r'https?://(?:.+?\.)?vice\.com/(?:(?P<locale>[^/]+)/)?videos?/(?P<id>[^/?#&]+)'
_TESTS = [{
- 'url': 'http://www.vice.com/video/cowboy-capitalists-part-1',
- 'md5': 'e9d77741f9e42ba583e683cd170660f7',
+ 'url': 'https://news.vice.com/video/experimenting-on-animals-inside-the-monkey-lab',
+ 'md5': '7d3ae2f9ba5f196cdd9f9efd43657ac2',
'info_dict': {
- 'id': '43cW1mYzpia9IlestBjVpd23Yu3afAfp',
+ 'id': 'N2bzkydjraWDGwnt8jAttCF6Y0PDv4Zj',
'ext': 'flv',
- 'title': 'VICE_COWBOYCAPITALISTS_PART01_v1_VICE_WM_1080p.mov',
- 'duration': 725.983,
+ 'title': 'Monkey Labs of Holland',
+ 'description': 'md5:92b3c7dcbfe477f772dd4afa496c9149',
},
'add_ie': ['Ooyala'],
}, {
- 'url': 'http://www.vice.com/video/how-to-hack-a-car',
- 'md5': 'a7ecf64ee4fa19b916c16f4b56184ae2',
- 'info_dict': {
- 'id': '3jstaBeXgAs',
- 'ext': 'mp4',
- 'title': 'How to Hack a Car: Phreaked Out (Episode 2)',
- 'description': 'md5:ee95453f7ff495db8efe14ae8bf56f30',
- 'uploader_id': 'MotherboardTV',
- 'uploader': 'Motherboard',
- 'upload_date': '20140529',
- },
- 'add_ie': ['Youtube'],
- }, {
'url': 'https://video.vice.com/en_us/video/the-signal-from-tolva/5816510690b70e6c5fd39a56',
- 'md5': '',
'info_dict': {
'id': '5816510690b70e6c5fd39a56',
'ext': 'mp4',
'uploader': 'Waypoint',
'title': 'The Signal From Tölva',
+ 'description': 'md5:3927e3c79f9e8094606a2b3c5b5e55d5',
'uploader_id': '57f7d621e05ca860fa9ccaf9',
- 'timestamp': 1477941983938,
+ 'timestamp': 1477941983,
+ 'upload_date': '20161031',
},
'params': {
# m3u8 download
@@ -130,19 +123,31 @@ class ViceIE(ViceBaseIE):
},
'add_ie': ['UplynkPreplay'],
}, {
- 'url': 'https://news.vice.com/video/experimenting-on-animals-inside-the-monkey-lab',
- 'only_matching': True,
- }, {
- 'url': 'http://www.vice.com/ru/video/big-night-out-ibiza-clive-martin-229',
- 'only_matching': True,
+ 'url': 'https://video.vice.com/alps/video/ulfs-wien-beruchtigste-grafitti-crew-part-1/581b12b60a0e1f4c0fb6ea2f',
+ 'info_dict': {
+ 'id': '581b12b60a0e1f4c0fb6ea2f',
+ 'ext': 'mp4',
+ 'title': 'ULFs - Wien berüchtigste Grafitti Crew - Part 1',
+ 'description': '<p>Zwischen Hinterzimmer-Tattoos und U-Bahnschächten erzählen uns die Ulfs, wie es ist, "süchtig nach Sachbeschädigung" zu sein.</p>',
+ 'uploader': 'VICE',
+ 'uploader_id': '57a204088cb727dec794c67b',
+ 'timestamp': 1485368119,
+ 'upload_date': '20170125',
+ 'age_limit': 14,
+ },
+ 'params': {
+ # AES-encrypted m3u8
+ 'skip_download': True,
+ },
+ 'add_ie': ['UplynkPreplay'],
}, {
- 'url': 'https://munchies.vice.com/en/videos/watch-the-trailer-for-our-new-series-the-pizza-show',
+ 'url': 'https://video.vice.com/en_us/video/pizza-show-trailer/56d8c9a54d286ed92f7f30e4',
'only_matching': True,
}]
_PREPLAY_HOST = 'video.vice'
def _real_extract(self, url):
- video_id = self._match_id(url)
+ locale, video_id = re.match(self._VALID_URL, url).groups()
webpage, urlh = self._download_webpage_handle(url, video_id)
embed_code = self._search_regex(
r'embedCode=([^&\'"]+)', webpage,
@@ -153,10 +158,11 @@ class ViceIE(ViceBaseIE):
r'data-youtube-id="([^"]+)"', webpage, 'youtube id', default=None)
if youtube_id:
return self.url_result(youtube_id, 'Youtube')
- return self._extract_preplay_video(urlh.geturl(), webpage)
+ return self._extract_preplay_video(urlh.geturl(), locale, webpage)
class ViceShowIE(InfoExtractor):
+ IE_NAME = 'vice:show'
_VALID_URL = r'https?://(?:.+?\.)?vice\.com/(?:[^/]+/)?show/(?P<id>[^/?#&]+)'
_TEST = {
@@ -183,6 +189,86 @@ class ViceShowIE(InfoExtractor):
r'<title>(.+?)</title>', webpage, 'title', default=None)
if title:
title = re.sub(r'(.+)\s*\|\s*.+$', r'\1', title).strip()
- description = self._html_search_meta('description', webpage, 'description')
+ description = self._html_search_meta(
+ 'description', webpage, 'description')
return self.playlist_result(entries, show_id, title, description)
+
+
+class ViceArticleIE(InfoExtractor):
+ IE_NAME = 'vice:article'
+ _VALID_URL = r'https://www.vice.com/[^/]+/article/(?P<id>[^?#]+)'
+
+ _TESTS = [{
+ 'url': 'https://www.vice.com/en_us/article/on-set-with-the-woman-making-mormon-porn-in-utah',
+ 'info_dict': {
+ 'id': '58dc0a3dee202d2a0ccfcbd8',
+ 'ext': 'mp4',
+ 'title': 'Mormon War on Porn ',
+ 'description': 'md5:ad396a2481e7f8afb5ed486878421090',
+ 'uploader': 'VICE',
+ 'uploader_id': '57a204088cb727dec794c693',
+ 'timestamp': 1489160690,
+ 'upload_date': '20170310',
+ },
+ 'params': {
+ # AES-encrypted m3u8
+ 'skip_download': True,
+ },
+ 'add_ie': ['UplynkPreplay'],
+ }, {
+ 'url': 'https://www.vice.com/en_us/article/how-to-hack-a-car',
+ 'md5': 'a7ecf64ee4fa19b916c16f4b56184ae2',
+ 'info_dict': {
+ 'id': '3jstaBeXgAs',
+ 'ext': 'mp4',
+ 'title': 'How to Hack a Car: Phreaked Out (Episode 2)',
+ 'description': 'md5:ee95453f7ff495db8efe14ae8bf56f30',
+ 'uploader_id': 'MotherboardTV',
+ 'uploader': 'Motherboard',
+ 'upload_date': '20140529',
+ },
+ 'add_ie': ['Youtube'],
+ }, {
+ 'url': 'https://www.vice.com/en_us/article/cowboy-capitalists-part-1',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.vice.com/ru/article/big-night-out-ibiza-clive-martin-229',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, display_id)
+
+ prefetch_data = self._parse_json(self._search_regex(
+ r'window\.__PREFETCH_DATA\s*=\s*({.*});',
+ webpage, 'prefetch data'), display_id)
+ body = prefetch_data['body']
+
+ def _url_res(video_url, ie_key):
+ return {
+ '_type': 'url_transparent',
+ 'url': video_url,
+ 'display_id': display_id,
+ 'ie_key': ie_key,
+ }
+
+ embed_code = self._search_regex(
+ r'embedCode=([^&\'"]+)', body,
+ 'ooyala embed code', default=None)
+ if embed_code:
+ return _url_res('ooyala:%s' % embed_code, 'Ooyala')
+
+ youtube_url = self._html_search_regex(
+ r'<iframe[^>]+src="(.*youtube\.com/.*)"',
+ body, 'YouTube URL', default=None)
+ if youtube_url:
+ return _url_res(youtube_url, 'Youtube')
+
+ video_url = self._html_search_regex(
+ r'data-video-url="([^"]+)"',
+ prefetch_data['embed_code'], 'video URL')
+
+ return _url_res(video_url, ViceIE.ie_key())
diff --git a/youtube_dl/extractor/viceland.py b/youtube_dl/extractor/viceland.py
index 87f9216b5..bd60235c8 100644
--- a/youtube_dl/extractor/viceland.py
+++ b/youtube_dl/extractor/viceland.py
@@ -1,11 +1,13 @@
# coding: utf-8
from __future__ import unicode_literals
+import re
+
from .vice import ViceBaseIE
class VicelandIE(ViceBaseIE):
- _VALID_URL = r'https?://(?:www\.)?viceland\.com/[^/]+/video/[^/]+/(?P<id>[a-f0-9]+)'
+ _VALID_URL = r'https?://(?:www\.)?viceland\.com/(?P<locale>[^/]+)/video/[^/]+/(?P<id>[a-f0-9]+)'
_TEST = {
'url': 'https://www.viceland.com/en_us/video/trapped/588a70d0dba8a16007de7316',
'info_dict': {
@@ -24,10 +26,13 @@ class VicelandIE(ViceBaseIE):
'skip_download': True,
},
'add_ie': ['UplynkPreplay'],
+ 'skip': '404',
}
_PREPLAY_HOST = 'www.viceland'
def _real_extract(self, url):
- video_id = self._match_id(url)
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+ locale = mobj.group('locale')
webpage = self._download_webpage(url, video_id)
- return self._extract_preplay_video(url, webpage)
+ return self._extract_preplay_video(url, locale, webpage)
diff --git a/youtube_dl/extractor/videopress.py b/youtube_dl/extractor/videopress.py
index 049db25a5..e5f964d39 100644
--- a/youtube_dl/extractor/videopress.py
+++ b/youtube_dl/extractor/videopress.py
@@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import random
import re
from .common import InfoExtractor
@@ -11,6 +10,7 @@ from ..utils import (
float_or_none,
parse_age_limit,
qualities,
+ random_birthday,
try_get,
unified_timestamp,
urljoin,
@@ -47,13 +47,10 @@ class VideoPressIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
+ query = random_birthday('birth_year', 'birth_month', 'birth_day')
video = self._download_json(
'https://public-api.wordpress.com/rest/v1.1/videos/%s' % video_id,
- video_id, query={
- 'birth_month': random.randint(1, 12),
- 'birth_day': random.randint(1, 31),
- 'birth_year': random.randint(1950, 1995),
- })
+ video_id, query=query)
title = video['title']
diff --git a/youtube_dl/extractor/vier.py b/youtube_dl/extractor/vier.py
index 5ef7635b6..3e67eb8c2 100644
--- a/youtube_dl/extractor/vier.py
+++ b/youtube_dl/extractor/vier.py
@@ -5,24 +5,30 @@ import re
import itertools
from .common import InfoExtractor
+from ..utils import (
+ urlencode_postdata,
+ int_or_none,
+ unified_strdate,
+)
class VierIE(InfoExtractor):
IE_NAME = 'vier'
IE_DESC = 'vier.be and vijf.be'
_VALID_URL = r'https?://(?:www\.)?(?P<site>vier|vijf)\.be/(?:[^/]+/videos/(?P<display_id>[^/]+)(?:/(?P<id>\d+))?|video/v3/embed/(?P<embed_id>\d+))'
+ _NETRC_MACHINE = 'vier'
_TESTS = [{
'url': 'http://www.vier.be/planb/videos/het-wordt-warm-de-moestuin/16129',
+ 'md5': 'e4ae2054a6b040ef1e289e20d111b46e',
'info_dict': {
'id': '16129',
'display_id': 'het-wordt-warm-de-moestuin',
'ext': 'mp4',
'title': 'Het wordt warm in De Moestuin',
'description': 'De vele uren werk eisen hun tol. Wim droomt van assistentie...',
- },
- 'params': {
- # m3u8 download
- 'skip_download': True,
+ 'upload_date': '20121025',
+ 'series': 'Plan B',
+ 'tags': ['De Moestuin', 'Moestuin', 'meisjes', 'Tomaat', 'Wim', 'Droom'],
},
}, {
'url': 'http://www.vijf.be/temptationisland/videos/zo-grappig-temptation-island-hosts-moeten-kiezen-tussen-onmogelijke-dilemmas/2561614',
@@ -30,32 +36,103 @@ class VierIE(InfoExtractor):
'id': '2561614',
'display_id': 'zo-grappig-temptation-island-hosts-moeten-kiezen-tussen-onmogelijke-dilemmas',
'ext': 'mp4',
- 'title': 'ZO grappig: Temptation Island hosts moeten kiezen tussen onmogelijke dilemma\'s',
- 'description': 'Het spel is simpel: Annelien Coorevits en Rick Brandsteder krijgen telkens 2 dilemma\'s voorgeschoteld en ze MOETEN een keuze maken.',
+ 'title': 'md5:84f45fe48b8c1fa296a7f6d208d080a7',
+ 'description': 'md5:0356d4981e58b8cbee19355cbd51a8fe',
+ 'upload_date': '20170228',
+ 'series': 'Temptation Island',
+ 'tags': list,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'http://www.vier.be/janigaat/videos/jani-gaat-naar-tokio-aflevering-4/2674839',
+ 'info_dict': {
+ 'id': '2674839',
+ 'display_id': 'jani-gaat-naar-tokio-aflevering-4',
+ 'ext': 'mp4',
+ 'title': 'Jani gaat naar Tokio - Aflevering 4',
+ 'description': 'md5:aa8d611541db6ae9e863125704511f88',
+ 'upload_date': '20170501',
+ 'series': 'Jani gaat',
+ 'episode_number': 4,
+ 'tags': ['Jani Gaat', 'Volledige Aflevering'],
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ 'skip': 'Requires account credentials',
+ }, {
+ # Requires account credentials but bypassed extraction via v3/embed page
+ # without metadata
+ 'url': 'http://www.vier.be/janigaat/videos/jani-gaat-naar-tokio-aflevering-4/2674839',
+ 'info_dict': {
+ 'id': '2674839',
+ 'display_id': 'jani-gaat-naar-tokio-aflevering-4',
+ 'ext': 'mp4',
+ 'title': 'jani-gaat-naar-tokio-aflevering-4',
},
'params': {
- # m3u8 download
'skip_download': True,
},
+ 'expected_warnings': ['Log in to extract metadata'],
}, {
- 'url': 'http://www.vier.be/planb/videos/mieren-herders-van-de-bladluizen',
+ # Without video id in URL
+ 'url': 'http://www.vier.be/planb/videos/dit-najaar-plan-b',
'only_matching': True,
}, {
'url': 'http://www.vier.be/video/v3/embed/16129',
'only_matching': True,
}]
+ def _real_initialize(self):
+ self._logged_in = False
+
+ def _login(self, site):
+ username, password = self._get_login_info()
+ if username is None or password is None:
+ return
+
+ login_page = self._download_webpage(
+ 'http://www.%s.be/user/login' % site,
+ None, note='Logging in', errnote='Unable to log in',
+ data=urlencode_postdata({
+ 'form_id': 'user_login',
+ 'name': username,
+ 'pass': password,
+ }),
+ headers={'Content-Type': 'application/x-www-form-urlencoded'})
+
+ login_error = self._html_search_regex(
+ r'(?s)<div class="messages error">\s*<div>\s*<h2.+?</h2>(.+?)<',
+ login_page, 'login error', default=None)
+ if login_error:
+ self.report_warning('Unable to log in: %s' % login_error)
+ else:
+ self._logged_in = True
+
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
embed_id = mobj.group('embed_id')
display_id = mobj.group('display_id') or embed_id
+ video_id = mobj.group('id') or embed_id
site = mobj.group('site')
+ if not self._logged_in:
+ self._login(site)
+
webpage = self._download_webpage(url, display_id)
+ if r'id="user-login"' in webpage:
+ self.report_warning(
+ 'Log in to extract metadata', video_id=display_id)
+ webpage = self._download_webpage(
+ 'http://www.%s.be/video/v3/embed/%s' % (site, video_id),
+ display_id)
+
video_id = self._search_regex(
[r'data-nid="(\d+)"', r'"nid"\s*:\s*"(\d+)"'],
- webpage, 'video id')
+ webpage, 'video id', default=video_id or display_id)
application = self._search_regex(
[r'data-application="([^"]+)"', r'"application"\s*:\s*"([^"]+)"'],
webpage, 'application', default=site + '_vod')
@@ -64,12 +141,25 @@ class VierIE(InfoExtractor):
webpage, 'filename')
playlist_url = 'http://vod.streamcloud.be/%s/_definst_/mp4:%s.mp4/playlist.m3u8' % (application, filename)
- formats = self._extract_wowza_formats(playlist_url, display_id, skip_protocols=['dash'])
+ formats = self._extract_wowza_formats(
+ playlist_url, display_id, skip_protocols=['dash'])
self._sort_formats(formats)
title = self._og_search_title(webpage, default=display_id)
- description = self._og_search_description(webpage, default=None)
+ description = self._html_search_regex(
+ r'(?s)<div\b[^>]+\bclass=(["\'])[^>]*?\bfield-type-text-with-summary\b[^>]*?\1[^>]*>.*?<p>(?P<value>.+?)</p>',
+ webpage, 'description', default=None, group='value')
thumbnail = self._og_search_thumbnail(webpage, default=None)
+ upload_date = unified_strdate(self._html_search_regex(
+ r'(?s)<div\b[^>]+\bclass=(["\'])[^>]*?\bfield-name-post-date\b[^>]*?\1[^>]*>.*?(?P<value>\d{2}/\d{2}/\d{4})',
+ webpage, 'upload date', default=None, group='value'))
+
+ series = self._search_regex(
+ r'data-program=(["\'])(?P<value>(?:(?!\1).)+)\1', webpage,
+ 'series', default=None, group='value')
+ episode_number = int_or_none(self._search_regex(
+ r'(?i)aflevering (\d+)', title, 'episode number', default=None))
+ tags = re.findall(r'<a\b[^>]+\bhref=["\']/tags/[^>]+>([^<]+)<', webpage)
return {
'id': video_id,
@@ -77,6 +167,10 @@ class VierIE(InfoExtractor):
'title': title,
'description': description,
'thumbnail': thumbnail,
+ 'upload_date': upload_date,
+ 'series': series,
+ 'episode_number': episode_number,
+ 'tags': tags,
'formats': formats,
}
diff --git a/youtube_dl/extractor/viewster.py b/youtube_dl/extractor/viewster.py
index fcf0cb100..d5d5b4c69 100644
--- a/youtube_dl/extractor/viewster.py
+++ b/youtube_dl/extractor/viewster.py
@@ -176,8 +176,7 @@ class ViewsterIE(InfoExtractor):
if m3u8_formats:
self._sort_formats(m3u8_formats)
m3u8_formats = list(filter(
- lambda f: f.get('vcodec') != 'none' and f.get('resolution') != 'multiple',
- m3u8_formats))
+ lambda f: f.get('vcodec') != 'none', m3u8_formats))
if len(qualities) == len(m3u8_formats):
for q, m3u8_format in zip(qualities, m3u8_formats):
f = m3u8_format.copy()
diff --git a/youtube_dl/extractor/vrv.py b/youtube_dl/extractor/vrv.py
index 487047fd7..9959627c0 100644
--- a/youtube_dl/extractor/vrv.py
+++ b/youtube_dl/extractor/vrv.py
@@ -112,21 +112,41 @@ class VRVIE(VRVBaseIE):
audio_locale = streams_json.get('audio_locale')
formats = []
- for stream_id, stream in streams_json.get('streams', {}).get('adaptive_hls', {}).items():
- stream_url = stream.get('url')
- if not stream_url:
- continue
- stream_id = stream_id or audio_locale
- m3u8_formats = self._extract_m3u8_formats(
- stream_url, video_id, 'mp4', m3u8_id=stream_id,
- note='Downloading %s m3u8 information' % stream_id,
- fatal=False)
- if audio_locale:
- for f in m3u8_formats:
- f['language'] = audio_locale
- formats.extend(m3u8_formats)
+ for stream_type, streams in streams_json.get('streams', {}).items():
+ if stream_type in ('adaptive_hls', 'adaptive_dash'):
+ for stream in streams.values():
+ stream_url = stream.get('url')
+ if not stream_url:
+ continue
+ stream_id = stream.get('hardsub_locale') or audio_locale
+ format_id = '%s-%s' % (stream_type.split('_')[1], stream_id)
+ if stream_type == 'adaptive_hls':
+ adaptive_formats = self._extract_m3u8_formats(
+ stream_url, video_id, 'mp4', m3u8_id=format_id,
+ note='Downloading %s m3u8 information' % stream_id,
+ fatal=False)
+ else:
+ adaptive_formats = self._extract_mpd_formats(
+ stream_url, video_id, mpd_id=format_id,
+ note='Downloading %s MPD information' % stream_id,
+ fatal=False)
+ if audio_locale:
+ for f in adaptive_formats:
+ if f.get('acodec') != 'none':
+ f['language'] = audio_locale
+ formats.extend(adaptive_formats)
self._sort_formats(formats)
+ subtitles = {}
+ for subtitle in streams_json.get('subtitles', {}).values():
+ subtitle_url = subtitle.get('url')
+ if not subtitle_url:
+ continue
+ subtitles.setdefault(subtitle.get('locale', 'en-US'), []).append({
+ 'url': subtitle_url,
+ 'ext': subtitle.get('format', 'ass'),
+ })
+
thumbnails = []
for thumbnail in video_data.get('images', {}).get('thumbnails', []):
thumbnail_url = thumbnail.get('source')
@@ -142,6 +162,7 @@ class VRVIE(VRVBaseIE):
'id': video_id,
'title': title,
'formats': formats,
+ 'subtitles': subtitles,
'thumbnails': thumbnails,
'description': video_data.get('description'),
'duration': float_or_none(video_data.get('duration_ms'), 1000),
diff --git a/youtube_dl/extractor/washingtonpost.py b/youtube_dl/extractor/washingtonpost.py
index 839cad986..625d0a1cc 100644
--- a/youtube_dl/extractor/washingtonpost.py
+++ b/youtube_dl/extractor/washingtonpost.py
@@ -13,6 +13,7 @@ from ..utils import (
class WashingtonPostIE(InfoExtractor):
IE_NAME = 'washingtonpost'
_VALID_URL = r'(?:washingtonpost:|https?://(?:www\.)?washingtonpost\.com/video/(?:[^/]+/)*)(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'
+ _EMBED_URL = r'https?://(?:www\.)?washingtonpost\.com/video/c/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}'
_TEST = {
'url': 'https://www.washingtonpost.com/video/c/video/480ba4ee-1ec7-11e6-82c2-a7dcb313287d',
'md5': '6f537e1334b714eb15f9563bd4b9cdfa',
@@ -27,6 +28,11 @@ class WashingtonPostIE(InfoExtractor):
},
}
+ @classmethod
+ def _extract_urls(cls, webpage):
+ return re.findall(
+ r'<iframe[^>]+\bsrc=["\'](%s)' % cls._EMBED_URL, webpage)
+
def _real_extract(self, url):
video_id = self._match_id(url)
video_data = self._download_json(
diff --git a/youtube_dl/extractor/wistia.py b/youtube_dl/extractor/wistia.py
index c634b8dec..2182d6fd4 100644
--- a/youtube_dl/extractor/wistia.py
+++ b/youtube_dl/extractor/wistia.py
@@ -1,10 +1,13 @@
from __future__ import unicode_literals
+import re
+
from .common import InfoExtractor
from ..utils import (
ExtractorError,
int_or_none,
float_or_none,
+ unescapeHTML,
)
@@ -34,6 +37,25 @@ class WistiaIE(InfoExtractor):
'only_matching': True,
}]
+ @staticmethod
+ def _extract_url(webpage):
+ match = re.search(
+ r'<(?:meta[^>]+?content|iframe[^>]+?src)=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage)
+ if match:
+ return unescapeHTML(match.group('url'))
+
+ match = re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage)
+ if match:
+ return 'wistia:%s' % match.group('id')
+
+ match = re.search(
+ r'''(?sx)
+ <script[^>]+src=(["'])(?:https?:)?//fast\.wistia\.com/assets/external/E-v1\.js\1[^>]*>.*?
+ <div[^>]+class=(["']).*?\bwistia_async_(?P<id>[a-z0-9]+)\b.*?\2
+ ''', webpage)
+ if match:
+ return 'wistia:%s' % match.group('id')
+
def _real_extract(self, url):
video_id = self._match_id(url)
diff --git a/youtube_dl/extractor/xtube.py b/youtube_dl/extractor/xtube.py
index 5584674a0..bea9b87ad 100644
--- a/youtube_dl/extractor/xtube.py
+++ b/youtube_dl/extractor/xtube.py
@@ -6,6 +6,7 @@ import re
from .common import InfoExtractor
from ..utils import (
int_or_none,
+ js_to_json,
orderedSet,
parse_duration,
sanitized_Request,
@@ -38,6 +39,22 @@ class XTubeIE(InfoExtractor):
'age_limit': 18,
}
}, {
+ # FLV videos with duplicated formats
+ 'url': 'http://www.xtube.com/video-watch/A-Super-Run-Part-1-YT-9299752',
+ 'md5': 'a406963eb349dd43692ec54631efd88b',
+ 'info_dict': {
+ 'id': '9299752',
+ 'display_id': 'A-Super-Run-Part-1-YT',
+ 'ext': 'flv',
+ 'title': 'A Super Run - Part 1 (YT)',
+ 'description': 'md5:ca0d47afff4a9b2942e4b41aa970fd93',
+ 'uploader': 'tshirtguy59',
+ 'duration': 579,
+ 'view_count': int,
+ 'comment_count': int,
+ 'age_limit': 18,
+ },
+ }, {
# new URL schema
'url': 'http://www.xtube.com/video-watch/strange-erotica-625837',
'only_matching': True,
@@ -68,8 +85,9 @@ class XTubeIE(InfoExtractor):
})
sources = self._parse_json(self._search_regex(
- r'(["\'])sources\1\s*:\s*(?P<sources>{.+?}),',
- webpage, 'sources', group='sources'), video_id)
+ r'(["\'])?sources\1?\s*:\s*(?P<sources>{.+?}),',
+ webpage, 'sources', group='sources'), video_id,
+ transform_source=js_to_json)
formats = []
for format_id, format_url in sources.items():
@@ -78,6 +96,7 @@ class XTubeIE(InfoExtractor):
'format_id': format_id,
'height': int_or_none(format_id),
})
+ self._remove_duplicate_formats(formats)
self._sort_formats(formats)
title = self._search_regex(
diff --git a/youtube_dl/extractor/xvideos.py b/youtube_dl/extractor/xvideos.py
index 30825daae..eca603028 100644
--- a/youtube_dl/extractor/xvideos.py
+++ b/youtube_dl/extractor/xvideos.py
@@ -6,8 +6,10 @@ from .common import InfoExtractor
from ..compat import compat_urllib_parse_unquote
from ..utils import (
clean_html,
- ExtractorError,
determine_ext,
+ ExtractorError,
+ int_or_none,
+ parse_duration,
)
@@ -20,6 +22,7 @@ class XVideosIE(InfoExtractor):
'id': '4588838',
'ext': 'mp4',
'title': 'Biker Takes his Girl',
+ 'duration': 108,
'age_limit': 18,
}
}
@@ -36,6 +39,11 @@ class XVideosIE(InfoExtractor):
r'<title>(.*?)\s+-\s+XVID', webpage, 'title')
video_thumbnail = self._search_regex(
r'url_bigthumb=(.+?)&amp', webpage, 'thumbnail', fatal=False)
+ video_duration = int_or_none(self._og_search_property(
+ 'duration', webpage, default=None)) or parse_duration(
+ self._search_regex(
+ r'<span[^>]+class=["\']duration["\'][^>]*>.*?(\d[^<]+)',
+ webpage, 'duration', fatal=False))
formats = []
@@ -67,6 +75,7 @@ class XVideosIE(InfoExtractor):
'id': video_id,
'formats': formats,
'title': video_title,
+ 'duration': video_duration,
'thumbnail': video_thumbnail,
'age_limit': 18,
}
diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py
index fd6268ba4..eb1062142 100644
--- a/youtube_dl/extractor/yandexmusic.py
+++ b/youtube_dl/extractor/yandexmusic.py
@@ -234,7 +234,8 @@ class YandexMusicPlaylistIE(YandexMusicPlaylistBaseIE):
'overembed': 'false',
})['playlist']
- tracks, track_ids = playlist['tracks'], map(compat_str, playlist['trackIds'])
+ tracks = playlist['tracks']
+ track_ids = [compat_str(track_id) for track_id in playlist['trackIds']]
# tracks dictionary shipped with playlist.jsx API is limited to 150 tracks,
# missing tracks should be retrieved manually.
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index 9e2b9115c..d66693c0c 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -38,7 +38,6 @@ from ..utils import (
parse_duration,
remove_quotes,
remove_start,
- sanitized_Request,
smuggle_url,
str_to_int,
try_get,
@@ -54,7 +53,11 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
"""Provide base functions for Youtube extractors"""
_LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
_TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'
- _PASSWORD_CHALLENGE_URL = 'https://accounts.google.com/signin/challenge/sl/password'
+
+ _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup'
+ _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge'
+ _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}'
+
_NETRC_MACHINE = 'youtube'
# If True it will raise an error if no login info is provided
_LOGIN_REQUIRED = False
@@ -96,72 +99,150 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
login_form = self._hidden_inputs(login_page)
- login_form.update({
- 'checkConnection': 'youtube',
- 'Email': username,
- 'Passwd': password,
- })
+ def req(url, f_req, note, errnote):
+ data = login_form.copy()
+ data.update({
+ 'pstMsg': 1,
+ 'checkConnection': 'youtube',
+ 'checkedDomains': 'youtube',
+ 'hl': 'en',
+ 'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]',
+ 'f.req': json.dumps(f_req),
+ 'flowName': 'GlifWebSignIn',
+ 'flowEntry': 'ServiceLogin',
+ })
+ return self._download_json(
+ url, None, note=note, errnote=errnote,
+ transform_source=lambda s: re.sub(r'^[^[]*', '', s),
+ fatal=False,
+ data=urlencode_postdata(data), headers={
+ 'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8',
+ 'Google-Accounts-XSRF': 1,
+ })
- login_results = self._download_webpage(
- self._PASSWORD_CHALLENGE_URL, None,
- note='Logging in', errnote='unable to log in', fatal=False,
- data=urlencode_postdata(login_form))
- if login_results is False:
- return False
+ def warn(message):
+ self._downloader.report_warning(message)
+
+ lookup_req = [
+ username,
+ None, [], None, 'US', None, None, 2, False, True,
+ [
+ None, None,
+ [2, 1, None, 1,
+ 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn',
+ None, [], 4],
+ 1, [None, None, []], None, None, None, True
+ ],
+ username,
+ ]
- error_msg = self._html_search_regex(
- r'<[^>]+id="errormsg_0_Passwd"[^>]*>([^<]+)<',
- login_results, 'error message', default=None)
- if error_msg:
- raise ExtractorError('Unable to login: %s' % error_msg, expected=True)
+ lookup_results = req(
+ self._LOOKUP_URL, lookup_req,
+ 'Looking up account info', 'Unable to look up account info')
- if re.search(r'id="errormsg_0_Passwd"', login_results) is not None:
- raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True)
+ if lookup_results is False:
+ return False
- # Two-Factor
- # TODO add SMS and phone call support - these require making a request and then prompting the user
+ user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str)
+ if not user_hash:
+ warn('Unable to extract user hash')
+ return False
- if re.search(r'(?i)<form[^>]+id="challenge"', login_results) is not None:
- tfa_code = self._get_tfa_info('2-step verification code')
+ challenge_req = [
+ user_hash,
+ None, 1, None, [1, None, None, None, [password, None, True]],
+ [
+ None, None, [2, 1, None, 1, 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', None, [], 4],
+ 1, [None, None, []], None, None, None, True
+ ]]
- if not tfa_code:
- self._downloader.report_warning(
- 'Two-factor authentication required. Provide it either interactively or with --twofactor <code>'
- '(Note that only TOTP (Google Authenticator App) codes work at this time.)')
- return False
+ challenge_results = req(
+ self._CHALLENGE_URL, challenge_req,
+ 'Logging in', 'Unable to log in')
- tfa_code = remove_start(tfa_code, 'G-')
+ if challenge_results is False:
+ return
- tfa_form_strs = self._form_hidden_inputs('challenge', login_results)
+ login_res = try_get(challenge_results, lambda x: x[0][5], list)
+ if login_res:
+ login_msg = try_get(login_res, lambda x: x[5], compat_str)
+ warn(
+ 'Unable to login: %s' % 'Invalid password'
+ if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg)
+ return False
- tfa_form_strs.update({
- 'Pin': tfa_code,
- 'TrustDevice': 'on',
- })
+ res = try_get(challenge_results, lambda x: x[0][-1], list)
+ if not res:
+ warn('Unable to extract result entry')
+ return False
- tfa_data = urlencode_postdata(tfa_form_strs)
+ tfa = try_get(res, lambda x: x[0][0], list)
+ if tfa:
+ tfa_str = try_get(tfa, lambda x: x[2], compat_str)
+ if tfa_str == 'TWO_STEP_VERIFICATION':
+ # SEND_SUCCESS - TFA code has been successfully sent to phone
+ # QUOTA_EXCEEDED - reached the limit of TFA codes
+ status = try_get(tfa, lambda x: x[5], compat_str)
+ if status == 'QUOTA_EXCEEDED':
+ warn('Exceeded the limit of TFA codes, try later')
+ return False
+
+ tl = try_get(challenge_results, lambda x: x[1][2], compat_str)
+ if not tl:
+ warn('Unable to extract TL')
+ return False
+
+ tfa_code = self._get_tfa_info('2-step verification code')
+
+ if not tfa_code:
+ warn(
+ 'Two-factor authentication required. Provide it either interactively or with --twofactor <code>'
+ '(Note that only TOTP (Google Authenticator App) codes work at this time.)')
+ return False
+
+ tfa_code = remove_start(tfa_code, 'G-')
+
+ tfa_req = [
+ user_hash, None, 2, None,
+ [
+ 9, None, None, None, None, None, None, None,
+ [None, tfa_code, True, 2]
+ ]]
+
+ tfa_results = req(
+ self._TFA_URL.format(tl), tfa_req,
+ 'Submitting TFA code', 'Unable to submit TFA code')
+
+ if tfa_results is False:
+ return False
+
+ tfa_res = try_get(tfa_results, lambda x: x[0][5], list)
+ if tfa_res:
+ tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str)
+ warn(
+ 'Unable to finish TFA: %s' % 'Invalid TFA code'
+ if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg)
+ return False
+
+ check_cookie_url = try_get(
+ tfa_results, lambda x: x[0][-1][2], compat_str)
+ else:
+ check_cookie_url = try_get(res, lambda x: x[2], compat_str)
- tfa_req = sanitized_Request(self._TWOFACTOR_URL, tfa_data)
- tfa_results = self._download_webpage(
- tfa_req, None,
- note='Submitting TFA code', errnote='unable to submit tfa', fatal=False)
+ if not check_cookie_url:
+ warn('Unable to extract CheckCookie URL')
+ return False
- if tfa_results is False:
- return False
+ check_cookie_results = self._download_webpage(
+ check_cookie_url, None, 'Checking cookie', fatal=False)
- if re.search(r'(?i)<form[^>]+id="challenge"', tfa_results) is not None:
- self._downloader.report_warning('Two-factor code expired or invalid. Please try again, or use a one-use backup code instead.')
- return False
- if re.search(r'(?i)<form[^>]+id="gaia_loginform"', tfa_results) is not None:
- self._downloader.report_warning('unable to log in - did the page structure change?')
- return False
- if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:
- self._downloader.report_warning('Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')
- return False
+ if check_cookie_results is False:
+ return False
- if re.search(r'(?i)<form[^>]+id="gaia_loginform"', login_results) is not None:
- self._downloader.report_warning('unable to log in: bad username or password')
+ if 'https://myaccount.google.com/' not in check_cookie_results:
+ warn('Unable to log in')
return False
+
return True
def _real_initialize(self):
@@ -963,7 +1044,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
def _extract_signature_function(self, video_id, player_url, example_sig):
id_m = re.match(
- r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?|/base)?\.(?P<ext>[a-z]+)$',
+ r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?|(?:/[a-z]{2}_[A-Z]{2})?/base)?\.(?P<ext>[a-z]+)$',
player_url)
if not id_m:
raise ExtractorError('Cannot identify player %r' % player_url)
@@ -1257,6 +1338,35 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
return self._download_webpage(url, video_id, note='Searching for annotations.', errnote='Unable to download video annotations.')
+    @staticmethod
+    def _extract_chapters(description, duration):
+        """Build a chapter list from seek links in an HTML video description.
+
+        Every description line (delimited by <br /> or the string ends) that
+        contains a yt.www.watch.player.seekTo link with a [H:]MM:SS label is
+        treated as a chapter marker. A chapter runs from its own time point
+        to the next marker's time point; the last one runs to duration.
+
+        description: HTML of the video description (may be None/empty).
+        duration: video length in seconds, or None when unknown.
+
+        Returns None when there is no description or no marker line,
+        otherwise a list of dicts with 'start_time', 'end_time' and 'title'.
+        Markers whose time cannot be parsed, that lie beyond the video
+        duration or that would end before they start are left out.
+        """
+        if not description:
+            return None
+        chapter_lines = re.findall(
+            r'(?:^|<br\s*/>)([^<]*<a[^>]+onclick=["\']yt\.www\.watch\.player\.seekTo[^>]+>(\d{1,2}:\d{1,2}(?::\d{1,2})?)</a>[^>]*)(?=$|<br\s*/>)',
+            description)
+        if not chapter_lines:
+            return None
+        chapters = []
+        # Enumerating from 1 makes next_num the list index of the *next*
+        # marker, whose time point is this chapter's end.
+        for next_num, (chapter_line, time_point) in enumerate(
+                chapter_lines, start=1):
+            start_time = parse_duration(time_point)
+            if start_time is None:
+                continue
+            # A time point past the end of the video cannot start a chapter.
+            if duration is not None and start_time > duration:
+                continue
+            end_time = (duration if next_num == len(chapter_lines)
+                        else parse_duration(chapter_lines[next_num][1]))
+            if end_time is None:
+                continue
+            # Never let a chapter run past the end of the video.
+            if duration is not None and end_time > duration:
+                end_time = duration
+            # Out-of-order time points would give a negative-length chapter.
+            if start_time > end_time:
+                continue
+            chapter_title = re.sub(
+                r'<a[^>]+>[^<]+</a>', '', chapter_line).strip(' \t-')
+            chapter_title = re.sub(r'\s+', ' ', chapter_title)
+            chapters.append({
+                'start_time': start_time,
+                'end_time': end_time,
+                'title': chapter_title,
+            })
+        return chapters
+
def _real_extract(self, url):
url, smuggled_data = unsmuggle_url(url, {})
@@ -1325,6 +1435,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
else:
age_gate = False
video_info = None
+ sts = None
# Try looking directly into the video webpage
ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
if ytplayer_config:
@@ -1341,6 +1452,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid'])
if args.get('livestream') == '1' or args.get('live_playback') == 1:
is_live = True
+ sts = ytplayer_config.get('sts')
if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
# We also try looking in get_video_info since it may contain different dashmpd
# URL that points to a DASH manifest with possibly different itag set (some itags
@@ -1349,14 +1461,25 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
# The general idea is to take a union of itags of both DASH manifests (for example
# video with such 'manifest behavior' see https://github.com/rg3/youtube-dl/issues/6093)
self.report_video_info_webpage_download(video_id)
- for el_type in ['&el=info', '&el=embedded', '&el=detailpage', '&el=vevo', '']:
- video_info_url = (
- '%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
- % (proto, video_id, el_type))
+ for el in ('info', 'embedded', 'detailpage', 'vevo', ''):
+ query = {
+ 'video_id': video_id,
+ 'ps': 'default',
+ 'eurl': '',
+ 'gl': 'US',
+ 'hl': 'en',
+ }
+ if el:
+ query['el'] = el
+ if sts:
+ query['sts'] = sts
video_info_webpage = self._download_webpage(
- video_info_url,
+ '%s://www.youtube.com/get_video_info' % proto,
video_id, note=False,
- errnote='unable to download video info webpage')
+ errnote='unable to download video info webpage',
+ fatal=False, query=query)
+ if not video_info_webpage:
+ continue
get_video_info = compat_parse_qs(video_info_webpage)
if get_video_info.get('use_cipher_signature') != ['True']:
add_dash_mpd(get_video_info)
@@ -1399,9 +1522,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
video_title = '_'
# description
- video_description = get_element_by_id("eow-description", video_webpage)
+ description_original = video_description = get_element_by_id("eow-description", video_webpage)
if video_description:
- video_description = re.sub(r'''(?x)
+ description_original = video_description = re.sub(r'''(?x)
<a\s+
(?:[a-zA-Z-]+="[^"]*"\s+)*?
(?:title|href)="([^"]+)"\s+
@@ -1558,6 +1681,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
if self._downloader.params.get('writeannotations', False):
video_annotations = self._extract_annotations(video_id)
+ chapters = self._extract_chapters(description_original, video_duration)
+
if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
self.report_rtmp_download()
formats = [{
@@ -1629,7 +1754,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
player_desc = 'flash player %s' % player_version
else:
player_version = self._search_regex(
- [r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js', r'(?:www|player)-([^/]+)/base\.js'],
+ [r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js',
+ r'(?:www|player)-([^/]+)(?:/[a-z]{2}_[A-Z]{2})?/base\.js'],
player_url,
'html5 player', fatal=False)
player_desc = 'html5 player %s' % player_version
@@ -1789,6 +1915,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'duration': video_duration,
'age_limit': 18 if age_gate else 0,
'annotations': video_annotations,
+ 'chapters': chapters,
'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
'view_count': view_count,
'like_count': like_count,
diff --git a/youtube_dl/extractor/zaq1.py b/youtube_dl/extractor/zaq1.py
new file mode 100644
index 000000000..889aff5d8
--- /dev/null
+++ b/youtube_dl/extractor/zaq1.py
@@ -0,0 +1,101 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ int_or_none,
+ unified_timestamp,
+)
+
+
+class Zaq1IE(InfoExtractor):
+    """Extractor for single video pages on zaq1.pl."""
+
+    _VALID_URL = r'https?://(?:www\.)?zaq1\.pl/video/(?P<id>[^/?#&]+)'
+    _TESTS = [{
+        'url': 'http://zaq1.pl/video/xev0e',
+        'md5': '24a5eb3f052e604ae597c4d0d19b351e',
+        'info_dict': {
+            'id': 'xev0e',
+            'title': 'DJ NA WESELE. TANIEC Z FIGURAMI.węgrów/sokołów podlaski/siedlce/mińsk mazowiecki/warszawa',
+            'description': 'www.facebook.com/weseledjKontakt: 728 448 199 / 505 419 147',
+            'ext': 'mp4',
+            'duration': 511,
+            'timestamp': 1490896361,
+            'uploader': 'Anonim',
+            'upload_date': '20170330',
+            'view_count': int,
+        }
+    }, {
+        # malformed JSON-LD
+        'url': 'http://zaq1.pl/video/x81vn',
+        'info_dict': {
+            'id': 'x81vn',
+            'title': 'SEKRETNE ŻYCIE WALTERA MITTY',
+            'ext': 'mp4',
+            'duration': 6234,
+            'timestamp': 1493494860,
+            'uploader': 'Anonim',
+            'upload_date': '20170429',
+            'view_count': int,
+        },
+        'params': {
+            'skip_download': True,
+        },
+        'expected_warnings': ['Failed to parse JSON'],
+    }]
+
+    def _real_extract(self, url):
+        """Extract the info dict for the zaq1.pl video page at url.
+
+        The media URL comes from the page's data-video-url attribute.
+        Metadata is taken from the page's JSON-LD when available; each field
+        the JSON-LD did not provide is then filled from data-* attributes or
+        <meta> tags of the page.
+        """
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        video_url = self._search_regex(
+            r'data-video-url=(["\'])(?P<url>(?:(?!\1).)+)\1', webpage,
+            'video url', group='url')
+
+        # Non-fatal: the JSON-LD on some pages is malformed (see second test)
+        info = self._search_json_ld(webpage, video_id, fatal=False)
+
+        def extract_data(field, name, fatal=False):
+            # field is the data-<field> attribute to read, name is the
+            # human-readable label used in warning/error messages.
+            return self._search_regex(
+                r'data-%s=(["\'])(?P<field>(?:(?!\1).)+)\1' % field,
+                webpage, name, fatal=fatal, group='field')
+
+        if not info.get('title'):
+            info['title'] = extract_data('file-name', 'title', fatal=True)
+
+        if not info.get('duration'):
+            info['duration'] = int_or_none(extract_data('duration', 'duration'))
+
+        if not info.get('thumbnail'):
+            info['thumbnail'] = extract_data('photo-url', 'thumbnail')
+
+        if not info.get('timestamp'):
+            info['timestamp'] = unified_timestamp(self._html_search_meta(
+                'uploadDate', webpage, 'timestamp'))
+
+        # Guard on the key that is actually written below, so a view count
+        # already present in info is kept, like the other fallbacks do.
+        if not info.get('view_count'):
+            info['view_count'] = int_or_none(self._html_search_meta(
+                'interactionCount', webpage, 'view count'))
+
+        uploader = self._html_search_regex(
+            r'Wideo dodał:\s*<a[^>]*>([^<]+)</a>', webpage, 'uploader',
+            fatal=False)
+
+        width = int_or_none(self._html_search_meta(
+            'width', webpage, fatal=False))
+        height = int_or_none(self._html_search_meta(
+            'height', webpage, fatal=False))
+
+        info.update({
+            'id': video_id,
+            'formats': [{
+                'url': video_url,
+                'width': width,
+                'height': height,
+                # The page URL is sent as Referer when fetching the media
+                'http_headers': {
+                    'Referer': url,
+                },
+            }],
+            'uploader': uploader,
+        })
+
+        return info
diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py
index 24cdec28c..7bda59610 100644
--- a/youtube_dl/jsinterp.py
+++ b/youtube_dl/jsinterp.py
@@ -6,6 +6,7 @@ import re
from .utils import (
ExtractorError,
+ remove_quotes,
)
_OPERATORS = [
@@ -57,7 +58,6 @@ class JSInterpreter(object):
def interpret_expression(self, expr, local_vars, allow_recursion):
expr = expr.strip()
-
if expr == '': # Empty expression
return None
@@ -121,11 +121,19 @@ class JSInterpreter(object):
pass
m = re.match(
- r'(?P<var>%s)\.(?P<member>[^(]+)(?:\(+(?P<args>[^()]*)\))?$' % _NAME_RE,
+ r'(?P<in>%s)\[(?P<idx>.+)\]$' % _NAME_RE, expr)
+ if m:
+ val = local_vars[m.group('in')]
+ idx = self.interpret_expression(
+ m.group('idx'), local_vars, allow_recursion - 1)
+ return val[idx]
+
+ m = re.match(
+ r'(?P<var>%s)(?:\.(?P<member>[^(]+)|\[(?P<member2>[^]]+)\])\s*(?:\(+(?P<args>[^()]*)\))?$' % _NAME_RE,
expr)
if m:
variable = m.group('var')
- member = m.group('member')
+ member = remove_quotes(m.group('member') or m.group('member2'))
arg_str = m.group('args')
if variable in local_vars:
@@ -173,14 +181,6 @@ class JSInterpreter(object):
return obj[member](argvals)
- m = re.match(
- r'(?P<in>%s)\[(?P<idx>.+)\]$' % _NAME_RE, expr)
- if m:
- val = local_vars[m.group('in')]
- idx = self.interpret_expression(
- m.group('idx'), local_vars, allow_recursion - 1)
- return val[idx]
-
for op, opfunc in _OPERATORS:
m = re.match(r'(?P<x>.+?)%s(?P<y>.+)' % re.escape(op), expr)
if not m:
@@ -211,21 +211,25 @@ class JSInterpreter(object):
raise ExtractorError('Unsupported JS expression %r' % expr)
def extract_object(self, objname):
+ _FUNC_NAME_RE = r'''(?:[a-zA-Z$0-9]+|"[a-zA-Z$0-9]+"|'[a-zA-Z$0-9]+')'''
obj = {}
obj_m = re.search(
- (r'(?<!this\.)%s\s*=\s*\{' % re.escape(objname)) +
- r'\s*(?P<fields>([a-zA-Z$0-9]+\s*:\s*function\(.*?\)\s*\{.*?\}(?:,\s*)?)*)' +
- r'\}\s*;',
+ r'''(?x)
+ (?<!this\.)%s\s*=\s*{\s*
+ (?P<fields>(%s\s*:\s*function\s*\(.*?\)\s*{.*?}(?:,\s*)?)*)
+ }\s*;
+ ''' % (re.escape(objname), _FUNC_NAME_RE),
self.code)
fields = obj_m.group('fields')
# Currently, it only supports function definitions
fields_m = re.finditer(
- r'(?P<key>[a-zA-Z$0-9]+)\s*:\s*function'
- r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
+ r'''(?x)
+ (?P<key>%s)\s*:\s*function\s*\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}
+ ''' % _FUNC_NAME_RE,
fields)
for f in fields_m:
argnames = f.group('args').split(',')
- obj[f.group('key')] = self.build_function(argnames, f.group('code'))
+ obj[remove_quotes(f.group('key'))] = self.build_function(argnames, f.group('code'))
return obj
diff --git a/youtube_dl/options.py b/youtube_dl/options.py
index 52309fb84..3021a6f41 100644
--- a/youtube_dl/options.py
+++ b/youtube_dl/options.py
@@ -814,11 +814,12 @@ def parseOpts(overrideArguments=None):
'--metadata-from-title',
metavar='FORMAT', dest='metafromtitle',
help='Parse additional metadata like song title / artist from the video title. '
- 'The format syntax is the same as --output, '
- 'the parsed parameters replace existing values. '
- 'Additional templates: %(album)s, %(artist)s. '
+ 'The format syntax is the same as --output. Regular expression with '
+ 'named capture groups may also be used. '
+ 'The parsed parameters replace existing values. '
'Example: --metadata-from-title "%(artist)s - %(title)s" matches a title like '
- '"Coldplay - Paradise"')
+ '"Coldplay - Paradise". '
+ 'Example (regex): --metadata-from-title "(?P<artist>.+?) - (?P<title>.+)"')
postproc.add_option(
'--xattrs',
action='store_true', dest='xattrs', default=False,
diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py
index 665109558..c91ec8588 100644
--- a/youtube_dl/postprocessor/ffmpeg.py
+++ b/youtube_dl/postprocessor/ffmpeg.py
@@ -4,6 +4,7 @@ import io
import os
import subprocess
import time
+import re
from .common import AudioConversionError, PostProcessor
@@ -22,6 +23,7 @@ from ..utils import (
subtitles_filename,
dfxp2srt,
ISO639Utils,
+ replace_extension,
)
@@ -429,17 +431,40 @@ class FFmpegMetadataPP(FFmpegPostProcessor):
filename = info['filepath']
temp_filename = prepend_extension(filename, 'temp')
+ in_filenames = [filename]
+ options = []
if info['ext'] == 'm4a':
- options = ['-vn', '-acodec', 'copy']
+ options.extend(['-vn', '-acodec', 'copy'])
else:
- options = ['-c', 'copy']
+ options.extend(['-c', 'copy'])
for (name, value) in metadata.items():
options.extend(['-metadata', '%s=%s' % (name, value)])
+ chapters = info.get('chapters', [])
+ if chapters:
+ metadata_filename = encodeFilename(replace_extension(filename, 'meta'))
+ with io.open(metadata_filename, 'wt', encoding='utf-8') as f:
+ def ffmpeg_escape(text):
+ return re.sub(r'(=|;|#|\\|\n)', r'\\\1', text)
+
+ metadata_file_content = ';FFMETADATA1\n'
+ for chapter in chapters:
+ metadata_file_content += '[CHAPTER]\nTIMEBASE=1/1000\n'
+ metadata_file_content += 'START=%d\n' % (chapter['start_time'] * 1000)
+ metadata_file_content += 'END=%d\n' % (chapter['end_time'] * 1000)
+ chapter_title = chapter.get('title')
+ if chapter_title:
+ metadata_file_content += 'title=%s\n' % ffmpeg_escape(chapter_title)
+ f.write(metadata_file_content)
+ in_filenames.append(metadata_filename)
+ options.extend(['-map_metadata', '1'])
+
self._downloader.to_screen('[ffmpeg] Adding metadata to \'%s\'' % filename)
- self.run_ffmpeg(filename, temp_filename, options)
+ self.run_ffmpeg_multiple_files(in_filenames, temp_filename, options)
+ if chapters:
+ os.remove(metadata_filename)
os.remove(encodeFilename(filename))
os.rename(encodeFilename(temp_filename), encodeFilename(filename))
return [], info
diff --git a/youtube_dl/postprocessor/metadatafromtitle.py b/youtube_dl/postprocessor/metadatafromtitle.py
index a7d637a3c..c73f02447 100644
--- a/youtube_dl/postprocessor/metadatafromtitle.py
+++ b/youtube_dl/postprocessor/metadatafromtitle.py
@@ -9,7 +9,9 @@ class MetadataFromTitlePP(PostProcessor):
def __init__(self, downloader, titleformat):
super(MetadataFromTitlePP, self).__init__(downloader)
self._titleformat = titleformat
- self._titleregex = self.format_to_regex(titleformat)
+ self._titleregex = (self.format_to_regex(titleformat)
+ if re.search(r'%\(\w+\)s', titleformat)
+ else titleformat)
def format_to_regex(self, fmt):
r"""
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index 91e235ff2..4293a77f5 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -11,6 +11,7 @@ import contextlib
import ctypes
import datetime
import email.utils
+import email.header
import errno
import functools
import gzip
@@ -421,8 +422,8 @@ def clean_html(html):
# Newline vs <br />
html = html.replace('\n', ' ')
- html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
- html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
+ html = re.sub(r'(?u)\s*<\s*br\s*/?\s*>\s*', '\n', html)
+ html = re.sub(r'(?u)<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
# Strip html tags
html = re.sub('<.*?>', '', html)
# Replace html entities
@@ -1194,6 +1195,11 @@ def unified_timestamp(date_str, day_first=True):
# Remove AM/PM + timezone
date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
+ # Remove unrecognized timezones from ISO 8601 alike timestamps
+ m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
+ if m:
+ date_str = date_str[:-len(m.group('tz'))]
+
for expression in date_formats(day_first):
try:
dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
@@ -2092,6 +2098,58 @@ def update_Request(req, url=None, data=None, headers={}, query={}):
return new_req
+def _multipart_encode_impl(data, boundary):
+    """Serialize data as a multipart/form-data body delimited by boundary.
+
+    data: dict of field name -> value; text keys/values are encoded as
+        UTF-8, anything else is used as-is.
+    boundary: text boundary string (must be ASCII-encodable).
+
+    Returns a (body_bytes, content_type_header_value) tuple.
+    Raises ValueError if the boundary occurs inside any encoded field.
+    """
+    delimiter = boundary.encode('ascii')
+    parts = []
+    for name, value in data.items():
+        if isinstance(name, compat_str):
+            name = name.encode('utf-8')
+        if isinstance(value, compat_str):
+            value = value.encode('utf-8')
+        # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
+        # suggests sending UTF-8 directly. Firefox sends UTF-8, too
+        field = b'Content-Disposition: form-data; name="' + name + b'"\r\n\r\n' + value + b'\r\n'
+        if delimiter in field:
+            raise ValueError('Boundary overlaps with data')
+        parts.append(b'--' + delimiter + b'\r\n' + field)
+
+    # Closing delimiter
+    parts.append(b'--' + delimiter + b'--\r\n')
+
+    return b''.join(parts), 'multipart/form-data; boundary=%s' % boundary
+
+
+def multipart_encode(data, boundary=None):
+    '''
+    Encode a dict to RFC 7578-compliant form-data
+
+    data:
+        A dict whose keys and values are either Unicode or bytes-like
+        objects.
+    boundary:
+        A Unicode object to use as the boundary. When omitted, random
+        boundaries are tried until one does not collide with the data.
+
+    Returns a (body, content_type) tuple. With an explicit boundary that
+    occurs in the data, ValueError is raised.
+
+    Reference: https://tools.ietf.org/html/rfc7578
+    '''
+    if boundary is not None:
+        # Caller-chosen boundary: a collision is reported, not retried.
+        return _multipart_encode_impl(data, boundary)
+
+    while True:
+        candidate = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
+        try:
+            return _multipart_encode_impl(data, candidate)
+        except ValueError:
+            # Random boundary occurs in the payload -- draw another one.
+            continue
+
+
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
if isinstance(key_or_keys, (list, tuple)):
for key in key_or_keys:
@@ -2273,10 +2331,8 @@ def mimetype2ext(mt):
return {
'3gpp': '3gp',
'smptett+xml': 'tt',
- 'srt': 'srt',
'ttaf+xml': 'dfxp',
'ttml+xml': 'ttml',
- 'vtt': 'vtt',
'x-flv': 'flv',
'x-mp4-fragmented': 'mp4',
'x-ms-wmv': 'wmv',
@@ -2284,11 +2340,11 @@ def mimetype2ext(mt):
'x-mpegurl': 'm3u8',
'vnd.apple.mpegurl': 'm3u8',
'dash+xml': 'mpd',
- 'f4m': 'f4m',
'f4m+xml': 'f4m',
'hds+xml': 'f4m',
'vnd.ms-sstr+xml': 'ism',
'quicktime': 'mov',
+ 'mp2t': 'ts',
}.get(res, res)
@@ -2304,11 +2360,11 @@ def parse_codecs(codecs_str):
if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v'):
if not vcodec:
vcodec = full_codec
- elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3'):
+ elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
if not acodec:
acodec = full_codec
else:
- write_string('WARNING: Unknown codec %s' % full_codec, sys.stderr)
+ write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)
if not vcodec and not acodec:
if len(splited_codecs) == 2:
return {
@@ -3757,3 +3813,11 @@ def write_xattr(path, key, value):
"Couldn't find a tool to set the xattrs. "
"Install either the python 'xattr' module, "
"or the 'xattr' binary.")
+
+
+def random_birthday(year_field, month_field, day_field):
+    """Return a random, valid birthday as a dict of form field values.
+
+    year_field, month_field, day_field: keys to use in the returned dict.
+
+    The date is drawn uniformly from 1950-01-01 .. 1995-12-31 so that it is
+    always a real calendar date (never e.g. February 30); year, month and
+    day are returned as decimal strings without zero padding.
+    """
+    start_date = datetime.date(1950, 1, 1)
+    end_date = datetime.date(1995, 12, 31)
+    # Picking a day offset instead of independent year/month/day values
+    # guarantees the combination exists in the calendar.
+    offset = random.randint(0, (end_date - start_date).days)
+    random_date = start_date + datetime.timedelta(offset)
+    return {
+        year_field: str(random_date.year),
+        month_field: str(random_date.month),
+        day_field: str(random_date.day),
+    }
diff --git a/youtube_dl/version.py b/youtube_dl/version.py
index e206501e1..c006eba76 100644
--- a/youtube_dl/version.py
+++ b/youtube_dl/version.py
@@ -1,3 +1,3 @@
from __future__ import unicode_literals
-__version__ = '2017.04.26'
+__version__ = '2017.05.23'