aboutsummaryrefslogtreecommitdiff
path: root/youtube_dl
diff options
context:
space:
mode:
authorpukkandan <pukkandan.ytdlp@gmail.com>2022-01-30 01:07:28 +0530
committerpukkandan <pukkandan.ytdlp@gmail.com>2022-01-30 01:07:28 +0530
commita3373da70c97d356bd4927eff403abd261dd8f9f (patch)
treefb77c8f297d7ca1bd6a6f3e37a4c0a4b12399db5 /youtube_dl
parent2c4cb134a90b49a4d44965b57ff43cfd45ec2d69 (diff)
parent5014bd67c22b421207b2650d4dc874b95b36dda1 (diff)
downloadyoutube-dl-a3373da70c97d356bd4927eff403abd261dd8f9f.tar.xz
Merge branch 'UP/youtube-dl' into dl/YoutubeSearchURLIE
Diffstat (limited to 'youtube_dl')
-rwxr-xr-xyoutube_dl/YoutubeDL.py286
-rw-r--r--youtube_dl/__init__.py1
-rw-r--r--youtube_dl/compat.py10
-rw-r--r--youtube_dl/extractor/abcnews.py134
-rw-r--r--youtube_dl/extractor/adn.py198
-rw-r--r--youtube_dl/extractor/aenetworks.py4
-rw-r--r--youtube_dl/extractor/aljazeera.py41
-rw-r--r--youtube_dl/extractor/americastestkitchen.py97
-rw-r--r--youtube_dl/extractor/amp.py3
-rw-r--r--youtube_dl/extractor/animeondemand.py26
-rw-r--r--youtube_dl/extractor/aol.py12
-rw-r--r--youtube_dl/extractor/apa.py47
-rw-r--r--youtube_dl/extractor/appleconnect.py13
-rw-r--r--youtube_dl/extractor/applepodcasts.py1
-rw-r--r--youtube_dl/extractor/archiveorg.py54
-rw-r--r--youtube_dl/extractor/ard.py118
-rw-r--r--youtube_dl/extractor/arnes.py101
-rw-r--r--youtube_dl/extractor/awaan.py2
-rw-r--r--youtube_dl/extractor/azmedien.py2
-rw-r--r--youtube_dl/extractor/bandaichannel.py37
-rw-r--r--youtube_dl/extractor/bandcamp.py4
-rw-r--r--youtube_dl/extractor/bbc.py297
-rw-r--r--youtube_dl/extractor/bilibili.py3
-rw-r--r--youtube_dl/extractor/bleacherreport.py10
-rw-r--r--youtube_dl/extractor/blinkx.py86
-rw-r--r--youtube_dl/extractor/bravotv.py14
-rw-r--r--youtube_dl/extractor/canvas.py56
-rw-r--r--youtube_dl/extractor/cbs.py5
-rw-r--r--youtube_dl/extractor/cbsnews.py2
-rw-r--r--youtube_dl/extractor/cbssports.py123
-rw-r--r--youtube_dl/extractor/ccma.py68
-rw-r--r--youtube_dl/extractor/cda.py11
-rw-r--r--youtube_dl/extractor/comedycentral.py143
-rw-r--r--youtube_dl/extractor/common.py34
-rw-r--r--youtube_dl/extractor/cspan.py27
-rw-r--r--youtube_dl/extractor/curiositystream.py108
-rw-r--r--youtube_dl/extractor/dispeak.py46
-rw-r--r--youtube_dl/extractor/dplay.py159
-rw-r--r--youtube_dl/extractor/dreisat.py220
-rw-r--r--youtube_dl/extractor/egghead.py45
-rw-r--r--youtube_dl/extractor/eroprofile.py21
-rw-r--r--youtube_dl/extractor/extractors.py93
-rw-r--r--youtube_dl/extractor/facebook.py5
-rw-r--r--youtube_dl/extractor/formula1.py32
-rw-r--r--youtube_dl/extractor/franceculture.py20
-rw-r--r--youtube_dl/extractor/francetv.py7
-rw-r--r--youtube_dl/extractor/fujitv.py2
-rw-r--r--youtube_dl/extractor/funimation.py6
-rw-r--r--youtube_dl/extractor/gdcvault.py34
-rw-r--r--youtube_dl/extractor/gedidigital.py161
-rw-r--r--youtube_dl/extractor/generic.py88
-rw-r--r--youtube_dl/extractor/go.py46
-rw-r--r--youtube_dl/extractor/googledrive.py8
-rw-r--r--youtube_dl/extractor/ign.py371
-rw-r--r--youtube_dl/extractor/instagram.py29
-rw-r--r--youtube_dl/extractor/jamendo.py74
-rw-r--r--youtube_dl/extractor/kakao.py64
-rw-r--r--youtube_dl/extractor/kaltura.py14
-rw-r--r--youtube_dl/extractor/khanacademy.py137
-rw-r--r--youtube_dl/extractor/lbry.py90
-rw-r--r--youtube_dl/extractor/line.py142
-rw-r--r--youtube_dl/extractor/liveleak.py191
-rw-r--r--youtube_dl/extractor/maoritv.py31
-rw-r--r--youtube_dl/extractor/medaltv.py20
-rw-r--r--youtube_dl/extractor/medialaan.py307
-rw-r--r--youtube_dl/extractor/minds.py196
-rw-r--r--youtube_dl/extractor/mixcloud.py9
-rw-r--r--youtube_dl/extractor/mlb.py189
-rw-r--r--youtube_dl/extractor/mtv.py28
-rw-r--r--youtube_dl/extractor/ninecninemedia.py4
-rw-r--r--youtube_dl/extractor/ninegag.py188
-rw-r--r--youtube_dl/extractor/njpwworld.py54
-rw-r--r--youtube_dl/extractor/nrk.py2
-rw-r--r--youtube_dl/extractor/orf.py26
-rw-r--r--youtube_dl/extractor/palcomp3.py148
-rw-r--r--youtube_dl/extractor/peertube.py48
-rw-r--r--youtube_dl/extractor/periscope.py8
-rw-r--r--youtube_dl/extractor/phoenix.py154
-rw-r--r--youtube_dl/extractor/picarto.py100
-rw-r--r--youtube_dl/extractor/pinterest.py4
-rw-r--r--youtube_dl/extractor/playstuff.py65
-rw-r--r--youtube_dl/extractor/pluralsight.py2
-rw-r--r--youtube_dl/extractor/pornhub.py267
-rw-r--r--youtube_dl/extractor/rds.py18
-rw-r--r--youtube_dl/extractor/redbulltv.py6
-rw-r--r--youtube_dl/extractor/rts.py15
-rw-r--r--youtube_dl/extractor/rtve.py232
-rw-r--r--youtube_dl/extractor/samplefocus.py100
-rw-r--r--youtube_dl/extractor/sbs.py5
-rw-r--r--youtube_dl/extractor/screencastomatic.py48
-rw-r--r--youtube_dl/extractor/shahid.py28
-rw-r--r--youtube_dl/extractor/shared.py9
-rw-r--r--youtube_dl/extractor/simplecast.py160
-rw-r--r--youtube_dl/extractor/southpark.py16
-rw-r--r--youtube_dl/extractor/spike.py15
-rw-r--r--youtube_dl/extractor/sportdeutschland.py145
-rw-r--r--youtube_dl/extractor/spotify.py156
-rw-r--r--youtube_dl/extractor/srgssr.py208
-rw-r--r--youtube_dl/extractor/storyfire.py151
-rw-r--r--youtube_dl/extractor/stretchinternet.py21
-rw-r--r--youtube_dl/extractor/svt.py27
-rw-r--r--youtube_dl/extractor/ted.py24
-rw-r--r--youtube_dl/extractor/tf1.py123
-rw-r--r--youtube_dl/extractor/threeqsdn.py158
-rw-r--r--youtube_dl/extractor/tiktok.py7
-rw-r--r--youtube_dl/extractor/tmz.py97
-rw-r--r--youtube_dl/extractor/trovo.py194
-rw-r--r--youtube_dl/extractor/tv2.py82
-rw-r--r--youtube_dl/extractor/tv2dk.py17
-rw-r--r--youtube_dl/extractor/tv4.py6
-rw-r--r--youtube_dl/extractor/tver.py36
-rw-r--r--youtube_dl/extractor/twitch.py47
-rw-r--r--youtube_dl/extractor/twitter.py52
-rw-r--r--youtube_dl/extractor/umg.py8
-rw-r--r--youtube_dl/extractor/urplay.py17
-rw-r--r--youtube_dl/extractor/ustream.py2
-rw-r--r--youtube_dl/extractor/vgtv.py6
-rw-r--r--youtube_dl/extractor/videopress.py26
-rw-r--r--youtube_dl/extractor/vidio.py86
-rw-r--r--youtube_dl/extractor/vidzi.py68
-rw-r--r--youtube_dl/extractor/viki.py69
-rw-r--r--youtube_dl/extractor/vimeo.py304
-rw-r--r--youtube_dl/extractor/vk.py11
-rw-r--r--youtube_dl/extractor/vlive.py4
-rw-r--r--youtube_dl/extractor/voxmedia.py26
-rw-r--r--youtube_dl/extractor/vtm.py62
-rw-r--r--youtube_dl/extractor/vvvvid.py81
-rw-r--r--youtube_dl/extractor/wat.py129
-rw-r--r--youtube_dl/extractor/xboxclips.py43
-rw-r--r--youtube_dl/extractor/xfileshare.py4
-rw-r--r--youtube_dl/extractor/xhamster.py80
-rw-r--r--youtube_dl/extractor/xtube.py51
-rw-r--r--youtube_dl/extractor/yahoo.py80
-rw-r--r--youtube_dl/extractor/yandexmusic.py35
-rw-r--r--youtube_dl/extractor/youku.py2
-rw-r--r--youtube_dl/extractor/youporn.py102
-rw-r--r--youtube_dl/extractor/youtube.py2441
-rw-r--r--youtube_dl/extractor/zdf.py192
-rw-r--r--youtube_dl/extractor/zhihu.py69
-rw-r--r--youtube_dl/extractor/zingmp3.py208
-rw-r--r--youtube_dl/extractor/zoom.py68
-rw-r--r--youtube_dl/extractor/zype.py15
-rw-r--r--youtube_dl/options.py8
-rw-r--r--youtube_dl/postprocessor/embedthumbnail.py8
-rw-r--r--youtube_dl/postprocessor/ffmpeg.py5
-rw-r--r--youtube_dl/utils.py61
-rw-r--r--youtube_dl/version.py2
147 files changed, 7858 insertions, 4851 deletions
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py
index aaac149e9..fe30758ef 100755
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@@ -163,6 +163,7 @@ class YoutubeDL(object):
simulate: Do not download the video files.
format: Video format code. See options.py for more information.
outtmpl: Template for output names.
+ outtmpl_na_placeholder: Placeholder for unavailable meta fields.
restrictfilenames: Do not allow "&" and spaces in file names
ignoreerrors: Do not stop on download errors.
force_generic_extractor: Force downloader to use the generic extractor
@@ -338,6 +339,8 @@ class YoutubeDL(object):
_pps = []
_download_retcode = None
_num_downloads = None
+ _playlist_level = 0
+ _playlist_urls = set()
_screen_file = None
def __init__(self, params=None, auto_init=True):
@@ -656,7 +659,7 @@ class YoutubeDL(object):
template_dict = dict((k, v if isinstance(v, compat_numeric_types) else sanitize(k, v))
for k, v in template_dict.items()
if v is not None and not isinstance(v, (list, tuple, dict)))
- template_dict = collections.defaultdict(lambda: 'NA', template_dict)
+ template_dict = collections.defaultdict(lambda: self.params.get('outtmpl_na_placeholder', 'NA'), template_dict)
outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL)
@@ -676,8 +679,8 @@ class YoutubeDL(object):
# Missing numeric fields used together with integer presentation types
# in format specification will break the argument substitution since
- # string 'NA' is returned for missing fields. We will patch output
- # template for missing fields to meet string presentation type.
+ # string NA placeholder is returned for missing fields. We will patch
+ # output template for missing fields to meet string presentation type.
for numeric_field in self._NUMERIC_FIELDS:
if numeric_field not in template_dict:
# As of [1] format syntax is:
@@ -770,11 +773,20 @@ class YoutubeDL(object):
def extract_info(self, url, download=True, ie_key=None, extra_info={},
process=True, force_generic_extractor=False):
- '''
- Returns a list with a dictionary for each video we find.
- If 'download', also downloads the videos.
- extra_info is a dict containing the extra values to add to each result
- '''
+ """
+ Return a list with a dictionary for each video extracted.
+
+ Arguments:
+ url -- URL to extract
+
+ Keyword arguments:
+ download -- whether to download videos during extraction
+ ie_key -- extractor key hint
+ extra_info -- dictionary containing the extra values to add to each result
+ process -- whether to resolve all unresolved references (URLs, playlist items),
+ must be True for download to work.
+ force_generic_extractor -- force using the generic extractor
+ """
if not ie_key and force_generic_extractor:
ie_key = 'Generic'
@@ -906,115 +918,23 @@ class YoutubeDL(object):
return self.process_ie_result(
new_result, download=download, extra_info=extra_info)
elif result_type in ('playlist', 'multi_video'):
- # We process each entry in the playlist
- playlist = ie_result.get('title') or ie_result.get('id')
- self.to_screen('[download] Downloading playlist: %s' % playlist)
-
- playlist_results = []
-
- playliststart = self.params.get('playliststart', 1) - 1
- playlistend = self.params.get('playlistend')
- # For backwards compatibility, interpret -1 as whole list
- if playlistend == -1:
- playlistend = None
-
- playlistitems_str = self.params.get('playlist_items')
- playlistitems = None
- if playlistitems_str is not None:
- def iter_playlistitems(format):
- for string_segment in format.split(','):
- if '-' in string_segment:
- start, end = string_segment.split('-')
- for item in range(int(start), int(end) + 1):
- yield int(item)
- else:
- yield int(string_segment)
- playlistitems = orderedSet(iter_playlistitems(playlistitems_str))
-
- ie_entries = ie_result['entries']
-
- def make_playlistitems_entries(list_ie_entries):
- num_entries = len(list_ie_entries)
- return [
- list_ie_entries[i - 1] for i in playlistitems
- if -num_entries <= i - 1 < num_entries]
-
- def report_download(num_entries):
+ # Protect from infinite recursion due to recursively nested playlists
+ # (see https://github.com/ytdl-org/youtube-dl/issues/27833)
+ webpage_url = ie_result['webpage_url']
+ if webpage_url in self._playlist_urls:
self.to_screen(
- '[%s] playlist %s: Downloading %d videos' %
- (ie_result['extractor'], playlist, num_entries))
-
- if isinstance(ie_entries, list):
- n_all_entries = len(ie_entries)
- if playlistitems:
- entries = make_playlistitems_entries(ie_entries)
- else:
- entries = ie_entries[playliststart:playlistend]
- n_entries = len(entries)
- self.to_screen(
- '[%s] playlist %s: Collected %d video ids (downloading %d of them)' %
- (ie_result['extractor'], playlist, n_all_entries, n_entries))
- elif isinstance(ie_entries, PagedList):
- if playlistitems:
- entries = []
- for item in playlistitems:
- entries.extend(ie_entries.getslice(
- item - 1, item
- ))
- else:
- entries = ie_entries.getslice(
- playliststart, playlistend)
- n_entries = len(entries)
- report_download(n_entries)
- else: # iterable
- if playlistitems:
- entries = make_playlistitems_entries(list(itertools.islice(
- ie_entries, 0, max(playlistitems))))
- else:
- entries = list(itertools.islice(
- ie_entries, playliststart, playlistend))
- n_entries = len(entries)
- report_download(n_entries)
-
- if self.params.get('playlistreverse', False):
- entries = entries[::-1]
-
- if self.params.get('playlistrandom', False):
- random.shuffle(entries)
-
- x_forwarded_for = ie_result.get('__x_forwarded_for_ip')
-
- for i, entry in enumerate(entries, 1):
- self.to_screen('[download] Downloading video %s of %s' % (i, n_entries))
- # This __x_forwarded_for_ip thing is a bit ugly but requires
- # minimal changes
- if x_forwarded_for:
- entry['__x_forwarded_for_ip'] = x_forwarded_for
- extra = {
- 'n_entries': n_entries,
- 'playlist': playlist,
- 'playlist_id': ie_result.get('id'),
- 'playlist_title': ie_result.get('title'),
- 'playlist_uploader': ie_result.get('uploader'),
- 'playlist_uploader_id': ie_result.get('uploader_id'),
- 'playlist_index': playlistitems[i - 1] if playlistitems else i + playliststart,
- 'extractor': ie_result['extractor'],
- 'webpage_url': ie_result['webpage_url'],
- 'webpage_url_basename': url_basename(ie_result['webpage_url']),
- 'extractor_key': ie_result['extractor_key'],
- }
-
- reason = self._match_entry(entry, incomplete=True)
- if reason is not None:
- self.to_screen('[download] ' + reason)
- continue
+ '[download] Skipping already downloaded playlist: %s'
+ % ie_result.get('title') or ie_result.get('id'))
+ return
- entry_result = self.__process_iterable_entry(entry, download, extra)
- # TODO: skip failed (empty) entries?
- playlist_results.append(entry_result)
- ie_result['entries'] = playlist_results
- self.to_screen('[download] Finished downloading playlist: %s' % playlist)
- return ie_result
+ self._playlist_level += 1
+ self._playlist_urls.add(webpage_url)
+ try:
+ return self.__process_playlist(ie_result, download)
+ finally:
+ self._playlist_level -= 1
+ if not self._playlist_level:
+ self._playlist_urls.clear()
elif result_type == 'compat_list':
self.report_warning(
'Extractor %s returned a compat_list result. '
@@ -1039,6 +959,118 @@ class YoutubeDL(object):
else:
raise Exception('Invalid result type: %s' % result_type)
+ def __process_playlist(self, ie_result, download):
+ # We process each entry in the playlist
+ playlist = ie_result.get('title') or ie_result.get('id')
+
+ self.to_screen('[download] Downloading playlist: %s' % playlist)
+
+ playlist_results = []
+
+ playliststart = self.params.get('playliststart', 1) - 1
+ playlistend = self.params.get('playlistend')
+ # For backwards compatibility, interpret -1 as whole list
+ if playlistend == -1:
+ playlistend = None
+
+ playlistitems_str = self.params.get('playlist_items')
+ playlistitems = None
+ if playlistitems_str is not None:
+ def iter_playlistitems(format):
+ for string_segment in format.split(','):
+ if '-' in string_segment:
+ start, end = string_segment.split('-')
+ for item in range(int(start), int(end) + 1):
+ yield int(item)
+ else:
+ yield int(string_segment)
+ playlistitems = orderedSet(iter_playlistitems(playlistitems_str))
+
+ ie_entries = ie_result['entries']
+
+ def make_playlistitems_entries(list_ie_entries):
+ num_entries = len(list_ie_entries)
+ return [
+ list_ie_entries[i - 1] for i in playlistitems
+ if -num_entries <= i - 1 < num_entries]
+
+ def report_download(num_entries):
+ self.to_screen(
+ '[%s] playlist %s: Downloading %d videos' %
+ (ie_result['extractor'], playlist, num_entries))
+
+ if isinstance(ie_entries, list):
+ n_all_entries = len(ie_entries)
+ if playlistitems:
+ entries = make_playlistitems_entries(ie_entries)
+ else:
+ entries = ie_entries[playliststart:playlistend]
+ n_entries = len(entries)
+ self.to_screen(
+ '[%s] playlist %s: Collected %d video ids (downloading %d of them)' %
+ (ie_result['extractor'], playlist, n_all_entries, n_entries))
+ elif isinstance(ie_entries, PagedList):
+ if playlistitems:
+ entries = []
+ for item in playlistitems:
+ entries.extend(ie_entries.getslice(
+ item - 1, item
+ ))
+ else:
+ entries = ie_entries.getslice(
+ playliststart, playlistend)
+ n_entries = len(entries)
+ report_download(n_entries)
+ else: # iterable
+ if playlistitems:
+ entries = make_playlistitems_entries(list(itertools.islice(
+ ie_entries, 0, max(playlistitems))))
+ else:
+ entries = list(itertools.islice(
+ ie_entries, playliststart, playlistend))
+ n_entries = len(entries)
+ report_download(n_entries)
+
+ if self.params.get('playlistreverse', False):
+ entries = entries[::-1]
+
+ if self.params.get('playlistrandom', False):
+ random.shuffle(entries)
+
+ x_forwarded_for = ie_result.get('__x_forwarded_for_ip')
+
+ for i, entry in enumerate(entries, 1):
+ self.to_screen('[download] Downloading video %s of %s' % (i, n_entries))
+ # This __x_forwarded_for_ip thing is a bit ugly but requires
+ # minimal changes
+ if x_forwarded_for:
+ entry['__x_forwarded_for_ip'] = x_forwarded_for
+ extra = {
+ 'n_entries': n_entries,
+ 'playlist': playlist,
+ 'playlist_id': ie_result.get('id'),
+ 'playlist_title': ie_result.get('title'),
+ 'playlist_uploader': ie_result.get('uploader'),
+ 'playlist_uploader_id': ie_result.get('uploader_id'),
+ 'playlist_index': playlistitems[i - 1] if playlistitems else i + playliststart,
+ 'extractor': ie_result['extractor'],
+ 'webpage_url': ie_result['webpage_url'],
+ 'webpage_url_basename': url_basename(ie_result['webpage_url']),
+ 'extractor_key': ie_result['extractor_key'],
+ }
+
+ reason = self._match_entry(entry, incomplete=True)
+ if reason is not None:
+ self.to_screen('[download] ' + reason)
+ continue
+
+ entry_result = self.__process_iterable_entry(entry, download, extra)
+ # TODO: skip failed (empty) entries?
+ playlist_results.append(entry_result)
+ ie_result['entries'] = playlist_results
+ self.to_screen('[download] Finished downloading playlist: %s' % playlist)
+ return ie_result
+
@__handle_extraction_exceptions
def __process_iterable_entry(self, entry, download, extra_info):
return self.process_ie_result(
@@ -1226,6 +1258,8 @@ class YoutubeDL(object):
group = _parse_format_selection(tokens, inside_group=True)
current_selector = FormatSelector(GROUP, group, [])
elif string == '+':
+ if inside_merge:
+ raise syntax_error('Unexpected "+"', start)
video_selector = current_selector
audio_selector = _parse_format_selection(tokens, inside_merge=True)
if not video_selector or not audio_selector:
@@ -1486,14 +1520,18 @@ class YoutubeDL(object):
if 'display_id' not in info_dict and 'id' in info_dict:
info_dict['display_id'] = info_dict['id']
- if info_dict.get('upload_date') is None and info_dict.get('timestamp') is not None:
- # Working around out-of-range timestamp values (e.g. negative ones on Windows,
- # see http://bugs.python.org/issue1646728)
- try:
- upload_date = datetime.datetime.utcfromtimestamp(info_dict['timestamp'])
- info_dict['upload_date'] = upload_date.strftime('%Y%m%d')
- except (ValueError, OverflowError, OSError):
- pass
+ for ts_key, date_key in (
+ ('timestamp', 'upload_date'),
+ ('release_timestamp', 'release_date'),
+ ):
+ if info_dict.get(date_key) is None and info_dict.get(ts_key) is not None:
+ # Working around out-of-range timestamp values (e.g. negative ones on Windows,
+ # see http://bugs.python.org/issue1646728)
+ try:
+ upload_date = datetime.datetime.utcfromtimestamp(info_dict[ts_key])
+ info_dict[date_key] = upload_date.strftime('%Y%m%d')
+ except (ValueError, OverflowError, OSError):
+ pass
# Auto generate title fields corresponding to the *_number fields when missing
# in order to always have clean titles. This is very common for TV series.
@@ -1777,6 +1815,8 @@ class YoutubeDL(object):
os.makedirs(dn)
return True
except (OSError, IOError) as err:
+ if isinstance(err, OSError) and err.errno == errno.EEXIST:
+ return True
self.report_error('unable to create directory ' + error_to_compat_str(err))
return False
diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py
index 9a659fc65..e1bd67919 100644
--- a/youtube_dl/__init__.py
+++ b/youtube_dl/__init__.py
@@ -340,6 +340,7 @@ def _real_main(argv=None):
'format': opts.format,
'listformats': opts.listformats,
'outtmpl': outtmpl,
+ 'outtmpl_na_placeholder': opts.outtmpl_na_placeholder,
'autonumber_size': opts.autonumber_size,
'autonumber_start': opts.autonumber_start,
'restrictfilenames': opts.restrictfilenames,
diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py
index 6c3d49d45..9e45c454b 100644
--- a/youtube_dl/compat.py
+++ b/youtube_dl/compat.py
@@ -73,6 +73,15 @@ try:
except ImportError: # Python 2
import Cookie as compat_cookies
+if sys.version_info[0] == 2:
+ class compat_cookies_SimpleCookie(compat_cookies.SimpleCookie):
+ def load(self, rawdata):
+ if isinstance(rawdata, compat_str):
+ rawdata = str(rawdata)
+ return super(compat_cookies_SimpleCookie, self).load(rawdata)
+else:
+ compat_cookies_SimpleCookie = compat_cookies.SimpleCookie
+
try:
import html.entities as compat_html_entities
except ImportError: # Python 2
@@ -3000,6 +3009,7 @@ __all__ = [
'compat_cookiejar',
'compat_cookiejar_Cookie',
'compat_cookies',
+ 'compat_cookies_SimpleCookie',
'compat_ctypes_WINFUNCTYPE',
'compat_etree_Element',
'compat_etree_fromstring',
diff --git a/youtube_dl/extractor/abcnews.py b/youtube_dl/extractor/abcnews.py
index 8b407bf9c..908c83377 100644
--- a/youtube_dl/extractor/abcnews.py
+++ b/youtube_dl/extractor/abcnews.py
@@ -1,14 +1,15 @@
# coding: utf-8
from __future__ import unicode_literals
-import calendar
import re
-import time
from .amp import AMPIE
from .common import InfoExtractor
-from .youtube import YoutubeIE
-from ..compat import compat_urlparse
+from ..utils import (
+ parse_duration,
+ parse_iso8601,
+ try_get,
+)
class AbcNewsVideoIE(AMPIE):
@@ -18,8 +19,8 @@ class AbcNewsVideoIE(AMPIE):
(?:
abcnews\.go\.com/
(?:
- [^/]+/video/(?P<display_id>[0-9a-z-]+)-|
- video/embed\?.*?\bid=
+ (?:[^/]+/)*video/(?P<display_id>[0-9a-z-]+)-|
+ video/(?:embed|itemfeed)\?.*?\bid=
)|
fivethirtyeight\.abcnews\.go\.com/video/embed/\d+/
)
@@ -36,6 +37,8 @@ class AbcNewsVideoIE(AMPIE):
'description': 'George Stephanopoulos goes one-on-one with Iranian Foreign Minister Dr. Javad Zarif.',
'duration': 180,
'thumbnail': r're:^https?://.*\.jpg$',
+ 'timestamp': 1380454200,
+ 'upload_date': '20130929',
},
'params': {
# m3u8 download
@@ -47,6 +50,12 @@ class AbcNewsVideoIE(AMPIE):
}, {
'url': 'http://abcnews.go.com/2020/video/2020-husband-stands-teacher-jail-student-affairs-26119478',
'only_matching': True,
+ }, {
+ 'url': 'http://abcnews.go.com/video/itemfeed?id=46979033',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://abcnews.go.com/GMA/News/video/history-christmas-story-67894761',
+ 'only_matching': True,
}]
def _real_extract(self, url):
@@ -67,28 +76,23 @@ class AbcNewsIE(InfoExtractor):
_VALID_URL = r'https?://abcnews\.go\.com/(?:[^/]+/)+(?P<display_id>[0-9a-z-]+)/story\?id=(?P<id>\d+)'
_TESTS = [{
- 'url': 'http://abcnews.go.com/Blotter/News/dramatic-video-rare-death-job-america/story?id=10498713#.UIhwosWHLjY',
+ # Youtube Embeds
+ 'url': 'https://abcnews.go.com/Entertainment/peter-billingsley-child-actor-christmas-story-hollywood-power/story?id=51286501',
'info_dict': {
- 'id': '10505354',
- 'ext': 'flv',
- 'display_id': 'dramatic-video-rare-death-job-america',
- 'title': 'Occupational Hazards',
- 'description': 'Nightline investigates the dangers that lurk at various jobs.',
- 'thumbnail': r're:^https?://.*\.jpg$',
- 'upload_date': '20100428',
- 'timestamp': 1272412800,
+ 'id': '51286501',
+ 'title': "Peter Billingsley: From child actor in 'A Christmas Story' to Hollywood power player",
+ 'description': 'Billingsley went from a child actor to Hollywood power player.',
},
- 'add_ie': ['AbcNewsVideo'],
+ 'playlist_count': 5,
}, {
'url': 'http://abcnews.go.com/Entertainment/justin-timberlake-performs-stop-feeling-eurovision-2016/story?id=39125818',
'info_dict': {
'id': '38897857',
'ext': 'mp4',
- 'display_id': 'justin-timberlake-performs-stop-feeling-eurovision-2016',
'title': 'Justin Timberlake Drops Hints For Secret Single',
'description': 'Lara Spencer reports the buzziest stories of the day in "GMA" Pop News.',
- 'upload_date': '20160515',
- 'timestamp': 1463329500,
+ 'upload_date': '20160505',
+ 'timestamp': 1462442280,
},
'params': {
# m3u8 download
@@ -100,49 +104,55 @@ class AbcNewsIE(InfoExtractor):
}, {
'url': 'http://abcnews.go.com/Technology/exclusive-apple-ceo-tim-cook-iphone-cracking-software/story?id=37173343',
'only_matching': True,
+ }, {
+ # inline.type == 'video'
+ 'url': 'http://abcnews.go.com/Technology/exclusive-apple-ceo-tim-cook-iphone-cracking-software/story?id=37173343',
+ 'only_matching': True,
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- display_id = mobj.group('display_id')
- video_id = mobj.group('id')
-
- webpage = self._download_webpage(url, video_id)
- video_url = self._search_regex(
- r'window\.abcnvideo\.url\s*=\s*"([^"]+)"', webpage, 'video URL')
- full_video_url = compat_urlparse.urljoin(url, video_url)
-
- youtube_url = YoutubeIE._extract_url(webpage)
-
- timestamp = None
- date_str = self._html_search_regex(
- r'<span[^>]+class="timestamp">([^<]+)</span>',
- webpage, 'timestamp', fatal=False)
- if date_str:
- tz_offset = 0
- if date_str.endswith(' ET'): # Eastern Time
- tz_offset = -5
- date_str = date_str[:-3]
- date_formats = ['%b. %d, %Y', '%b %d, %Y, %I:%M %p']
- for date_format in date_formats:
- try:
- timestamp = calendar.timegm(time.strptime(date_str.strip(), date_format))
- except ValueError:
- continue
- if timestamp is not None:
- timestamp -= tz_offset * 3600
-
- entry = {
- '_type': 'url_transparent',
- 'ie_key': AbcNewsVideoIE.ie_key(),
- 'url': full_video_url,
- 'id': video_id,
- 'display_id': display_id,
- 'timestamp': timestamp,
- }
-
- if youtube_url:
- entries = [entry, self.url_result(youtube_url, ie=YoutubeIE.ie_key())]
- return self.playlist_result(entries)
-
- return entry
+ story_id = self._match_id(url)
+ webpage = self._download_webpage(url, story_id)
+ story = self._parse_json(self._search_regex(
+ r"window\['__abcnews__'\]\s*=\s*({.+?});",
+ webpage, 'data'), story_id)['page']['content']['story']['everscroll'][0]
+ article_contents = story.get('articleContents') or {}
+
+ def entries():
+ featured_video = story.get('featuredVideo') or {}
+ feed = try_get(featured_video, lambda x: x['video']['feed'])
+ if feed:
+ yield {
+ '_type': 'url',
+ 'id': featured_video.get('id'),
+ 'title': featured_video.get('name'),
+ 'url': feed,
+ 'thumbnail': featured_video.get('images'),
+ 'description': featured_video.get('description'),
+ 'timestamp': parse_iso8601(featured_video.get('uploadDate')),
+ 'duration': parse_duration(featured_video.get('duration')),
+ 'ie_key': AbcNewsVideoIE.ie_key(),
+ }
+
+ for inline in (article_contents.get('inlines') or []):
+ inline_type = inline.get('type')
+ if inline_type == 'iframe':
+ iframe_url = try_get(inline, lambda x: x['attrs']['src'])
+ if iframe_url:
+ yield self.url_result(iframe_url)
+ elif inline_type == 'video':
+ video_id = inline.get('id')
+ if video_id:
+ yield {
+ '_type': 'url',
+ 'id': video_id,
+ 'url': 'http://abcnews.go.com/video/embed?id=' + video_id,
+ 'thumbnail': inline.get('imgSrc') or inline.get('imgDefault'),
+ 'description': inline.get('description'),
+ 'duration': parse_duration(inline.get('duration')),
+ 'ie_key': AbcNewsVideoIE.ie_key(),
+ }
+
+ return self.playlist_result(
+ entries(), story_id, article_contents.get('headline'),
+ article_contents.get('subHead'))
diff --git a/youtube_dl/extractor/adn.py b/youtube_dl/extractor/adn.py
index c95ad2173..a55ebbcbd 100644
--- a/youtube_dl/extractor/adn.py
+++ b/youtube_dl/extractor/adn.py
@@ -10,6 +10,7 @@ import random
from .common import InfoExtractor
from ..aes import aes_cbc_decrypt
from ..compat import (
+ compat_HTTPError,
compat_b64decode,
compat_ord,
)
@@ -18,11 +19,14 @@ from ..utils import (
bytes_to_long,
ExtractorError,
float_or_none,
+ int_or_none,
intlist_to_bytes,
long_to_bytes,
pkcs1pad,
strip_or_none,
- urljoin,
+ try_get,
+ unified_strdate,
+ urlencode_postdata,
)
@@ -31,16 +35,30 @@ class ADNIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?animedigitalnetwork\.fr/video/[^/]+/(?P<id>\d+)'
_TEST = {
'url': 'http://animedigitalnetwork.fr/video/blue-exorcist-kyoto-saga/7778-episode-1-debut-des-hostilites',
- 'md5': 'e497370d847fd79d9d4c74be55575c7a',
+ 'md5': '0319c99885ff5547565cacb4f3f9348d',
'info_dict': {
'id': '7778',
'ext': 'mp4',
- 'title': 'Blue Exorcist - Kyôto Saga - Épisode 1',
+ 'title': 'Blue Exorcist - Kyôto Saga - Episode 1',
'description': 'md5:2f7b5aa76edbc1a7a92cedcda8a528d5',
+ 'series': 'Blue Exorcist - Kyôto Saga',
+ 'duration': 1467,
+ 'release_date': '20170106',
+ 'comment_count': int,
+ 'average_rating': float,
+ 'season_number': 2,
+ 'episode': 'Début des hostilités',
+ 'episode_number': 1,
}
}
+
+ _NETRC_MACHINE = 'animedigitalnetwork'
_BASE_URL = 'http://animedigitalnetwork.fr'
- _RSA_KEY = (0xc35ae1e4356b65a73b551493da94b8cb443491c0aa092a357a5aee57ffc14dda85326f42d716e539a34542a0d3f363adf16c5ec222d713d5997194030ee2e4f0d1fb328c01a81cf6868c090d50de8e169c6b13d1675b9eeed1cbc51e1fffca9b38af07f37abd790924cd3bee59d0257cfda4fe5f3f0534877e21ce5821447d1b, 65537)
+ _API_BASE_URL = 'https://gw.api.animedigitalnetwork.fr/'
+ _PLAYER_BASE_URL = _API_BASE_URL + 'player/'
+ _HEADERS = {}
+ _LOGIN_ERR_MESSAGE = 'Unable to log in'
+ _RSA_KEY = (0x9B42B08905199A5CCE2026274399CA560ECB209EE9878A708B1C0812E1BB8CB5D1FB7441861147C1A1F2F3A0476DD63A9CAC20D3E983613346850AA6CB38F16DC7D720FD7D86FC6E5B3D5BBC72E14CD0BF9E869F2CEA2CCAD648F1DCE38F1FF916CEFB2D339B64AA0264372344BC775E265E8A852F88144AB0BD9AA06C1A4ABB, 65537)
_POS_ALIGN_MAP = {
'start': 1,
'end': 3,
@@ -54,26 +72,24 @@ class ADNIE(InfoExtractor):
def _ass_subtitles_timecode(seconds):
return '%01d:%02d:%02d.%02d' % (seconds / 3600, (seconds % 3600) / 60, seconds % 60, (seconds % 1) * 100)
- def _get_subtitles(self, sub_path, video_id):
- if not sub_path:
+ def _get_subtitles(self, sub_url, video_id):
+ if not sub_url:
return None
enc_subtitles = self._download_webpage(
- urljoin(self._BASE_URL, sub_path),
- video_id, 'Downloading subtitles location', fatal=False) or '{}'
+ sub_url, video_id, 'Downloading subtitles location', fatal=False) or '{}'
subtitle_location = (self._parse_json(enc_subtitles, video_id, fatal=False) or {}).get('location')
if subtitle_location:
enc_subtitles = self._download_webpage(
- urljoin(self._BASE_URL, subtitle_location),
- video_id, 'Downloading subtitles data', fatal=False,
- headers={'Origin': 'https://animedigitalnetwork.fr'})
+ subtitle_location, video_id, 'Downloading subtitles data',
+ fatal=False, headers={'Origin': 'https://animedigitalnetwork.fr'})
if not enc_subtitles:
return None
# http://animedigitalnetwork.fr/components/com_vodvideo/videojs/adn-vjs.min.js
dec_subtitles = intlist_to_bytes(aes_cbc_decrypt(
bytes_to_intlist(compat_b64decode(enc_subtitles[24:])),
- bytes_to_intlist(binascii.unhexlify(self._K + '4b8ef13ec1872730')),
+ bytes_to_intlist(binascii.unhexlify(self._K + 'ab9f52f5baae7c72')),
bytes_to_intlist(compat_b64decode(enc_subtitles[:24]))
))
subtitles_json = self._parse_json(
@@ -117,61 +133,100 @@ Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text'''
}])
return subtitles
+ def _real_initialize(self):
+ username, password = self._get_login_info()
+ if not username:
+ return
+ try:
+ access_token = (self._download_json(
+ self._API_BASE_URL + 'authentication/login', None,
+ 'Logging in', self._LOGIN_ERR_MESSAGE, fatal=False,
+ data=urlencode_postdata({
+ 'password': password,
+ 'rememberMe': False,
+ 'source': 'Web',
+ 'username': username,
+ })) or {}).get('accessToken')
+ if access_token:
+ self._HEADERS = {'authorization': 'Bearer ' + access_token}
+ except ExtractorError as e:
+ message = None
+ if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
+ resp = self._parse_json(
+ e.cause.read().decode(), None, fatal=False) or {}
+ message = resp.get('message') or resp.get('code')
+ self.report_warning(message or self._LOGIN_ERR_MESSAGE)
+
def _real_extract(self, url):
video_id = self._match_id(url)
- webpage = self._download_webpage(url, video_id)
- player_config = self._parse_json(self._search_regex(
- r'playerConfig\s*=\s*({.+});', webpage,
- 'player config', default='{}'), video_id, fatal=False)
- if not player_config:
- config_url = urljoin(self._BASE_URL, self._search_regex(
- r'(?:id="player"|class="[^"]*adn-player-container[^"]*")[^>]+data-url="([^"]+)"',
- webpage, 'config url'))
- player_config = self._download_json(
- config_url, video_id,
- 'Downloading player config JSON metadata')['player']
-
- video_info = {}
- video_info_str = self._search_regex(
- r'videoInfo\s*=\s*({.+});', webpage,
- 'video info', fatal=False)
- if video_info_str:
- video_info = self._parse_json(
- video_info_str, video_id, fatal=False) or {}
-
- options = player_config.get('options') or {}
- metas = options.get('metas') or {}
- links = player_config.get('links') or {}
- sub_path = player_config.get('subtitles')
- error = None
- if not links:
- links_url = player_config.get('linksurl') or options['videoUrl']
- token = options['token']
- self._K = ''.join([random.choice('0123456789abcdef') for _ in range(16)])
- message = bytes_to_intlist(json.dumps({
- 'k': self._K,
- 'e': 60,
- 't': token,
- }))
+ video_base_url = self._PLAYER_BASE_URL + 'video/%s/' % video_id
+ player = self._download_json(
+ video_base_url + 'configuration', video_id,
+ 'Downloading player config JSON metadata',
+ headers=self._HEADERS)['player']
+ options = player['options']
+
+ user = options['user']
+ if not user.get('hasAccess'):
+ self.raise_login_required()
+
+ token = self._download_json(
+ user.get('refreshTokenUrl') or (self._PLAYER_BASE_URL + 'refresh/token'),
+ video_id, 'Downloading access token', headers={
+ 'x-player-refresh-token': user['refreshToken']
+ }, data=b'')['token']
+
+ links_url = try_get(options, lambda x: x['video']['url']) or (video_base_url + 'link')
+ self._K = ''.join([random.choice('0123456789abcdef') for _ in range(16)])
+ message = bytes_to_intlist(json.dumps({
+ 'k': self._K,
+ 't': token,
+ }))
+
+ # Sometimes authentication fails for no good reason, retry with
+ # a different random padding
+ links_data = None
+ for _ in range(3):
padded_message = intlist_to_bytes(pkcs1pad(message, 128))
n, e = self._RSA_KEY
encrypted_message = long_to_bytes(pow(bytes_to_long(padded_message), e, n))
authorization = base64.b64encode(encrypted_message).decode()
- links_data = self._download_json(
- urljoin(self._BASE_URL, links_url), video_id,
- 'Downloading links JSON metadata', headers={
- 'Authorization': 'Bearer ' + authorization,
- })
- links = links_data.get('links') or {}
- metas = metas or links_data.get('meta') or {}
- sub_path = sub_path or links_data.get('subtitles') or \
- 'index.php?option=com_vodapi&task=subtitles.getJSON&format=json&id=' + video_id
- sub_path += '&token=' + token
- error = links_data.get('error')
- title = metas.get('title') or video_info['title']
+
+ try:
+ links_data = self._download_json(
+ links_url, video_id, 'Downloading links JSON metadata', headers={
+ 'X-Player-Token': authorization
+ }, query={
+ 'freeWithAds': 'true',
+ 'adaptive': 'false',
+ 'withMetadata': 'true',
+ 'source': 'Web'
+ })
+ break
+ except ExtractorError as e:
+ if not isinstance(e.cause, compat_HTTPError):
+ raise e
+
+ if e.cause.code == 401:
+ # This usually goes away with a different random pkcs1pad, so retry
+ continue
+
+ error = self._parse_json(e.cause.read(), video_id)
+ message = error.get('message')
+ if e.cause.code == 403 and error.get('code') == 'player-bad-geolocation-country':
+ self.raise_geo_restricted(msg=message)
+ raise ExtractorError(message)
+ else:
+ raise ExtractorError('Giving up retrying')
+
+ links = links_data.get('links') or {}
+ metas = links_data.get('metadata') or {}
+ sub_url = (links.get('subtitles') or {}).get('all')
+ video_info = links_data.get('video') or {}
+ title = metas['title']
formats = []
- for format_id, qualities in links.items():
+ for format_id, qualities in (links.get('streaming') or {}).items():
if not isinstance(qualities, dict):
continue
for quality, load_balancer_url in qualities.items():
@@ -189,19 +244,26 @@ Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text'''
for f in m3u8_formats:
f['language'] = 'fr'
formats.extend(m3u8_formats)
- if not error:
- error = options.get('error')
- if not formats and error:
- raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True)
self._sort_formats(formats)
+ video = (self._download_json(
+ self._API_BASE_URL + 'video/%s' % video_id, video_id,
+ 'Downloading additional video metadata', fatal=False) or {}).get('video') or {}
+ show = video.get('show') or {}
+
return {
'id': video_id,
'title': title,
- 'description': strip_or_none(metas.get('summary') or video_info.get('resume')),
- 'thumbnail': video_info.get('image'),
+ 'description': strip_or_none(metas.get('summary') or video.get('summary')),
+ 'thumbnail': video_info.get('image') or player.get('image'),
'formats': formats,
- 'subtitles': self.extract_subtitles(sub_path, video_id),
- 'episode': metas.get('subtitle') or video_info.get('videoTitle'),
- 'series': video_info.get('playlistTitle'),
+ 'subtitles': self.extract_subtitles(sub_url, video_id),
+ 'episode': metas.get('subtitle') or video.get('name'),
+ 'episode_number': int_or_none(video.get('shortNumber')),
+ 'series': show.get('title'),
+ 'season_number': int_or_none(video.get('season')),
+ 'duration': int_or_none(video_info.get('duration') or video.get('duration')),
+ 'release_date': unified_strdate(video.get('releaseDate')),
+ 'average_rating': float_or_none(video.get('rating') or metas.get('rating')),
+ 'comment_count': int_or_none(video.get('commentsCount')),
}
diff --git a/youtube_dl/extractor/aenetworks.py b/youtube_dl/extractor/aenetworks.py
index 8e4963131..e55c03fd7 100644
--- a/youtube_dl/extractor/aenetworks.py
+++ b/youtube_dl/extractor/aenetworks.py
@@ -252,11 +252,11 @@ class AENetworksShowIE(AENetworksListBaseIE):
_TESTS = [{
'url': 'http://www.history.com/shows/ancient-aliens',
'info_dict': {
- 'id': 'SH012427480000',
+ 'id': 'SERIES1574',
'title': 'Ancient Aliens',
'description': 'md5:3f6d74daf2672ff3ae29ed732e37ea7f',
},
- 'playlist_mincount': 168,
+ 'playlist_mincount': 150,
}]
_RESOURCE = 'series'
_ITEMS_KEY = 'episodes'
diff --git a/youtube_dl/extractor/aljazeera.py b/youtube_dl/extractor/aljazeera.py
index c68be3134..c4f915a3c 100644
--- a/youtube_dl/extractor/aljazeera.py
+++ b/youtube_dl/extractor/aljazeera.py
@@ -1,13 +1,16 @@
from __future__ import unicode_literals
+import json
+import re
+
from .common import InfoExtractor
class AlJazeeraIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?aljazeera\.com/(?:programmes|video)/.*?/(?P<id>[^/]+)\.html'
+ _VALID_URL = r'https?://(?:www\.)?aljazeera\.com/(?P<type>program/[^/]+|(?:feature|video)s)/\d{4}/\d{1,2}/\d{1,2}/(?P<id>[^/?&#]+)'
_TESTS = [{
- 'url': 'http://www.aljazeera.com/programmes/the-slum/2014/08/deliverance-201482883754237240.html',
+ 'url': 'https://www.aljazeera.com/program/episode/2014/9/19/deliverance',
'info_dict': {
'id': '3792260579001',
'ext': 'mp4',
@@ -20,14 +23,34 @@ class AlJazeeraIE(InfoExtractor):
'add_ie': ['BrightcoveNew'],
'skip': 'Not accessible from Travis CI server',
}, {
- 'url': 'http://www.aljazeera.com/video/news/2017/05/sierra-leone-709-carat-diamond-auctioned-170511100111930.html',
+ 'url': 'https://www.aljazeera.com/videos/2017/5/11/sierra-leone-709-carat-diamond-to-be-auctioned-off',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.aljazeera.com/features/2017/8/21/transforming-pakistans-buses-into-art',
'only_matching': True,
}]
- BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/665003303001/default_default/index.html?videoId=%s'
+ BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_default/index.html?videoId=%s'
def _real_extract(self, url):
- program_name = self._match_id(url)
- webpage = self._download_webpage(url, program_name)
- brightcove_id = self._search_regex(
- r'RenderPagesVideo\(\'(.+?)\'', webpage, 'brightcove id')
- return self.url_result(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', brightcove_id)
+ post_type, name = re.match(self._VALID_URL, url).groups()
+ post_type = {
+ 'features': 'post',
+ 'program': 'episode',
+ 'videos': 'video',
+ }[post_type.split('/')[0]]
+ video = self._download_json(
+ 'https://www.aljazeera.com/graphql', name, query={
+ 'operationName': 'SingleArticleQuery',
+ 'variables': json.dumps({
+ 'name': name,
+ 'postType': post_type,
+ }),
+ }, headers={
+ 'wp-site': 'aje',
+ })['data']['article']['video']
+ video_id = video['id']
+ account_id = video.get('accountId') or '665003303001'
+ player_id = video.get('playerId') or 'BkeSH5BDb'
+ return self.url_result(
+ self.BRIGHTCOVE_URL_TEMPLATE % (account_id, player_id, video_id),
+ 'BrightcoveNew', video_id)
diff --git a/youtube_dl/extractor/americastestkitchen.py b/youtube_dl/extractor/americastestkitchen.py
index e20f00fc3..be960c0f9 100644
--- a/youtube_dl/extractor/americastestkitchen.py
+++ b/youtube_dl/extractor/americastestkitchen.py
@@ -1,13 +1,16 @@
# coding: utf-8
from __future__ import unicode_literals
+import json
import re
from .common import InfoExtractor
from ..utils import (
clean_html,
+ int_or_none,
try_get,
unified_strdate,
+ unified_timestamp,
)
@@ -22,8 +25,8 @@ class AmericasTestKitchenIE(InfoExtractor):
'ext': 'mp4',
'description': 'md5:64e606bfee910627efc4b5f050de92b3',
'thumbnail': r're:^https?://',
- 'timestamp': 1523664000,
- 'upload_date': '20180414',
+ 'timestamp': 1523318400,
+ 'upload_date': '20180410',
'release_date': '20180410',
'series': "America's Test Kitchen",
'season_number': 18,
@@ -34,6 +37,27 @@ class AmericasTestKitchenIE(InfoExtractor):
'skip_download': True,
},
}, {
+ # Metadata parsing behaves differently for newer episodes (705) as opposed to older episodes (582 above)
+ 'url': 'https://www.americastestkitchen.com/episode/705-simple-chicken-dinner',
+ 'md5': '06451608c57651e985a498e69cec17e5',
+ 'info_dict': {
+ 'id': '5fbe8c61bda2010001c6763b',
+ 'title': 'Simple Chicken Dinner',
+ 'ext': 'mp4',
+ 'description': 'md5:eb68737cc2fd4c26ca7db30139d109e7',
+ 'thumbnail': r're:^https?://',
+ 'timestamp': 1610755200,
+ 'upload_date': '20210116',
+ 'release_date': '20210116',
+ 'series': "America's Test Kitchen",
+ 'season_number': 21,
+ 'episode': 'Simple Chicken Dinner',
+ 'episode_number': 3,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
'url': 'https://www.americastestkitchen.com/videos/3420-pan-seared-salmon',
'only_matching': True,
}, {
@@ -60,7 +84,76 @@ class AmericasTestKitchenIE(InfoExtractor):
'url': 'https://player.zype.com/embed/%s.js?api_key=jZ9GUhRmxcPvX7M3SlfejB6Hle9jyHTdk2jVxG7wOHPLODgncEKVdPYBhuz9iWXQ' % video['zypeId'],
'ie_key': 'Zype',
'description': clean_html(video.get('description')),
+ 'timestamp': unified_timestamp(video.get('publishDate')),
'release_date': unified_strdate(video.get('publishDate')),
+ 'episode_number': int_or_none(episode.get('number')),
+ 'season_number': int_or_none(episode.get('season')),
'series': try_get(episode, lambda x: x['show']['title']),
'episode': episode.get('title'),
}
+
+
+class AmericasTestKitchenSeasonIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?(?P<show>americastestkitchen|cookscountry)\.com/episodes/browse/season_(?P<id>\d+)'
+ _TESTS = [{
+ # ATK Season
+ 'url': 'https://www.americastestkitchen.com/episodes/browse/season_1',
+ 'info_dict': {
+ 'id': 'season_1',
+ 'title': 'Season 1',
+ },
+ 'playlist_count': 13,
+ }, {
+ # Cooks Country Season
+ 'url': 'https://www.cookscountry.com/episodes/browse/season_12',
+ 'info_dict': {
+ 'id': 'season_12',
+ 'title': 'Season 12',
+ },
+ 'playlist_count': 13,
+ }]
+
+ def _real_extract(self, url):
+ show_name, season_number = re.match(self._VALID_URL, url).groups()
+ season_number = int(season_number)
+
+ slug = 'atk' if show_name == 'americastestkitchen' else 'cco'
+
+ season = 'Season %d' % season_number
+
+ season_search = self._download_json(
+ 'https://y1fnzxui30-dsn.algolia.net/1/indexes/everest_search_%s_season_desc_production' % slug,
+ season, headers={
+ 'Origin': 'https://www.%s.com' % show_name,
+ 'X-Algolia-API-Key': '8d504d0099ed27c1b73708d22871d805',
+ 'X-Algolia-Application-Id': 'Y1FNZXUI30',
+ }, query={
+ 'facetFilters': json.dumps([
+ 'search_season_list:' + season,
+ 'search_document_klass:episode',
+ 'search_show_slug:' + slug,
+ ]),
+ 'attributesToRetrieve': 'description,search_%s_episode_number,search_document_date,search_url,title' % slug,
+ 'attributesToHighlight': '',
+ 'hitsPerPage': 1000,
+ })
+
+ def entries():
+ for episode in (season_search.get('hits') or []):
+ search_url = episode.get('search_url')
+ if not search_url:
+ continue
+ yield {
+ '_type': 'url',
+ 'url': 'https://www.%s.com%s' % (show_name, search_url),
+ 'id': try_get(episode, lambda e: e['objectID'].split('_')[-1]),
+ 'title': episode.get('title'),
+ 'description': episode.get('description'),
+ 'timestamp': unified_timestamp(episode.get('search_document_date')),
+ 'season_number': season_number,
+ 'episode_number': int_or_none(episode.get('search_%s_episode_number' % slug)),
+ 'ie_key': AmericasTestKitchenIE.ie_key(),
+ }
+
+ return self.playlist_result(
+ entries(), 'season_%d' % season_number, season)
diff --git a/youtube_dl/extractor/amp.py b/youtube_dl/extractor/amp.py
index 7ff098cfa..24c684cad 100644
--- a/youtube_dl/extractor/amp.py
+++ b/youtube_dl/extractor/amp.py
@@ -8,6 +8,7 @@ from ..utils import (
int_or_none,
mimetype2ext,
parse_iso8601,
+ unified_timestamp,
url_or_none,
)
@@ -88,7 +89,7 @@ class AMPIE(InfoExtractor):
self._sort_formats(formats)
- timestamp = parse_iso8601(item.get('pubDate'), ' ') or parse_iso8601(item.get('dc-date'))
+ timestamp = unified_timestamp(item.get('pubDate'), ' ') or parse_iso8601(item.get('dc-date'))
return {
'id': video_id,
diff --git a/youtube_dl/extractor/animeondemand.py b/youtube_dl/extractor/animeondemand.py
index 00ce684d1..54e097d2f 100644
--- a/youtube_dl/extractor/animeondemand.py
+++ b/youtube_dl/extractor/animeondemand.py
@@ -116,8 +116,6 @@ class AnimeOnDemandIE(InfoExtractor):
r'(?s)<div[^>]+itemprop="description"[^>]*>(.+?)</div>',
webpage, 'anime description', default=None)
- entries = []
-
def extract_info(html, video_id, num=None):
title, description = [None] * 2
formats = []
@@ -233,7 +231,7 @@ class AnimeOnDemandIE(InfoExtractor):
self._sort_formats(info['formats'])
f = common_info.copy()
f.update(info)
- entries.append(f)
+ yield f
# Extract teaser/trailer only when full episode is not available
if not info['formats']:
@@ -247,7 +245,7 @@ class AnimeOnDemandIE(InfoExtractor):
'title': m.group('title'),
'url': urljoin(url, m.group('href')),
})
- entries.append(f)
+ yield f
def extract_episodes(html):
for num, episode_html in enumerate(re.findall(
@@ -275,7 +273,8 @@ class AnimeOnDemandIE(InfoExtractor):
'episode_number': episode_number,
}
- extract_entries(episode_html, video_id, common_info)
+ for e in extract_entries(episode_html, video_id, common_info):
+ yield e
def extract_film(html, video_id):
common_info = {
@@ -283,11 +282,18 @@ class AnimeOnDemandIE(InfoExtractor):
'title': anime_title,
'description': anime_description,
}
- extract_entries(html, video_id, common_info)
+ for e in extract_entries(html, video_id, common_info):
+ yield e
- extract_episodes(webpage)
+ def entries():
+ has_episodes = False
+ for e in extract_episodes(webpage):
+ has_episodes = True
+ yield e
- if not entries:
- extract_film(webpage, anime_id)
+ if not has_episodes:
+ for e in extract_film(webpage, anime_id):
+ yield e
- return self.playlist_result(entries, anime_id, anime_title, anime_description)
+ return self.playlist_result(
+ entries(), anime_id, anime_title, anime_description)
diff --git a/youtube_dl/extractor/aol.py b/youtube_dl/extractor/aol.py
index e87994a6a..f6ecb8438 100644
--- a/youtube_dl/extractor/aol.py
+++ b/youtube_dl/extractor/aol.py
@@ -3,7 +3,7 @@ from __future__ import unicode_literals
import re
-from .common import InfoExtractor
+from .yahoo import YahooIE
from ..compat import (
compat_parse_qs,
compat_urllib_parse_urlparse,
@@ -15,9 +15,9 @@ from ..utils import (
)
-class AolIE(InfoExtractor):
+class AolIE(YahooIE):
IE_NAME = 'aol.com'
- _VALID_URL = r'(?:aol-video:|https?://(?:www\.)?aol\.(?:com|ca|co\.uk|de|jp)/video/(?:[^/]+/)*)(?P<id>[0-9a-f]+)'
+ _VALID_URL = r'(?:aol-video:|https?://(?:www\.)?aol\.(?:com|ca|co\.uk|de|jp)/video/(?:[^/]+/)*)(?P<id>\d{9}|[0-9a-f]{24}|[0-9a-f]{8}-(?:[0-9a-f]{4}-){3}[0-9a-f]{12})'
_TESTS = [{
# video with 5min ID
@@ -76,10 +76,16 @@ class AolIE(InfoExtractor):
}, {
'url': 'https://www.aol.jp/video/playlist/5a28e936a1334d000137da0c/5a28f3151e642219fde19831/',
'only_matching': True,
+ }, {
+ # Yahoo video
+ 'url': 'https://www.aol.com/video/play/991e6700-ac02-11ea-99ff-357400036f61/24bbc846-3e30-3c46-915e-fe8ccd7fcc46/',
+ 'only_matching': True,
}]
def _real_extract(self, url):
video_id = self._match_id(url)
+ if '-' in video_id:
+ return self._extract_yahoo_video(video_id, 'us')
response = self._download_json(
'https://feedapi.b2c.on.aol.com/v1.0/app/videos/aolon/%s/details' % video_id,
diff --git a/youtube_dl/extractor/apa.py b/youtube_dl/extractor/apa.py
index 98ccdaa4a..cbc1c0ecb 100644
--- a/youtube_dl/extractor/apa.py
+++ b/youtube_dl/extractor/apa.py
@@ -6,25 +6,21 @@ import re
from .common import InfoExtractor
from ..utils import (
determine_ext,
- js_to_json,
+ int_or_none,
url_or_none,
)
class APAIE(InfoExtractor):
- _VALID_URL = r'https?://[^/]+\.apa\.at/embed/(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'
+ _VALID_URL = r'(?P<base_url>https?://[^/]+\.apa\.at)/embed/(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'
_TESTS = [{
'url': 'http://uvp.apa.at/embed/293f6d17-692a-44e3-9fd5-7b178f3a1029',
'md5': '2b12292faeb0a7d930c778c7a5b4759b',
'info_dict': {
- 'id': 'jjv85FdZ',
+ 'id': '293f6d17-692a-44e3-9fd5-7b178f3a1029',
'ext': 'mp4',
- 'title': '"Blau ist mysteriös": Die Blue Man Group im Interview',
- 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
+ 'title': '293f6d17-692a-44e3-9fd5-7b178f3a1029',
'thumbnail': r're:^https?://.*\.jpg$',
- 'duration': 254,
- 'timestamp': 1519211149,
- 'upload_date': '20180221',
},
}, {
'url': 'https://uvp-apapublisher.sf.apa.at/embed/2f94e9e6-d945-4db2-9548-f9a41ebf7b78',
@@ -46,9 +42,11 @@ class APAIE(InfoExtractor):
webpage)]
def _real_extract(self, url):
- video_id = self._match_id(url)
+ mobj = re.match(self._VALID_URL, url)
+ video_id, base_url = mobj.group('id', 'base_url')
- webpage = self._download_webpage(url, video_id)
+ webpage = self._download_webpage(
+ '%s/player/%s' % (base_url, video_id), video_id)
jwplatform_id = self._search_regex(
r'media[iI]d\s*:\s*["\'](?P<id>[a-zA-Z0-9]{8})', webpage,
@@ -59,16 +57,18 @@ class APAIE(InfoExtractor):
'jwplatform:' + jwplatform_id, ie='JWPlatform',
video_id=video_id)
- sources = self._parse_json(
- self._search_regex(
- r'sources\s*=\s*(\[.+?\])\s*;', webpage, 'sources'),
- video_id, transform_source=js_to_json)
+ def extract(field, name=None):
+ return self._search_regex(
+ r'\b%s["\']\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1' % field,
+ webpage, name or field, default=None, group='value')
+
+ title = extract('title') or video_id
+ description = extract('description')
+ thumbnail = extract('poster', 'thumbnail')
formats = []
- for source in sources:
- if not isinstance(source, dict):
- continue
- source_url = url_or_none(source.get('file'))
+ for format_id in ('hls', 'progressive'):
+ source_url = url_or_none(extract(format_id))
if not source_url:
continue
ext = determine_ext(source_url)
@@ -77,18 +77,19 @@ class APAIE(InfoExtractor):
source_url, video_id, 'mp4', entry_protocol='m3u8_native',
m3u8_id='hls', fatal=False))
else:
+ height = int_or_none(self._search_regex(
+ r'(\d+)\.mp4', source_url, 'height', default=None))
formats.append({
'url': source_url,
+ 'format_id': format_id,
+ 'height': height,
})
self._sort_formats(formats)
- thumbnail = self._search_regex(
- r'image\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage,
- 'thumbnail', fatal=False, group='url')
-
return {
'id': video_id,
- 'title': video_id,
+ 'title': title,
+ 'description': description,
'thumbnail': thumbnail,
'formats': formats,
}
diff --git a/youtube_dl/extractor/appleconnect.py b/youtube_dl/extractor/appleconnect.py
index a84b8b1eb..494f8330c 100644
--- a/youtube_dl/extractor/appleconnect.py
+++ b/youtube_dl/extractor/appleconnect.py
@@ -9,10 +9,10 @@ from ..utils import (
class AppleConnectIE(InfoExtractor):
- _VALID_URL = r'https?://itunes\.apple\.com/\w{0,2}/?post/idsa\.(?P<id>[\w-]+)'
- _TEST = {
+ _VALID_URL = r'https?://itunes\.apple\.com/\w{0,2}/?post/(?:id)?sa\.(?P<id>[\w-]+)'
+ _TESTS = [{
'url': 'https://itunes.apple.com/us/post/idsa.4ab17a39-2720-11e5-96c5-a5b38f6c42d3',
- 'md5': 'e7c38568a01ea45402570e6029206723',
+ 'md5': 'c1d41f72c8bcaf222e089434619316e4',
'info_dict': {
'id': '4ab17a39-2720-11e5-96c5-a5b38f6c42d3',
'ext': 'm4v',
@@ -22,7 +22,10 @@ class AppleConnectIE(InfoExtractor):
'upload_date': '20150710',
'timestamp': 1436545535,
},
- }
+ }, {
+ 'url': 'https://itunes.apple.com/us/post/sa.0fe0229f-2457-11e5-9f40-1bb645f2d5d9',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
video_id = self._match_id(url)
@@ -36,7 +39,7 @@ class AppleConnectIE(InfoExtractor):
video_data = self._parse_json(video_json, video_id)
timestamp = str_to_int(self._html_search_regex(r'data-timestamp="(\d+)"', webpage, 'timestamp'))
- like_count = str_to_int(self._html_search_regex(r'(\d+) Loves', webpage, 'like count'))
+ like_count = str_to_int(self._html_search_regex(r'(\d+) Loves', webpage, 'like count', default=None))
return {
'id': video_id,
diff --git a/youtube_dl/extractor/applepodcasts.py b/youtube_dl/extractor/applepodcasts.py
index 95758fece..6a74de758 100644
--- a/youtube_dl/extractor/applepodcasts.py
+++ b/youtube_dl/extractor/applepodcasts.py
@@ -42,6 +42,7 @@ class ApplePodcastsIE(InfoExtractor):
ember_data = self._parse_json(self._search_regex(
r'id="shoebox-ember-data-store"[^>]*>\s*({.+?})\s*<',
webpage, 'ember data'), episode_id)
+ ember_data = ember_data.get(episode_id) or ember_data
episode = ember_data['data']['attributes']
description = episode.get('description') or {}
diff --git a/youtube_dl/extractor/archiveorg.py b/youtube_dl/extractor/archiveorg.py
index c79c58e82..e42ed5e79 100644
--- a/youtube_dl/extractor/archiveorg.py
+++ b/youtube_dl/extractor/archiveorg.py
@@ -2,15 +2,17 @@ from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
- unified_strdate,
clean_html,
+ extract_attributes,
+ unified_strdate,
+ unified_timestamp,
)
class ArchiveOrgIE(InfoExtractor):
IE_NAME = 'archive.org'
IE_DESC = 'archive.org videos'
- _VALID_URL = r'https?://(?:www\.)?archive\.org/(?:details|embed)/(?P<id>[^/?#]+)(?:[?].*)?$'
+ _VALID_URL = r'https?://(?:www\.)?archive\.org/(?:details|embed)/(?P<id>[^/?#&]+)'
_TESTS = [{
'url': 'http://archive.org/details/XD300-23_68HighlightsAResearchCntAugHumanIntellect',
'md5': '8af1d4cf447933ed3c7f4871162602db',
@@ -19,8 +21,11 @@ class ArchiveOrgIE(InfoExtractor):
'ext': 'ogg',
'title': '1968 Demo - FJCC Conference Presentation Reel #1',
'description': 'md5:da45c349df039f1cc8075268eb1b5c25',
- 'upload_date': '19681210',
- 'uploader': 'SRI International'
+ 'creator': 'SRI International',
+ 'release_date': '19681210',
+ 'uploader': 'SRI International',
+ 'timestamp': 1268695290,
+ 'upload_date': '20100315',
}
}, {
'url': 'https://archive.org/details/Cops1922',
@@ -29,22 +34,43 @@ class ArchiveOrgIE(InfoExtractor):
'id': 'Cops1922',
'ext': 'mp4',
'title': 'Buster Keaton\'s "Cops" (1922)',
- 'description': 'md5:89e7c77bf5d965dd5c0372cfb49470f6',
+ 'description': 'md5:43a603fd6c5b4b90d12a96b921212b9c',
+ 'timestamp': 1387699629,
+ 'upload_date': '20131222',
}
}, {
'url': 'http://archive.org/embed/XD300-23_68HighlightsAResearchCntAugHumanIntellect',
'only_matching': True,
+ }, {
+ 'url': 'https://archive.org/details/MSNBCW_20131125_040000_To_Catch_a_Predator/',
+ 'only_matching': True,
}]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(
'http://archive.org/embed/' + video_id, video_id)
- jwplayer_playlist = self._parse_json(self._search_regex(
- r"(?s)Play\('[^']+'\s*,\s*(\[.+\])\s*,\s*{.*?}\)",
- webpage, 'jwplayer playlist'), video_id)
- info = self._parse_jwplayer_data(
- {'playlist': jwplayer_playlist}, video_id, base_url=url)
+
+ playlist = None
+ play8 = self._search_regex(
+ r'(<[^>]+\bclass=["\']js-play8-playlist[^>]+>)', webpage,
+ 'playlist', default=None)
+ if play8:
+ attrs = extract_attributes(play8)
+ playlist = attrs.get('value')
+ if not playlist:
+ # Old jwplayer fallback
+ playlist = self._search_regex(
+ r"(?s)Play\('[^']+'\s*,\s*(\[.+\])\s*,\s*{.*?}\)",
+ webpage, 'jwplayer playlist', default='[]')
+ jwplayer_playlist = self._parse_json(playlist, video_id, fatal=False)
+ if jwplayer_playlist:
+ info = self._parse_jwplayer_data(
+ {'playlist': jwplayer_playlist}, video_id, base_url=url)
+ else:
+ # HTML5 media fallback
+ info = self._parse_html5_media_entries(url, webpage, video_id)[0]
+ info['id'] = video_id
def get_optional(metadata, field):
return metadata.get(field, [None])[0]
@@ -58,8 +84,12 @@ class ArchiveOrgIE(InfoExtractor):
'description': clean_html(get_optional(metadata, 'description')),
})
if info.get('_type') != 'playlist':
+ creator = get_optional(metadata, 'creator')
info.update({
- 'uploader': get_optional(metadata, 'creator'),
- 'upload_date': unified_strdate(get_optional(metadata, 'date')),
+ 'creator': creator,
+ 'release_date': unified_strdate(get_optional(metadata, 'date')),
+ 'uploader': get_optional(metadata, 'publisher') or creator,
+ 'timestamp': unified_timestamp(get_optional(metadata, 'publicdate')),
+ 'language': get_optional(metadata, 'language'),
})
return info
diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py
index 5b7b2dd6d..d45a9fe52 100644
--- a/youtube_dl/extractor/ard.py
+++ b/youtube_dl/extractor/ard.py
@@ -187,13 +187,13 @@ class ARDMediathekIE(ARDMediathekBaseIE):
if doc.tag == 'rss':
return GenericIE()._extract_rss(url, video_id, doc)
- title = self._html_search_regex(
+ title = self._og_search_title(webpage, default=None) or self._html_search_regex(
[r'<h1(?:\s+class="boxTopHeadline")?>(.*?)</h1>',
r'<meta name="dcterms\.title" content="(.*?)"/>',
r'<h4 class="headline">(.*?)</h4>',
r'<title[^>]*>(.*?)</title>'],
webpage, 'title')
- description = self._html_search_meta(
+ description = self._og_search_description(webpage, default=None) or self._html_search_meta(
'dcterms.abstract', webpage, 'description', default=None)
if description is None:
description = self._html_search_meta(
@@ -249,31 +249,40 @@ class ARDMediathekIE(ARDMediathekBaseIE):
class ARDIE(InfoExtractor):
- _VALID_URL = r'(?P<mainurl>https?://(www\.)?daserste\.de/[^?#]+/videos(?:extern)?/(?P<display_id>[^/?#]+)-(?P<id>[0-9]+))\.html'
+ _VALID_URL = r'(?P<mainurl>https?://(?:www\.)?daserste\.de/(?:[^/?#&]+/)+(?P<id>[^/?#&]+))\.html'
_TESTS = [{
- # available till 14.02.2019
- 'url': 'http://www.daserste.de/information/talk/maischberger/videos/das-groko-drama-zerlegen-sich-die-volksparteien-video-102.html',
- 'md5': '8e4ec85f31be7c7fc08a26cdbc5a1f49',
+ # available till 7.01.2022
+ 'url': 'https://www.daserste.de/information/talk/maischberger/videos/maischberger-die-woche-video100.html',
+ 'md5': '867d8aa39eeaf6d76407c5ad1bb0d4c1',
'info_dict': {
- 'display_id': 'das-groko-drama-zerlegen-sich-die-volksparteien-video',
- 'id': '102',
+ 'id': 'maischberger-die-woche-video100',
+ 'display_id': 'maischberger-die-woche-video100',
'ext': 'mp4',
- 'duration': 4435.0,
- 'title': 'Das GroKo-Drama: Zerlegen sich die Volksparteien?',
- 'upload_date': '20180214',
+ 'duration': 3687.0,
+ 'title': 'maischberger. die woche vom 7. Januar 2021',
+ 'upload_date': '20210107',
'thumbnail': r're:^https?://.*\.jpg$',
},
}, {
- 'url': 'https://www.daserste.de/information/reportage-dokumentation/erlebnis-erde/videosextern/woelfe-und-herdenschutzhunde-ungleiche-brueder-102.html',
+ 'url': 'https://www.daserste.de/information/politik-weltgeschehen/morgenmagazin/videosextern/dominik-kahun-aus-der-nhl-direkt-zur-weltmeisterschaft-100.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.daserste.de/information/nachrichten-wetter/tagesthemen/videosextern/tagesthemen-17736.html',
'only_matching': True,
}, {
'url': 'http://www.daserste.de/information/reportage-dokumentation/dokus/videos/die-story-im-ersten-mission-unter-falscher-flagge-100.html',
'only_matching': True,
+ }, {
+ 'url': 'https://www.daserste.de/unterhaltung/serie/in-aller-freundschaft-die-jungen-aerzte/Drehpause-100.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.daserste.de/unterhaltung/film/filmmittwoch-im-ersten/videos/making-ofwendezeit-video-100.html',
+ 'only_matching': True,
}]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- display_id = mobj.group('display_id')
+ display_id = mobj.group('id')
player_url = mobj.group('mainurl') + '~playerXml.xml'
doc = self._download_xml(player_url, display_id)
@@ -284,25 +293,47 @@ class ARDIE(InfoExtractor):
formats = []
for a in video_node.findall('.//asset'):
+ file_name = xpath_text(a, './fileName', default=None)
+ if not file_name:
+ continue
+ format_type = a.attrib.get('type')
+ format_url = url_or_none(file_name)
+ if format_url:
+ ext = determine_ext(file_name)
+ if ext == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ format_url, display_id, 'mp4', entry_protocol='m3u8_native',
+ m3u8_id=format_type or 'hls', fatal=False))
+ continue
+ elif ext == 'f4m':
+ formats.extend(self._extract_f4m_formats(
+ update_url_query(format_url, {'hdcore': '3.7.0'}),
+ display_id, f4m_id=format_type or 'hds', fatal=False))
+ continue
f = {
- 'format_id': a.attrib['type'],
- 'width': int_or_none(a.find('./frameWidth').text),
- 'height': int_or_none(a.find('./frameHeight').text),
- 'vbr': int_or_none(a.find('./bitrateVideo').text),
- 'abr': int_or_none(a.find('./bitrateAudio').text),
- 'vcodec': a.find('./codecVideo').text,
- 'tbr': int_or_none(a.find('./totalBitrate').text),
+ 'format_id': format_type,
+ 'width': int_or_none(xpath_text(a, './frameWidth')),
+ 'height': int_or_none(xpath_text(a, './frameHeight')),
+ 'vbr': int_or_none(xpath_text(a, './bitrateVideo')),
+ 'abr': int_or_none(xpath_text(a, './bitrateAudio')),
+ 'vcodec': xpath_text(a, './codecVideo'),
+ 'tbr': int_or_none(xpath_text(a, './totalBitrate')),
}
- if a.find('./serverPrefix').text:
- f['url'] = a.find('./serverPrefix').text
- f['playpath'] = a.find('./fileName').text
+ server_prefix = xpath_text(a, './serverPrefix', default=None)
+ if server_prefix:
+ f.update({
+ 'url': server_prefix,
+ 'playpath': file_name,
+ })
else:
- f['url'] = a.find('./fileName').text
+ if not format_url:
+ continue
+ f['url'] = format_url
formats.append(f)
self._sort_formats(formats)
return {
- 'id': mobj.group('id'),
+ 'id': xpath_text(video_node, './videoId', default=display_id),
'formats': formats,
'display_id': display_id,
'title': video_node.find('./title').text,
@@ -313,19 +344,19 @@ class ARDIE(InfoExtractor):
class ARDBetaMediathekIE(ARDMediathekBaseIE):
- _VALID_URL = r'https://(?:(?:beta|www)\.)?ardmediathek\.de/(?P<client>[^/]+)/(?:player|live|video)/(?P<display_id>(?:[^/]+/)*)(?P<video_id>[a-zA-Z0-9]+)'
+ _VALID_URL = r'https://(?:(?:beta|www)\.)?ardmediathek\.de/(?:[^/]+/)?(?:player|live|video)/(?:[^/]+/)*(?P<id>Y3JpZDovL[a-zA-Z0-9]+)'
_TESTS = [{
- 'url': 'https://ardmediathek.de/ard/video/die-robuste-roswita/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE',
- 'md5': 'dfdc87d2e7e09d073d5a80770a9ce88f',
+ 'url': 'https://www.ardmediathek.de/mdr/video/die-robuste-roswita/Y3JpZDovL21kci5kZS9iZWl0cmFnL2Ntcy84MWMxN2MzZC0wMjkxLTRmMzUtODk4ZS0wYzhlOWQxODE2NGI/',
+ 'md5': 'a1dc75a39c61601b980648f7c9f9f71d',
'info_dict': {
'display_id': 'die-robuste-roswita',
- 'id': '70153354',
+ 'id': '78566716',
'title': 'Die robuste Roswita',
- 'description': r're:^Der Mord.*trüber ist als die Ilm.',
+ 'description': r're:^Der Mord.*totgeglaubte Ehefrau Roswita',
'duration': 5316,
- 'thumbnail': 'https://img.ardmediathek.de/standard/00/70/15/33/90/-1852531467/16x9/960?mandant=ard',
- 'timestamp': 1577047500,
- 'upload_date': '20191222',
+ 'thumbnail': 'https://img.ardmediathek.de/standard/00/78/56/67/84/575672121/16x9/960?mandant=ard',
+ 'timestamp': 1596658200,
+ 'upload_date': '20200805',
'ext': 'mp4',
},
}, {
@@ -343,22 +374,22 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
}, {
'url': 'https://www.ardmediathek.de/swr/live/Y3JpZDovL3N3ci5kZS8xMzQ4MTA0Mg',
'only_matching': True,
+ }, {
+ 'url': 'https://www.ardmediathek.de/video/coronavirus-update-ndr-info/astrazeneca-kurz-lockdown-und-pims-syndrom-81/ndr/Y3JpZDovL25kci5kZS84NzE0M2FjNi0wMWEwLTQ5ODEtOTE5NS1mOGZhNzdhOTFmOTI/',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.ardmediathek.de/ard/player/Y3JpZDovL3dkci5kZS9CZWl0cmFnLWQ2NDJjYWEzLTMwZWYtNGI4NS1iMTI2LTU1N2UxYTcxOGIzOQ/tatort-duo-koeln-leipzig-ihr-kinderlein-kommet',
+ 'only_matching': True,
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('video_id')
- display_id = mobj.group('display_id')
- if display_id:
- display_id = display_id.rstrip('/')
- if not display_id:
- display_id = video_id
+ video_id = self._match_id(url)
player_page = self._download_json(
'https://api.ardmediathek.de/public-gateway',
- display_id, data=json.dumps({
+ video_id, data=json.dumps({
'query': '''{
- playerPage(client:"%s", clipId: "%s") {
+ playerPage(client: "ard", clipId: "%s") {
blockedByFsk
broadcastedOn
maturityContentRating
@@ -388,7 +419,7 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
}
}
}
-}''' % (mobj.group('client'), video_id),
+}''' % video_id,
}).encode(), headers={
'Content-Type': 'application/json'
})['data']['playerPage']
@@ -413,7 +444,6 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE):
r'\(FSK\s*(\d+)\)\s*$', description, 'age limit', default=None))
info.update({
'age_limit': age_limit,
- 'display_id': display_id,
'title': title,
'description': description,
'timestamp': unified_timestamp(player_page.get('broadcastedOn')),
diff --git a/youtube_dl/extractor/arnes.py b/youtube_dl/extractor/arnes.py
new file mode 100644
index 000000000..c0032fcab
--- /dev/null
+++ b/youtube_dl/extractor/arnes.py
@@ -0,0 +1,101 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import (
+ compat_parse_qs,
+ compat_urllib_parse_urlparse,
+)
+from ..utils import (
+ float_or_none,
+ int_or_none,
+ parse_iso8601,
+ remove_start,
+)
+
+
+class ArnesIE(InfoExtractor):
+ IE_NAME = 'video.arnes.si'
+ IE_DESC = 'Arnes Video'
+ _VALID_URL = r'https?://video\.arnes\.si/(?:[a-z]{2}/)?(?:watch|embed|api/(?:asset|public/video))/(?P<id>[0-9a-zA-Z]{12})'
+ _TESTS = [{
+ 'url': 'https://video.arnes.si/watch/a1qrWTOQfVoU?t=10',
+ 'md5': '4d0f4d0a03571b33e1efac25fd4a065d',
+ 'info_dict': {
+ 'id': 'a1qrWTOQfVoU',
+ 'ext': 'mp4',
+ 'title': 'Linearna neodvisnost, definicija',
+ 'description': 'Linearna neodvisnost, definicija',
+ 'license': 'PRIVATE',
+ 'creator': 'Polona Oblak',
+ 'timestamp': 1585063725,
+ 'upload_date': '20200324',
+ 'channel': 'Polona Oblak',
+ 'channel_id': 'q6pc04hw24cj',
+ 'channel_url': 'https://video.arnes.si/?channel=q6pc04hw24cj',
+ 'duration': 596.75,
+ 'view_count': int,
+ 'tags': ['linearna_algebra'],
+ 'start_time': 10,
+ }
+ }, {
+ 'url': 'https://video.arnes.si/api/asset/s1YjnV7hadlC/play.mp4',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://video.arnes.si/embed/s1YjnV7hadlC',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://video.arnes.si/en/watch/s1YjnV7hadlC',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://video.arnes.si/embed/s1YjnV7hadlC?t=123&hideRelated=1',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://video.arnes.si/api/public/video/s1YjnV7hadlC',
+ 'only_matching': True,
+ }]
+ _BASE_URL = 'https://video.arnes.si'
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ video = self._download_json(
+ self._BASE_URL + '/api/public/video/' + video_id, video_id)['data']
+ title = video['title']
+
+ formats = []
+ for media in (video.get('media') or []):
+ media_url = media.get('url')
+ if not media_url:
+ continue
+ formats.append({
+ 'url': self._BASE_URL + media_url,
+ 'format_id': remove_start(media.get('format'), 'FORMAT_'),
+ 'format_note': media.get('formatTranslation'),
+ 'width': int_or_none(media.get('width')),
+ 'height': int_or_none(media.get('height')),
+ })
+ self._sort_formats(formats)
+
+ channel = video.get('channel') or {}
+ channel_id = channel.get('url')
+ thumbnail = video.get('thumbnailUrl')
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'formats': formats,
+ 'thumbnail': self._BASE_URL + thumbnail,
+ 'description': video.get('description'),
+ 'license': video.get('license'),
+ 'creator': video.get('author'),
+ 'timestamp': parse_iso8601(video.get('creationTime')),
+ 'channel': channel.get('name'),
+ 'channel_id': channel_id,
+ 'channel_url': self._BASE_URL + '/?channel=' + channel_id if channel_id else None,
+ 'duration': float_or_none(video.get('duration'), 1000),
+ 'view_count': int_or_none(video.get('views')),
+ 'tags': video.get('hashtags'),
+ 'start_time': int_or_none(compat_parse_qs(
+ compat_urllib_parse_urlparse(url).query).get('t', [None])[0]),
+ }
diff --git a/youtube_dl/extractor/awaan.py b/youtube_dl/extractor/awaan.py
index a2603bbff..3a7700cd4 100644
--- a/youtube_dl/extractor/awaan.py
+++ b/youtube_dl/extractor/awaan.py
@@ -48,6 +48,7 @@ class AWAANBaseIE(InfoExtractor):
'duration': int_or_none(video_data.get('duration')),
'timestamp': parse_iso8601(video_data.get('create_time'), ' '),
'is_live': is_live,
+ 'uploader_id': video_data.get('user_id'),
}
@@ -107,6 +108,7 @@ class AWAANLiveIE(AWAANBaseIE):
'title': 're:Dubai Al Oula [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
'upload_date': '20150107',
'timestamp': 1420588800,
+ 'uploader_id': '71',
},
'params': {
# m3u8 download
diff --git a/youtube_dl/extractor/azmedien.py b/youtube_dl/extractor/azmedien.py
index b1e20def5..930266990 100644
--- a/youtube_dl/extractor/azmedien.py
+++ b/youtube_dl/extractor/azmedien.py
@@ -47,7 +47,7 @@ class AZMedienIE(InfoExtractor):
'url': 'https://www.telebaern.tv/telebaern-news/montag-1-oktober-2018-ganze-sendung-133531189#video=0_7xjo9lf1',
'only_matching': True
}]
- _API_TEMPL = 'https://www.%s/api/pub/gql/%s/NewsArticleTeaser/cb9f2f81ed22e9b47f4ca64ea3cc5a5d13e88d1d'
+ _API_TEMPL = 'https://www.%s/api/pub/gql/%s/NewsArticleTeaser/a4016f65fe62b81dc6664dd9f4910e4ab40383be'
_PARTNER_ID = '1719221'
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/bandaichannel.py b/youtube_dl/extractor/bandaichannel.py
new file mode 100644
index 000000000..d67285913
--- /dev/null
+++ b/youtube_dl/extractor/bandaichannel.py
@@ -0,0 +1,37 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .brightcove import BrightcoveNewIE
+from ..utils import extract_attributes
+
+
+class BandaiChannelIE(BrightcoveNewIE):
+ IE_NAME = 'bandaichannel'
+ _VALID_URL = r'https?://(?:www\.)?b-ch\.com/titles/(?P<id>\d+/\d+)'
+ _TESTS = [{
+ 'url': 'https://www.b-ch.com/titles/514/001',
+ 'md5': 'a0f2d787baa5729bed71108257f613a4',
+ 'info_dict': {
+ 'id': '6128044564001',
+ 'ext': 'mp4',
+ 'title': 'メタルファイターMIKU 第1話',
+ 'timestamp': 1580354056,
+ 'uploader_id': '5797077852001',
+ 'upload_date': '20200130',
+ 'duration': 1387.733,
+ },
+ 'params': {
+ 'format': 'bestvideo',
+ 'skip_download': True,
+ },
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+ attrs = extract_attributes(self._search_regex(
+ r'(<video-js[^>]+\bid="bcplayer"[^>]*>)', webpage, 'player'))
+ bc = self._download_json(
+ 'https://pbifcd.b-ch.com/v1/playbackinfo/ST/70/' + attrs['data-info'],
+ video_id, headers={'X-API-KEY': attrs['data-auth'].strip()})['bc']
+ return self._parse_brightcove_metadata(bc, bc['id'])
diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py
index 69e673a26..006aab3b4 100644
--- a/youtube_dl/extractor/bandcamp.py
+++ b/youtube_dl/extractor/bandcamp.py
@@ -49,6 +49,7 @@ class BandcampIE(InfoExtractor):
'uploader': 'Ben Prunty',
'timestamp': 1396508491,
'upload_date': '20140403',
+ 'release_timestamp': 1396483200,
'release_date': '20140403',
'duration': 260.877,
'track': 'Lanius (Battle)',
@@ -69,6 +70,7 @@ class BandcampIE(InfoExtractor):
'uploader': 'Mastodon',
'timestamp': 1322005399,
'upload_date': '20111122',
+ 'release_timestamp': 1076112000,
'release_date': '20040207',
'duration': 120.79,
'track': 'Hail to Fire',
@@ -197,7 +199,7 @@ class BandcampIE(InfoExtractor):
'thumbnail': thumbnail,
'uploader': artist,
'timestamp': timestamp,
- 'release_date': unified_strdate(tralbum.get('album_release_date')),
+ 'release_timestamp': unified_timestamp(tralbum.get('album_release_date')),
'duration': duration,
'track': track,
'track_number': track_number,
diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py
index b4daee54e..247d982ce 100644
--- a/youtube_dl/extractor/bbc.py
+++ b/youtube_dl/extractor/bbc.py
@@ -1,31 +1,39 @@
# coding: utf-8
from __future__ import unicode_literals
+import functools
import itertools
+import json
import re
from .common import InfoExtractor
+from ..compat import (
+ compat_etree_Element,
+ compat_HTTPError,
+ compat_parse_qs,
+ compat_str,
+ compat_urllib_parse_urlparse,
+ compat_urlparse,
+)
from ..utils import (
+ ExtractorError,
+ OnDemandPagedList,
clean_html,
dict_get,
- ExtractorError,
float_or_none,
get_element_by_class,
int_or_none,
js_to_json,
parse_duration,
parse_iso8601,
+ strip_or_none,
try_get,
unescapeHTML,
+ unified_timestamp,
url_or_none,
urlencode_postdata,
urljoin,
)
-from ..compat import (
- compat_etree_Element,
- compat_HTTPError,
- compat_urlparse,
-)
class BBCCoUkIE(InfoExtractor):
@@ -756,8 +764,17 @@ class BBCIE(BBCCoUkIE):
'only_matching': True,
}, {
# custom redirection to www.bbc.com
+ # also, video with window.__INITIAL_DATA__
'url': 'http://www.bbc.co.uk/news/science-environment-33661876',
- 'only_matching': True,
+ 'info_dict': {
+ 'id': 'p02xzws1',
+ 'ext': 'mp4',
+ 'title': "Pluto may have 'nitrogen glaciers'",
+ 'description': 'md5:6a95b593f528d7a5f2605221bc56912f',
+ 'thumbnail': r're:https?://.+/.+\.jpg',
+ 'timestamp': 1437785037,
+ 'upload_date': '20150725',
+ },
}, {
# single video article embedded with data-media-vpid
'url': 'http://www.bbc.co.uk/sport/rowing/35908187',
@@ -793,11 +810,25 @@ class BBCIE(BBCCoUkIE):
'description': 'Learn English words and phrases from this story',
},
'add_ie': [BBCCoUkIE.ie_key()],
+ }, {
+ # BBC Reel
+ 'url': 'https://www.bbc.com/reel/video/p07c6sb6/how-positive-thinking-is-harming-your-happiness',
+ 'info_dict': {
+ 'id': 'p07c6sb9',
+ 'ext': 'mp4',
+ 'title': 'How positive thinking is harming your happiness',
+ 'alt_title': 'The downsides of positive thinking',
+ 'description': 'md5:fad74b31da60d83b8265954ee42d85b4',
+ 'duration': 235,
+ 'thumbnail': r're:https?://.+/p07c9dsr.jpg',
+ 'upload_date': '20190604',
+ 'categories': ['Psychology'],
+ },
}]
@classmethod
def suitable(cls, url):
- EXCLUDE_IE = (BBCCoUkIE, BBCCoUkArticleIE, BBCCoUkIPlayerPlaylistIE, BBCCoUkPlaylistIE)
+ EXCLUDE_IE = (BBCCoUkIE, BBCCoUkArticleIE, BBCCoUkIPlayerEpisodesIE, BBCCoUkIPlayerGroupIE, BBCCoUkPlaylistIE)
return (False if any(ie.suitable(url) for ie in EXCLUDE_IE)
else super(BBCIE, cls).suitable(url))
@@ -929,7 +960,7 @@ class BBCIE(BBCCoUkIE):
else:
entry['title'] = info['title']
entry['formats'].extend(info['formats'])
- except Exception as e:
+ except ExtractorError as e:
# Some playlist URL may fail with 500, at the same time
# the other one may work fine (e.g.
# http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
@@ -980,6 +1011,37 @@ class BBCIE(BBCCoUkIE):
'subtitles': subtitles,
}
+ # bbc reel (e.g. https://www.bbc.com/reel/video/p07c6sb6/how-positive-thinking-is-harming-your-happiness)
+ initial_data = self._parse_json(self._html_search_regex(
+ r'<script[^>]+id=(["\'])initial-data\1[^>]+data-json=(["\'])(?P<json>(?:(?!\2).)+)',
+ webpage, 'initial data', default='{}', group='json'), playlist_id, fatal=False)
+ if initial_data:
+ init_data = try_get(
+ initial_data, lambda x: x['initData']['items'][0], dict) or {}
+ smp_data = init_data.get('smpData') or {}
+ clip_data = try_get(smp_data, lambda x: x['items'][0], dict) or {}
+ version_id = clip_data.get('versionID')
+ if version_id:
+ title = smp_data['title']
+ formats, subtitles = self._download_media_selector(version_id)
+ self._sort_formats(formats)
+ image_url = smp_data.get('holdingImageURL')
+ display_date = init_data.get('displayDate')
+ topic_title = init_data.get('topicTitle')
+
+ return {
+ 'id': version_id,
+ 'title': title,
+ 'formats': formats,
+ 'alt_title': init_data.get('shortTitle'),
+ 'thumbnail': image_url.replace('$recipe', 'raw') if image_url else None,
+ 'description': smp_data.get('summary') or init_data.get('shortSummary'),
+ 'upload_date': display_date.replace('-', '') if display_date else None,
+ 'subtitles': subtitles,
+ 'duration': int_or_none(clip_data.get('duration')),
+ 'categories': [topic_title] if topic_title else None,
+ }
+
# Morph based embed (e.g. http://www.bbc.co.uk/sport/live/olympics/36895975)
# There are several setPayload calls may be present but the video
# seems to be always related to the first one
@@ -1041,7 +1103,7 @@ class BBCIE(BBCCoUkIE):
thumbnail = None
image_url = current_programme.get('image_url')
if image_url:
- thumbnail = image_url.replace('{recipe}', '1920x1920')
+ thumbnail = image_url.replace('{recipe}', 'raw')
return {
'id': programme_id,
'title': title,
@@ -1114,12 +1176,29 @@ class BBCIE(BBCCoUkIE):
continue
formats, subtitles = self._download_media_selector(item_id)
self._sort_formats(formats)
+ item_desc = None
+ blocks = try_get(media, lambda x: x['summary']['blocks'], list)
+ if blocks:
+ summary = []
+ for block in blocks:
+ text = try_get(block, lambda x: x['model']['text'], compat_str)
+ if text:
+ summary.append(text)
+ if summary:
+ item_desc = '\n\n'.join(summary)
+ item_time = None
+ for meta in try_get(media, lambda x: x['metadata']['items'], list) or []:
+ if try_get(meta, lambda x: x['label']) == 'Published':
+ item_time = unified_timestamp(meta.get('timestamp'))
+ break
entries.append({
'id': item_id,
'title': item_title,
'thumbnail': item.get('holdingImageUrl'),
'formats': formats,
'subtitles': subtitles,
+ 'timestamp': item_time,
+ 'description': strip_or_none(item_desc),
})
for resp in (initial_data.get('data') or {}).values():
name = resp.get('name')
@@ -1293,21 +1372,149 @@ class BBCCoUkPlaylistBaseIE(InfoExtractor):
playlist_id, title, description)
-class BBCCoUkIPlayerPlaylistIE(BBCCoUkPlaylistBaseIE):
- IE_NAME = 'bbc.co.uk:iplayer:playlist'
- _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/iplayer/(?:episodes|group)/(?P<id>%s)' % BBCCoUkIE._ID_REGEX
- _URL_TEMPLATE = 'http://www.bbc.co.uk/iplayer/episode/%s'
- _VIDEO_ID_TEMPLATE = r'data-ip-id=["\'](%s)'
+class BBCCoUkIPlayerPlaylistBaseIE(InfoExtractor):
+ _VALID_URL_TMPL = r'https?://(?:www\.)?bbc\.co\.uk/iplayer/%%s/(?P<id>%s)' % BBCCoUkIE._ID_REGEX
+
+ @staticmethod
+ def _get_default(episode, key, default_key='default'):
+ return try_get(episode, lambda x: x[key][default_key])
+
+ def _get_description(self, data):
+ synopsis = data.get(self._DESCRIPTION_KEY) or {}
+ return dict_get(synopsis, ('large', 'medium', 'small'))
+
+ def _fetch_page(self, programme_id, per_page, series_id, page):
+ elements = self._get_elements(self._call_api(
+ programme_id, per_page, page + 1, series_id))
+ for element in elements:
+ episode = self._get_episode(element)
+ episode_id = episode.get('id')
+ if not episode_id:
+ continue
+ thumbnail = None
+ image = self._get_episode_image(episode)
+ if image:
+ thumbnail = image.replace('{recipe}', 'raw')
+ category = self._get_default(episode, 'labels', 'category')
+ yield {
+ '_type': 'url',
+ 'id': episode_id,
+ 'title': self._get_episode_field(episode, 'subtitle'),
+ 'url': 'https://www.bbc.co.uk/iplayer/episode/' + episode_id,
+ 'thumbnail': thumbnail,
+ 'description': self._get_description(episode),
+ 'categories': [category] if category else None,
+ 'series': self._get_episode_field(episode, 'title'),
+ 'ie_key': BBCCoUkIE.ie_key(),
+ }
+
+ def _real_extract(self, url):
+ pid = self._match_id(url)
+ qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
+ series_id = qs.get('seriesId', [None])[0]
+ page = qs.get('page', [None])[0]
+ per_page = 36 if page else self._PAGE_SIZE
+ fetch_page = functools.partial(self._fetch_page, pid, per_page, series_id)
+ entries = fetch_page(int(page) - 1) if page else OnDemandPagedList(fetch_page, self._PAGE_SIZE)
+ playlist_data = self._get_playlist_data(self._call_api(pid, 1))
+ return self.playlist_result(
+ entries, pid, self._get_playlist_title(playlist_data),
+ self._get_description(playlist_data))
+
+
+class BBCCoUkIPlayerEpisodesIE(BBCCoUkIPlayerPlaylistBaseIE):
+ IE_NAME = 'bbc.co.uk:iplayer:episodes'
+ _VALID_URL = BBCCoUkIPlayerPlaylistBaseIE._VALID_URL_TMPL % 'episodes'
_TESTS = [{
'url': 'http://www.bbc.co.uk/iplayer/episodes/b05rcz9v',
'info_dict': {
'id': 'b05rcz9v',
'title': 'The Disappearance',
- 'description': 'French thriller serial about a missing teenager.',
+ 'description': 'md5:58eb101aee3116bad4da05f91179c0cb',
+ },
+ 'playlist_mincount': 8,
+ }, {
+ # all seasons
+ 'url': 'https://www.bbc.co.uk/iplayer/episodes/b094m5t9/doctor-foster',
+ 'info_dict': {
+ 'id': 'b094m5t9',
+ 'title': 'Doctor Foster',
+ 'description': 'md5:5aa9195fad900e8e14b52acd765a9fd6',
+ },
+ 'playlist_mincount': 10,
+ }, {
+ # explicit season
+ 'url': 'https://www.bbc.co.uk/iplayer/episodes/b094m5t9/doctor-foster?seriesId=b094m6nv',
+ 'info_dict': {
+ 'id': 'b094m5t9',
+ 'title': 'Doctor Foster',
+ 'description': 'md5:5aa9195fad900e8e14b52acd765a9fd6',
},
- 'playlist_mincount': 6,
- 'skip': 'This programme is not currently available on BBC iPlayer',
+ 'playlist_mincount': 5,
}, {
+ # all pages
+ 'url': 'https://www.bbc.co.uk/iplayer/episodes/m0004c4v/beechgrove',
+ 'info_dict': {
+ 'id': 'm0004c4v',
+ 'title': 'Beechgrove',
+ 'description': 'Gardening show that celebrates Scottish horticulture and growing conditions.',
+ },
+ 'playlist_mincount': 37,
+ }, {
+ # explicit page
+ 'url': 'https://www.bbc.co.uk/iplayer/episodes/m0004c4v/beechgrove?page=2',
+ 'info_dict': {
+ 'id': 'm0004c4v',
+ 'title': 'Beechgrove',
+ 'description': 'Gardening show that celebrates Scottish horticulture and growing conditions.',
+ },
+ 'playlist_mincount': 1,
+ }]
+ _PAGE_SIZE = 100
+ _DESCRIPTION_KEY = 'synopsis'
+
+ def _get_episode_image(self, episode):
+ return self._get_default(episode, 'image')
+
+ def _get_episode_field(self, episode, field):
+ return self._get_default(episode, field)
+
+ @staticmethod
+ def _get_elements(data):
+ return data['entities']['results']
+
+ @staticmethod
+ def _get_episode(element):
+ return element.get('episode') or {}
+
+ def _call_api(self, pid, per_page, page=1, series_id=None):
+ variables = {
+ 'id': pid,
+ 'page': page,
+ 'perPage': per_page,
+ }
+ if series_id:
+ variables['sliceId'] = series_id
+ return self._download_json(
+ 'https://graph.ibl.api.bbc.co.uk/', pid, headers={
+ 'Content-Type': 'application/json'
+ }, data=json.dumps({
+ 'id': '5692d93d5aac8d796a0305e895e61551',
+ 'variables': variables,
+ }).encode('utf-8'))['data']['programme']
+
+ @staticmethod
+ def _get_playlist_data(data):
+ return data
+
+ def _get_playlist_title(self, data):
+ return self._get_default(data, 'title')
+
+
+class BBCCoUkIPlayerGroupIE(BBCCoUkIPlayerPlaylistBaseIE):
+ IE_NAME = 'bbc.co.uk:iplayer:group'
+ _VALID_URL = BBCCoUkIPlayerPlaylistBaseIE._VALID_URL_TMPL % 'group'
+ _TESTS = [{
# Available for over a year unlike 30 days for most other programmes
'url': 'http://www.bbc.co.uk/iplayer/group/p02tcc32',
'info_dict': {
@@ -1316,14 +1523,56 @@ class BBCCoUkIPlayerPlaylistIE(BBCCoUkPlaylistBaseIE):
'description': 'md5:683e901041b2fe9ba596f2ab04c4dbe7',
},
'playlist_mincount': 10,
+ }, {
+ # all pages
+ 'url': 'https://www.bbc.co.uk/iplayer/group/p081d7j7',
+ 'info_dict': {
+ 'id': 'p081d7j7',
+ 'title': 'Music in Scotland',
+ 'description': 'Perfomances in Scotland and programmes featuring Scottish acts.',
+ },
+ 'playlist_mincount': 47,
+ }, {
+ # explicit page
+ 'url': 'https://www.bbc.co.uk/iplayer/group/p081d7j7?page=2',
+ 'info_dict': {
+ 'id': 'p081d7j7',
+ 'title': 'Music in Scotland',
+ 'description': 'Perfomances in Scotland and programmes featuring Scottish acts.',
+ },
+ 'playlist_mincount': 11,
}]
+ _PAGE_SIZE = 200
+ _DESCRIPTION_KEY = 'synopses'
- def _extract_title_and_description(self, webpage):
- title = self._search_regex(r'<h1>([^<]+)</h1>', webpage, 'title', fatal=False)
- description = self._search_regex(
- r'<p[^>]+class=(["\'])subtitle\1[^>]*>(?P<value>[^<]+)</p>',
- webpage, 'description', fatal=False, group='value')
- return title, description
+ def _get_episode_image(self, episode):
+ return self._get_default(episode, 'images', 'standard')
+
+ def _get_episode_field(self, episode, field):
+ return episode.get(field)
+
+ @staticmethod
+ def _get_elements(data):
+ return data['elements']
+
+ @staticmethod
+ def _get_episode(element):
+ return element
+
+ def _call_api(self, pid, per_page, page=1, series_id=None):
+ return self._download_json(
+ 'http://ibl.api.bbc.co.uk/ibl/v1/groups/%s/episodes' % pid,
+ pid, query={
+ 'page': page,
+ 'per_page': per_page,
+ })['group_episodes']
+
+ @staticmethod
+ def _get_playlist_data(data):
+ return data['group']
+
+ def _get_playlist_title(self, data):
+ return data.get('title')
class BBCCoUkPlaylistIE(BBCCoUkPlaylistBaseIE):
diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py
index 4dc597e16..bff6ea194 100644
--- a/youtube_dl/extractor/bilibili.py
+++ b/youtube_dl/extractor/bilibili.py
@@ -156,6 +156,7 @@ class BiliBiliIE(InfoExtractor):
cid = js['result']['cid']
headers = {
+ 'Accept': 'application/json',
'Referer': url
}
headers.update(self.geo_verification_headers())
@@ -232,7 +233,7 @@ class BiliBiliIE(InfoExtractor):
webpage)
if uploader_mobj:
info.update({
- 'uploader': uploader_mobj.group('name'),
+ 'uploader': uploader_mobj.group('name').strip(),
'uploader_id': uploader_mobj.group('id'),
})
if not info.get('uploader'):
diff --git a/youtube_dl/extractor/bleacherreport.py b/youtube_dl/extractor/bleacherreport.py
index dc60224d0..d1bf8e829 100644
--- a/youtube_dl/extractor/bleacherreport.py
+++ b/youtube_dl/extractor/bleacherreport.py
@@ -90,13 +90,19 @@ class BleacherReportCMSIE(AMPIE):
_VALID_URL = r'https?://(?:www\.)?bleacherreport\.com/video_embed\?id=(?P<id>[0-9a-f-]{36}|\d{5})'
_TESTS = [{
'url': 'http://bleacherreport.com/video_embed?id=8fd44c2f-3dc5-4821-9118-2c825a98c0e1&library=video-cms',
- 'md5': '2e4b0a997f9228ffa31fada5c53d1ed1',
+ 'md5': '670b2d73f48549da032861130488c681',
'info_dict': {
'id': '8fd44c2f-3dc5-4821-9118-2c825a98c0e1',
- 'ext': 'flv',
+ 'ext': 'mp4',
'title': 'Cena vs. Rollins Would Expose the Heavyweight Division',
'description': 'md5:984afb4ade2f9c0db35f3267ed88b36e',
+ 'upload_date': '20150723',
+ 'timestamp': 1437679032,
+
},
+ 'expected_warnings': [
+ 'Unable to download f4m manifest'
+ ]
}]
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/blinkx.py b/youtube_dl/extractor/blinkx.py
deleted file mode 100644
index db5e12b21..000000000
--- a/youtube_dl/extractor/blinkx.py
+++ /dev/null
@@ -1,86 +0,0 @@
-from __future__ import unicode_literals
-
-import json
-
-from .common import InfoExtractor
-from ..utils import (
- remove_start,
- int_or_none,
-)
-
-
-class BlinkxIE(InfoExtractor):
- _VALID_URL = r'(?:https?://(?:www\.)blinkx\.com/#?ce/|blinkx:)(?P<id>[^?]+)'
- IE_NAME = 'blinkx'
-
- _TEST = {
- 'url': 'http://www.blinkx.com/ce/Da0Gw3xc5ucpNduzLuDDlv4WC9PuI4fDi1-t6Y3LyfdY2SZS5Urbvn-UPJvrvbo8LTKTc67Wu2rPKSQDJyZeeORCR8bYkhs8lI7eqddznH2ofh5WEEdjYXnoRtj7ByQwt7atMErmXIeYKPsSDuMAAqJDlQZ-3Ff4HJVeH_s3Gh8oQ',
- 'md5': '337cf7a344663ec79bf93a526a2e06c7',
- 'info_dict': {
- 'id': 'Da0Gw3xc',
- 'ext': 'mp4',
- 'title': 'No Daily Show for John Oliver; HBO Show Renewed - IGN News',
- 'uploader': 'IGN News',
- 'upload_date': '20150217',
- 'timestamp': 1424215740,
- 'description': 'HBO has renewed Last Week Tonight With John Oliver for two more seasons.',
- 'duration': 47.743333,
- },
- }
-
- def _real_extract(self, url):
- video_id = self._match_id(url)
- display_id = video_id[:8]
-
- api_url = ('https://apib4.blinkx.com/api.php?action=play_video&'
- + 'video=%s' % video_id)
- data_json = self._download_webpage(api_url, display_id)
- data = json.loads(data_json)['api']['results'][0]
- duration = None
- thumbnails = []
- formats = []
- for m in data['media']:
- if m['type'] == 'jpg':
- thumbnails.append({
- 'url': m['link'],
- 'width': int(m['w']),
- 'height': int(m['h']),
- })
- elif m['type'] == 'original':
- duration = float(m['d'])
- elif m['type'] == 'youtube':
- yt_id = m['link']
- self.to_screen('Youtube video detected: %s' % yt_id)
- return self.url_result(yt_id, 'Youtube', video_id=yt_id)
- elif m['type'] in ('flv', 'mp4'):
- vcodec = remove_start(m['vcodec'], 'ff')
- acodec = remove_start(m['acodec'], 'ff')
- vbr = int_or_none(m.get('vbr') or m.get('vbitrate'), 1000)
- abr = int_or_none(m.get('abr') or m.get('abitrate'), 1000)
- tbr = vbr + abr if vbr and abr else None
- format_id = '%s-%sk-%s' % (vcodec, tbr, m['w'])
- formats.append({
- 'format_id': format_id,
- 'url': m['link'],
- 'vcodec': vcodec,
- 'acodec': acodec,
- 'abr': abr,
- 'vbr': vbr,
- 'tbr': tbr,
- 'width': int_or_none(m.get('w')),
- 'height': int_or_none(m.get('h')),
- })
-
- self._sort_formats(formats)
-
- return {
- 'id': display_id,
- 'fullid': video_id,
- 'title': data['title'],
- 'formats': formats,
- 'uploader': data['channel_name'],
- 'timestamp': data['pubdate_epoch'],
- 'description': data.get('description'),
- 'thumbnails': thumbnails,
- 'duration': duration,
- }
diff --git a/youtube_dl/extractor/bravotv.py b/youtube_dl/extractor/bravotv.py
index b9715df00..bae2aedce 100644
--- a/youtube_dl/extractor/bravotv.py
+++ b/youtube_dl/extractor/bravotv.py
@@ -12,7 +12,7 @@ from ..utils import (
class BravoTVIE(AdobePassIE):
- _VALID_URL = r'https?://(?:www\.)?bravotv\.com/(?:[^/]+/)+(?P<id>[^/?#]+)'
+ _VALID_URL = r'https?://(?:www\.)?(?P<req_id>bravotv|oxygen)\.com/(?:[^/]+/)+(?P<id>[^/?#]+)'
_TESTS = [{
'url': 'https://www.bravotv.com/top-chef/season-16/episode-15/videos/the-top-chef-season-16-winner-is',
'md5': 'e34684cfea2a96cd2ee1ef3a60909de9',
@@ -28,10 +28,13 @@ class BravoTVIE(AdobePassIE):
}, {
'url': 'http://www.bravotv.com/below-deck/season-3/ep-14-reunion-part-1',
'only_matching': True,
+ }, {
+ 'url': 'https://www.oxygen.com/in-ice-cold-blood/season-2/episode-16/videos/handling-the-horwitz-house-after-the-murder-season-2',
+ 'only_matching': True,
}]
def _real_extract(self, url):
- display_id = self._match_id(url)
+ site, display_id = re.match(self._VALID_URL, url).groups()
webpage = self._download_webpage(url, display_id)
settings = self._parse_json(self._search_regex(
r'<script[^>]+data-drupal-selector="drupal-settings-json"[^>]*>({.+?})</script>', webpage, 'drupal settings'),
@@ -53,11 +56,14 @@ class BravoTVIE(AdobePassIE):
tp_path = release_pid = tve['release_pid']
if tve.get('entitlement') == 'auth':
adobe_pass = settings.get('tve_adobe_auth', {})
+ if site == 'bravotv':
+ site = 'bravo'
resource = self._get_mvpd_resource(
- adobe_pass.get('adobePassResourceId', 'bravo'),
+ adobe_pass.get('adobePassResourceId') or site,
tve['title'], release_pid, tve.get('rating'))
query['auth'] = self._extract_mvpd_auth(
- url, release_pid, adobe_pass.get('adobePassRequestorId', 'bravo'), resource)
+ url, release_pid,
+ adobe_pass.get('adobePassRequestorId') or site, resource)
else:
shared_playlist = settings['ls_playlist']
account_pid = shared_playlist['account_pid']
diff --git a/youtube_dl/extractor/canvas.py b/youtube_dl/extractor/canvas.py
index 8b76a0200..eefbab241 100644
--- a/youtube_dl/extractor/canvas.py
+++ b/youtube_dl/extractor/canvas.py
@@ -7,19 +7,21 @@ from .common import InfoExtractor
from .gigya import GigyaBaseIE
from ..compat import compat_HTTPError
from ..utils import (
- extract_attributes,
ExtractorError,
- strip_or_none,
+ clean_html,
+ extract_attributes,
float_or_none,
+ get_element_by_class,
int_or_none,
merge_dicts,
str_or_none,
+ strip_or_none,
url_or_none,
)
class CanvasIE(InfoExtractor):
- _VALID_URL = r'https?://mediazone\.vrt\.be/api/v1/(?P<site_id>canvas|een|ketnet|vrt(?:video|nieuws)|sporza)/assets/(?P<id>[^/?#&]+)'
+ _VALID_URL = r'https?://mediazone\.vrt\.be/api/v1/(?P<site_id>canvas|een|ketnet|vrt(?:video|nieuws)|sporza|dako)/assets/(?P<id>[^/?#&]+)'
_TESTS = [{
'url': 'https://mediazone.vrt.be/api/v1/ketnet/assets/md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475',
'md5': '68993eda72ef62386a15ea2cf3c93107',
@@ -332,3 +334,51 @@ class VrtNUIE(GigyaBaseIE):
'display_id': display_id,
'season_number': int_or_none(page.get('episode_season')),
})
+
+
+class DagelijkseKostIE(InfoExtractor):
+ IE_DESC = 'dagelijksekost.een.be'
+ _VALID_URL = r'https?://dagelijksekost\.een\.be/gerechten/(?P<id>[^/?#&]+)'
+ _TEST = {
+ 'url': 'https://dagelijksekost.een.be/gerechten/hachis-parmentier-met-witloof',
+ 'md5': '30bfffc323009a3e5f689bef6efa2365',
+ 'info_dict': {
+ 'id': 'md-ast-27a4d1ff-7d7b-425e-b84f-a4d227f592fa',
+ 'display_id': 'hachis-parmentier-met-witloof',
+ 'ext': 'mp4',
+ 'title': 'Hachis parmentier met witloof',
+ 'description': 'md5:9960478392d87f63567b5b117688cdc5',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'duration': 283.02,
+ },
+ 'expected_warnings': ['is not a supported codec'],
+ }
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
+
+ title = strip_or_none(get_element_by_class(
+ 'dish-metadata__title', webpage
+ ) or self._html_search_meta(
+ 'twitter:title', webpage))
+
+ description = clean_html(get_element_by_class(
+ 'dish-description', webpage)
+ ) or self._html_search_meta(
+ ('description', 'twitter:description', 'og:description'),
+ webpage)
+
+ video_id = self._html_search_regex(
+ r'data-url=(["\'])(?P<id>(?:(?!\1).)+)\1', webpage, 'video id',
+ group='id')
+
+ return {
+ '_type': 'url_transparent',
+ 'url': 'https://mediazone.vrt.be/api/v1/dako/assets/%s' % video_id,
+ 'ie_key': CanvasIE.ie_key(),
+ 'id': video_id,
+ 'display_id': display_id,
+ 'title': title,
+ 'description': description,
+ }
diff --git a/youtube_dl/extractor/cbs.py b/youtube_dl/extractor/cbs.py
index 4a19a73d2..c79e55a75 100644
--- a/youtube_dl/extractor/cbs.py
+++ b/youtube_dl/extractor/cbs.py
@@ -27,7 +27,7 @@ class CBSBaseIE(ThePlatformFeedIE):
class CBSIE(CBSBaseIE):
- _VALID_URL = r'(?:cbs:|https?://(?:www\.)?(?:cbs\.com/shows/[^/]+/video|colbertlateshow\.com/(?:video|podcasts))/)(?P<id>[\w-]+)'
+ _VALID_URL = r'(?:cbs:|https?://(?:www\.)?(?:(?:cbs|paramountplus)\.com/shows/[^/]+/video|colbertlateshow\.com/(?:video|podcasts))/)(?P<id>[\w-]+)'
_TESTS = [{
'url': 'http://www.cbs.com/shows/garth-brooks/video/_u7W953k6la293J7EPTd9oHkSPs6Xn6_/connect-chat-feat-garth-brooks/',
@@ -52,6 +52,9 @@ class CBSIE(CBSBaseIE):
}, {
'url': 'http://www.colbertlateshow.com/podcasts/dYSwjqPs_X1tvbV_P2FcPWRa_qT6akTC/in-the-bad-room-with-stephen/',
'only_matching': True,
+ }, {
+ 'url': 'https://www.paramountplus.com/shows/all-rise/video/QmR1WhNkh1a_IrdHZrbcRklm176X_rVc/all-rise-space/',
+ 'only_matching': True,
}]
def _extract_video_info(self, content_id, site='cbs', mpx_acc=2198311517):
diff --git a/youtube_dl/extractor/cbsnews.py b/youtube_dl/extractor/cbsnews.py
index 345debcf0..1285ed65e 100644
--- a/youtube_dl/extractor/cbsnews.py
+++ b/youtube_dl/extractor/cbsnews.py
@@ -26,7 +26,7 @@ class CBSNewsEmbedIE(CBSIE):
def _real_extract(self, url):
item = self._parse_json(zlib.decompress(compat_b64decode(
compat_urllib_parse_unquote(self._match_id(url))),
- -zlib.MAX_WBITS), None)['video']['items'][0]
+ -zlib.MAX_WBITS).decode('utf-8'), None)['video']['items'][0]
return self._extract_video_info(item['mpxRefId'], 'cbsnews')
diff --git a/youtube_dl/extractor/cbssports.py b/youtube_dl/extractor/cbssports.py
index 83b764762..a891c9a55 100644
--- a/youtube_dl/extractor/cbssports.py
+++ b/youtube_dl/extractor/cbssports.py
@@ -1,38 +1,113 @@
from __future__ import unicode_literals
-from .cbs import CBSBaseIE
+import re
+# from .cbs import CBSBaseIE
+from .common import InfoExtractor
+from ..utils import (
+ int_or_none,
+ try_get,
+)
-class CBSSportsIE(CBSBaseIE):
- _VALID_URL = r'https?://(?:www\.)?cbssports\.com/[^/]+/(?:video|news)/(?P<id>[^/?#&]+)'
+# class CBSSportsEmbedIE(CBSBaseIE):
+class CBSSportsEmbedIE(InfoExtractor):
+ IE_NAME = 'cbssports:embed'
+ _VALID_URL = r'''(?ix)https?://(?:(?:www\.)?cbs|embed\.247)sports\.com/player/embed.+?
+ (?:
+ ids%3D(?P<id>[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12})|
+ pcid%3D(?P<pcid>\d+)
+ )'''
_TESTS = [{
- 'url': 'https://www.cbssports.com/nba/video/donovan-mitchell-flashes-star-potential-in-game-2-victory-over-thunder/',
- 'info_dict': {
- 'id': '1214315075735',
- 'ext': 'mp4',
- 'title': 'Donovan Mitchell flashes star potential in Game 2 victory over Thunder',
- 'description': 'md5:df6f48622612c2d6bd2e295ddef58def',
- 'timestamp': 1524111457,
- 'upload_date': '20180419',
- 'uploader': 'CBSI-NEW',
- },
- 'params': {
- # m3u8 download
- 'skip_download': True,
- }
+ 'url': 'https://www.cbssports.com/player/embed/?args=player_id%3Db56c03a6-231a-4bbe-9c55-af3c8a8e9636%26ids%3Db56c03a6-231a-4bbe-9c55-af3c8a8e9636%26resizable%3D1%26autoplay%3Dtrue%26domain%3Dcbssports.com%26comp_ads_enabled%3Dfalse%26watchAndRead%3D0%26startTime%3D0%26env%3Dprod',
+ 'only_matching': True,
}, {
- 'url': 'https://www.cbssports.com/nba/news/nba-playoffs-2018-watch-76ers-vs-heat-game-3-series-schedule-tv-channel-online-stream/',
+ 'url': 'https://embed.247sports.com/player/embed/?args=%3fplayer_id%3d1827823171591%26channel%3dcollege-football-recruiting%26pcid%3d1827823171591%26width%3d640%26height%3d360%26autoplay%3dTrue%26comp_ads_enabled%3dFalse%26uvpc%3dhttps%253a%252f%252fwww.cbssports.com%252fapi%252fcontent%252fvideo%252fconfig%252f%253fcfg%253duvp_247sports_v4%2526partner%253d247%26uvpc_m%3dhttps%253a%252f%252fwww.cbssports.com%252fapi%252fcontent%252fvideo%252fconfig%252f%253fcfg%253duvp_247sports_m_v4%2526partner_m%253d247_mobile%26utag%3d247sportssite%26resizable%3dTrue',
'only_matching': True,
}]
- def _extract_video_info(self, filter_query, video_id):
- return self._extract_feed_info('dJ5BDC', 'VxxJg8Ymh8sE', filter_query, video_id)
+ # def _extract_video_info(self, filter_query, video_id):
+ # return self._extract_feed_info('dJ5BDC', 'VxxJg8Ymh8sE', filter_query, video_id)
def _real_extract(self, url):
+ uuid, pcid = re.match(self._VALID_URL, url).groups()
+ query = {'id': uuid} if uuid else {'pcid': pcid}
+ video = self._download_json(
+ 'https://www.cbssports.com/api/content/video/',
+ uuid or pcid, query=query)[0]
+ video_id = video['id']
+ title = video['title']
+ metadata = video.get('metaData') or {}
+ # return self._extract_video_info('byId=%d' % metadata['mpxOutletId'], video_id)
+ # return self._extract_video_info('byGuid=' + metadata['mpxRefId'], video_id)
+
+ formats = self._extract_m3u8_formats(
+ metadata['files'][0]['url'], video_id, 'mp4',
+ 'm3u8_native', m3u8_id='hls', fatal=False)
+ self._sort_formats(formats)
+
+ image = video.get('image')
+ thumbnails = None
+ if image:
+ image_path = image.get('path')
+ if image_path:
+ thumbnails = [{
+ 'url': image_path,
+ 'width': int_or_none(image.get('width')),
+ 'height': int_or_none(image.get('height')),
+ 'filesize': int_or_none(image.get('size')),
+ }]
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'formats': formats,
+ 'thumbnails': thumbnails,
+ 'description': video.get('description'),
+ 'timestamp': int_or_none(try_get(video, lambda x: x['dateCreated']['epoch'])),
+ 'duration': int_or_none(metadata.get('duration')),
+ }
+
+
+class CBSSportsBaseIE(InfoExtractor):
+ def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
- video_id = self._search_regex(
- [r'(?:=|%26)pcid%3D(\d+)', r'embedVideo(?:Container)?_(\d+)'],
- webpage, 'video id')
- return self._extract_video_info('byId=%s' % video_id, video_id)
+ iframe_url = self._search_regex(
+ r'<iframe[^>]+(?:data-)?src="(https?://[^/]+/player/embed[^"]+)"',
+ webpage, 'embed url')
+ return self.url_result(iframe_url, CBSSportsEmbedIE.ie_key())
+
+
+class CBSSportsIE(CBSSportsBaseIE):
+ IE_NAME = 'cbssports'
+ _VALID_URL = r'https?://(?:www\.)?cbssports\.com/[^/]+/video/(?P<id>[^/?#&]+)'
+ _TESTS = [{
+ 'url': 'https://www.cbssports.com/college-football/video/cover-3-stanford-spring-gleaning/',
+ 'info_dict': {
+ 'id': 'b56c03a6-231a-4bbe-9c55-af3c8a8e9636',
+ 'ext': 'mp4',
+ 'title': 'Cover 3: Stanford Spring Gleaning',
+ 'description': 'The Cover 3 crew break down everything you need to know about the Stanford Cardinal this spring.',
+ 'timestamp': 1617218398,
+ 'upload_date': '20210331',
+ 'duration': 502,
+ },
+ }]
+
+
+class TwentyFourSevenSportsIE(CBSSportsBaseIE):
+ IE_NAME = '247sports'
+ _VALID_URL = r'https?://(?:www\.)?247sports\.com/Video/(?:[^/?#&]+-)?(?P<id>\d+)'
+ _TESTS = [{
+ 'url': 'https://247sports.com/Video/2021-QB-Jake-Garcia-senior-highlights-through-five-games-10084854/',
+ 'info_dict': {
+ 'id': '4f1265cb-c3b5-44a8-bb1d-1914119a0ccc',
+ 'ext': 'mp4',
+ 'title': '2021 QB Jake Garcia senior highlights through five games',
+ 'description': 'md5:8cb67ebed48e2e6adac1701e0ff6e45b',
+ 'timestamp': 1607114223,
+ 'upload_date': '20201204',
+ 'duration': 208,
+ },
+ }]
diff --git a/youtube_dl/extractor/ccma.py b/youtube_dl/extractor/ccma.py
index 544647f92..e6ae49352 100644
--- a/youtube_dl/extractor/ccma.py
+++ b/youtube_dl/extractor/ccma.py
@@ -1,15 +1,18 @@
# coding: utf-8
from __future__ import unicode_literals
+import calendar
+import datetime
import re
from .common import InfoExtractor
from ..utils import (
clean_html,
+ extract_timezone,
int_or_none,
parse_duration,
- parse_iso8601,
parse_resolution,
+ try_get,
url_or_none,
)
@@ -24,8 +27,9 @@ class CCMAIE(InfoExtractor):
'ext': 'mp4',
'title': 'L\'espot de La Marató de TV3',
'description': 'md5:f12987f320e2f6e988e9908e4fe97765',
- 'timestamp': 1470918540,
- 'upload_date': '20160811',
+ 'timestamp': 1478608140,
+ 'upload_date': '20161108',
+ 'age_limit': 0,
}
}, {
'url': 'http://www.ccma.cat/catradio/alacarta/programa/el-consell-de-savis-analitza-el-derbi/audio/943685/',
@@ -35,8 +39,24 @@ class CCMAIE(InfoExtractor):
'ext': 'mp3',
'title': 'El Consell de Savis analitza el derbi',
'description': 'md5:e2a3648145f3241cb9c6b4b624033e53',
- 'upload_date': '20171205',
- 'timestamp': 1512507300,
+ 'upload_date': '20170512',
+ 'timestamp': 1494622500,
+ 'vcodec': 'none',
+ 'categories': ['Esports'],
+ }
+ }, {
+ 'url': 'http://www.ccma.cat/tv3/alacarta/crims/crims-josep-tallada-lespereu-me-capitol-1/video/6031387/',
+ 'md5': 'b43c3d3486f430f3032b5b160d80cbc3',
+ 'info_dict': {
+ 'id': '6031387',
+ 'ext': 'mp4',
+ 'title': 'Crims - Josep Talleda, l\'"Espereu-me" (capítol 1)',
+ 'description': 'md5:7cbdafb640da9d0d2c0f62bad1e74e60',
+ 'timestamp': 1582577700,
+ 'upload_date': '20200224',
+ 'subtitles': 'mincount:4',
+ 'age_limit': 16,
+ 'series': 'Crims',
}
}]
@@ -72,17 +92,28 @@ class CCMAIE(InfoExtractor):
informacio = media['informacio']
title = informacio['titol']
- durada = informacio.get('durada', {})
+ durada = informacio.get('durada') or {}
duration = int_or_none(durada.get('milisegons'), 1000) or parse_duration(durada.get('text'))
- timestamp = parse_iso8601(informacio.get('data_emissio', {}).get('utc'))
+ tematica = try_get(informacio, lambda x: x['tematica']['text'])
+
+ timestamp = None
+ data_utc = try_get(informacio, lambda x: x['data_emissio']['utc'])
+ try:
+ timezone, data_utc = extract_timezone(data_utc)
+ timestamp = calendar.timegm((datetime.datetime.strptime(
+ data_utc, '%Y-%d-%mT%H:%M:%S') - timezone).timetuple())
+ except TypeError:
+ pass
subtitles = {}
- subtitols = media.get('subtitols', {})
- if subtitols:
- sub_url = subtitols.get('url')
+ subtitols = media.get('subtitols') or []
+ if isinstance(subtitols, dict):
+ subtitols = [subtitols]
+ for st in subtitols:
+ sub_url = st.get('url')
if sub_url:
subtitles.setdefault(
- subtitols.get('iso') or subtitols.get('text') or 'ca', []).append({
+ st.get('iso') or st.get('text') or 'ca', []).append({
'url': sub_url,
})
@@ -97,6 +128,16 @@ class CCMAIE(InfoExtractor):
'height': int_or_none(imatges.get('alcada')),
}]
+ age_limit = None
+ codi_etic = try_get(informacio, lambda x: x['codi_etic']['id'])
+ if codi_etic:
+ codi_etic_s = codi_etic.split('_')
+ if len(codi_etic_s) == 2:
+ if codi_etic_s[1] == 'TP':
+ age_limit = 0
+ else:
+ age_limit = int_or_none(codi_etic_s[1])
+
return {
'id': media_id,
'title': title,
@@ -106,4 +147,9 @@ class CCMAIE(InfoExtractor):
'thumbnails': thumbnails,
'subtitles': subtitles,
'formats': formats,
+ 'age_limit': age_limit,
+ 'alt_title': informacio.get('titol_complet'),
+ 'episode_number': int_or_none(informacio.get('capitol')),
+ 'categories': [tematica] if tematica else None,
+ 'series': informacio.get('programa'),
}
diff --git a/youtube_dl/extractor/cda.py b/youtube_dl/extractor/cda.py
index d67900e62..e1b391937 100644
--- a/youtube_dl/extractor/cda.py
+++ b/youtube_dl/extractor/cda.py
@@ -95,8 +95,11 @@ class CDAIE(InfoExtractor):
if 'Ten film jest dostępny dla użytkowników premium' in webpage:
raise ExtractorError('This video is only available for premium users.', expected=True)
+ if re.search(r'niedostępn[ey] w(?:&nbsp;|\s+)Twoim kraju\s*<', webpage):
+ self.raise_geo_restricted()
+
need_confirm_age = False
- if self._html_search_regex(r'(<form[^>]+action="/a/validatebirth")',
+ if self._html_search_regex(r'(<form[^>]+action="[^"]*/a/validatebirth[^"]*")',
webpage, 'birthday validate form', default=None):
webpage = self._download_age_confirm_page(
url, video_id, note='Confirming age')
@@ -130,6 +133,8 @@ class CDAIE(InfoExtractor):
'age_limit': 18 if need_confirm_age else 0,
}
+ info = self._search_json_ld(webpage, video_id, default={})
+
# Source: https://www.cda.pl/js/player.js?t=1606154898
def decrypt_file(a):
for p in ('_XDDD', '_CDA', '_ADC', '_CXD', '_QWE', '_Q5', '_IKSDE'):
@@ -194,7 +199,7 @@ class CDAIE(InfoExtractor):
handler = self._download_webpage
webpage = handler(
- self._BASE_URL + href, video_id,
+ urljoin(self._BASE_URL, href), video_id,
'Downloading %s version information' % resolution, fatal=False)
if not webpage:
# Manually report warning because empty page is returned when
@@ -206,6 +211,4 @@ class CDAIE(InfoExtractor):
self._sort_formats(formats)
- info = self._search_json_ld(webpage, video_id, default={})
-
return merge_dicts(info_dict, info)
diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py
index d08b909a6..1bfa912be 100644
--- a/youtube_dl/extractor/comedycentral.py
+++ b/youtube_dl/extractor/comedycentral.py
@@ -1,142 +1,51 @@
from __future__ import unicode_literals
from .mtv import MTVServicesInfoExtractor
-from .common import InfoExtractor
class ComedyCentralIE(MTVServicesInfoExtractor):
- _VALID_URL = r'''(?x)https?://(?:www\.)?cc\.com/
- (video-clips|episodes|cc-studios|video-collections|shows(?=/[^/]+/(?!full-episodes)))
- /(?P<title>.*)'''
+ _VALID_URL = r'https?://(?:www\.)?cc\.com/(?:episodes|video(?:-clips)?)/(?P<id>[0-9a-z]{6})'
_FEED_URL = 'http://comedycentral.com/feeds/mrss/'
_TESTS = [{
- 'url': 'http://www.cc.com/video-clips/kllhuv/stand-up-greg-fitzsimmons--uncensored---too-good-of-a-mother',
- 'md5': 'c4f48e9eda1b16dd10add0744344b6d8',
+ 'url': 'http://www.cc.com/video-clips/5ke9v2/the-daily-show-with-trevor-noah-doc-rivers-and-steve-ballmer---the-nba-player-strike',
+ 'md5': 'b8acb347177c680ff18a292aa2166f80',
'info_dict': {
- 'id': 'cef0cbb3-e776-4bc9-b62e-8016deccb354',
+ 'id': '89ccc86e-1b02-4f83-b0c9-1d9592ecd025',
'ext': 'mp4',
- 'title': 'CC:Stand-Up|August 18, 2013|1|0101|Uncensored - Too Good of a Mother',
- 'description': 'After a certain point, breastfeeding becomes c**kblocking.',
- 'timestamp': 1376798400,
- 'upload_date': '20130818',
+ 'title': 'The Daily Show with Trevor Noah|August 28, 2020|25|25149|Doc Rivers and Steve Ballmer - The NBA Player Strike',
+ 'description': 'md5:5334307c433892b85f4f5e5ac9ef7498',
+ 'timestamp': 1598670000,
+ 'upload_date': '20200829',
},
}, {
- 'url': 'http://www.cc.com/shows/the-daily-show-with-trevor-noah/interviews/6yx39d/exclusive-rand-paul-extended-interview',
+ 'url': 'http://www.cc.com/episodes/pnzzci/drawn-together--american-idol--parody-clip-show-season-3-ep-314',
'only_matching': True,
- }]
-
-
-class ComedyCentralFullEpisodesIE(MTVServicesInfoExtractor):
- _VALID_URL = r'''(?x)https?://(?:www\.)?cc\.com/
- (?:full-episodes|shows(?=/[^/]+/full-episodes))
- /(?P<id>[^?]+)'''
- _FEED_URL = 'http://comedycentral.com/feeds/mrss/'
-
- _TESTS = [{
- 'url': 'http://www.cc.com/full-episodes/pv391a/the-daily-show-with-trevor-noah-november-28--2016---ryan-speedo-green-season-22-ep-22028',
- 'info_dict': {
- 'description': 'Donald Trump is accused of exploiting his president-elect status for personal gain, Cuban leader Fidel Castro dies, and Ryan Speedo Green discusses "Sing for Your Life."',
- 'title': 'November 28, 2016 - Ryan Speedo Green',
- },
- 'playlist_count': 4,
}, {
- 'url': 'http://www.cc.com/shows/the-daily-show-with-trevor-noah/full-episodes',
- 'only_matching': True,
- }]
-
- def _real_extract(self, url):
- playlist_id = self._match_id(url)
- webpage = self._download_webpage(url, playlist_id)
- mgid = self._extract_triforce_mgid(webpage, data_zone='t2_lc_promo1')
- videos_info = self._get_videos_info(mgid)
- return videos_info
-
-
-class ToshIE(MTVServicesInfoExtractor):
- IE_DESC = 'Tosh.0'
- _VALID_URL = r'^https?://tosh\.cc\.com/video-(?:clips|collections)/[^/]+/(?P<videotitle>[^/?#]+)'
- _FEED_URL = 'http://tosh.cc.com/feeds/mrss'
-
- _TESTS = [{
- 'url': 'http://tosh.cc.com/video-clips/68g93d/twitter-users-share-summer-plans',
- 'info_dict': {
- 'description': 'Tosh asked fans to share their summer plans.',
- 'title': 'Twitter Users Share Summer Plans',
- },
- 'playlist': [{
- 'md5': 'f269e88114c1805bb6d7653fecea9e06',
- 'info_dict': {
- 'id': '90498ec2-ed00-11e0-aca6-0026b9414f30',
- 'ext': 'mp4',
- 'title': 'Tosh.0|June 9, 2077|2|211|Twitter Users Share Summer Plans',
- 'description': 'Tosh asked fans to share their summer plans.',
- 'thumbnail': r're:^https?://.*\.jpg',
- # It's really reported to be published on year 2077
- 'upload_date': '20770610',
- 'timestamp': 3390510600,
- 'subtitles': {
- 'en': 'mincount:3',
- },
- },
- }]
- }, {
- 'url': 'http://tosh.cc.com/video-collections/x2iz7k/just-plain-foul/m5q4fp',
+ 'url': 'https://www.cc.com/video/k3sdvm/the-daily-show-with-jon-stewart-exclusive-the-fourth-estate',
'only_matching': True,
}]
class ComedyCentralTVIE(MTVServicesInfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?comedycentral\.tv/(?:staffeln|shows)/(?P<id>[^/?#&]+)'
+ _VALID_URL = r'https?://(?:www\.)?comedycentral\.tv/folgen/(?P<id>[0-9a-z]{6})'
_TESTS = [{
- 'url': 'http://www.comedycentral.tv/staffeln/7436-the-mindy-project-staffel-4',
+ 'url': 'https://www.comedycentral.tv/folgen/pxdpec/josh-investigates-klimawandel-staffel-1-ep-1',
'info_dict': {
- 'id': 'local_playlist-f99b626bdfe13568579a',
- 'ext': 'flv',
- 'title': 'Episode_the-mindy-project_shows_season-4_episode-3_full-episode_part1',
- },
- 'params': {
- # rtmp download
- 'skip_download': True,
+ 'id': '15907dc3-ec3c-11e8-a442-0e40cf2fc285',
+ 'ext': 'mp4',
+ 'title': 'Josh Investigates',
+ 'description': 'Steht uns das Ende der Welt bevor?',
},
- }, {
- 'url': 'http://www.comedycentral.tv/shows/1074-workaholics',
- 'only_matching': True,
- }, {
- 'url': 'http://www.comedycentral.tv/shows/1727-the-mindy-project/bonus',
- 'only_matching': True,
}]
-
- def _real_extract(self, url):
- video_id = self._match_id(url)
-
- webpage = self._download_webpage(url, video_id)
-
- mrss_url = self._search_regex(
- r'data-mrss=(["\'])(?P<url>(?:(?!\1).)+)\1',
- webpage, 'mrss url', group='url')
-
- return self._get_videos_info_from_url(mrss_url, video_id)
-
-
-class ComedyCentralShortnameIE(InfoExtractor):
- _VALID_URL = r'^:(?P<id>tds|thedailyshow|theopposition)$'
- _TESTS = [{
- 'url': ':tds',
- 'only_matching': True,
- }, {
- 'url': ':thedailyshow',
- 'only_matching': True,
- }, {
- 'url': ':theopposition',
- 'only_matching': True,
- }]
-
- def _real_extract(self, url):
- video_id = self._match_id(url)
- shortcut_map = {
- 'tds': 'http://www.cc.com/shows/the-daily-show-with-trevor-noah/full-episodes',
- 'thedailyshow': 'http://www.cc.com/shows/the-daily-show-with-trevor-noah/full-episodes',
- 'theopposition': 'http://www.cc.com/shows/the-opposition-with-jordan-klepper/full-episodes',
+ _FEED_URL = 'http://feeds.mtvnservices.com/od/feed/intl-mrss-player-feed'
+ _GEO_COUNTRIES = ['DE']
+
+ def _get_feed_query(self, uri):
+ return {
+ 'accountOverride': 'intl.mtvi.com',
+ 'arcEp': 'web.cc.tv',
+ 'ep': 'b9032c3a',
+ 'imageEp': 'web.cc.tv',
+ 'mgid': uri,
}
- return self.url_result(shortcut_map[video_id])
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index d5faa0eb7..797c35fd5 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -17,7 +17,7 @@ import math
from ..compat import (
compat_cookiejar_Cookie,
- compat_cookies,
+ compat_cookies_SimpleCookie,
compat_etree_Element,
compat_etree_fromstring,
compat_getpass,
@@ -230,8 +230,10 @@ class InfoExtractor(object):
uploader: Full name of the video uploader.
license: License name the video is licensed under.
creator: The creator of the video.
+ release_timestamp: UNIX timestamp of the moment the video was released.
release_date: The date (YYYYMMDD) when the video was released.
- timestamp: UNIX timestamp of the moment the video became available.
+ timestamp: UNIX timestamp of the moment the video became available
+ (uploaded).
upload_date: Video upload date (YYYYMMDD).
If not explicitly set, calculated from timestamp.
uploader_id: Nickname or id of the video uploader.
@@ -1273,6 +1275,7 @@ class InfoExtractor(object):
def extract_video_object(e):
assert e['@type'] == 'VideoObject'
+ author = e.get('author')
info.update({
'url': url_or_none(e.get('contentUrl')),
'title': unescapeHTML(e.get('name')),
@@ -1280,7 +1283,11 @@ class InfoExtractor(object):
'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')),
'duration': parse_duration(e.get('duration')),
'timestamp': unified_timestamp(e.get('uploadDate')),
- 'uploader': str_or_none(e.get('author')),
+ # author can be an instance of 'Organization' or 'Person' types.
+ # both types can have 'name' property(inherited from 'Thing' type). [1]
+ # however some websites are using 'Text' type instead.
+ # 1. https://schema.org/VideoObject
+ 'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, compat_str) else None,
'filesize': float_or_none(e.get('contentSize')),
'tbr': int_or_none(e.get('bitrate')),
'width': int_or_none(e.get('width')),
@@ -2064,7 +2071,7 @@ class InfoExtractor(object):
})
return entries
- def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}, data=None, headers={}, query={}):
+ def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
res = self._download_xml_handle(
mpd_url, video_id,
note=note or 'Downloading MPD manifest',
@@ -2078,10 +2085,9 @@ class InfoExtractor(object):
mpd_base_url = base_url(urlh.geturl())
return self._parse_mpd_formats(
- mpd_doc, mpd_id=mpd_id, mpd_base_url=mpd_base_url,
- formats_dict=formats_dict, mpd_url=mpd_url)
+ mpd_doc, mpd_id, mpd_base_url, mpd_url)
- def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None):
+ def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
"""
Parse formats from MPD manifest.
References:
@@ -2359,15 +2365,7 @@ class InfoExtractor(object):
else:
# Assuming direct URL to unfragmented media.
f['url'] = base_url
-
- # According to [1, 5.3.5.2, Table 7, page 35] @id of Representation
- # is not necessarily unique within a Period thus formats with
- # the same `format_id` are quite possible. There are numerous examples
- # of such manifests (see https://github.com/ytdl-org/youtube-dl/issues/15111,
- # https://github.com/ytdl-org/youtube-dl/issues/13919)
- full_info = formats_dict.get(representation_id, {}).copy()
- full_info.update(f)
- formats.append(full_info)
+ formats.append(f)
else:
self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
return formats
@@ -2903,10 +2901,10 @@ class InfoExtractor(object):
self._downloader.cookiejar.set_cookie(cookie)
def _get_cookies(self, url):
- """ Return a compat_cookies.SimpleCookie with the cookies for the url """
+ """ Return a compat_cookies_SimpleCookie with the cookies for the url """
req = sanitized_Request(url)
self._downloader.cookiejar.add_cookie_header(req)
- return compat_cookies.SimpleCookie(req.get_header('Cookie'))
+ return compat_cookies_SimpleCookie(req.get_header('Cookie'))
def _apply_first_set_cookie_header(self, url_handle, cookie):
"""
diff --git a/youtube_dl/extractor/cspan.py b/youtube_dl/extractor/cspan.py
index 766942146..2e01aff48 100644
--- a/youtube_dl/extractor/cspan.py
+++ b/youtube_dl/extractor/cspan.py
@@ -8,11 +8,14 @@ from ..utils import (
ExtractorError,
extract_attributes,
find_xpath_attr,
+ get_element_by_attribute,
get_element_by_class,
int_or_none,
js_to_json,
merge_dicts,
+ parse_iso8601,
smuggle_url,
+ str_to_int,
unescapeHTML,
)
from .senateisvp import SenateISVPIE
@@ -116,8 +119,30 @@ class CSpanIE(InfoExtractor):
jwsetup, video_id, require_title=False, m3u8_id='hls',
base_url=url)
add_referer(info['formats'])
+ for subtitles in info['subtitles'].values():
+ for subtitle in subtitles:
+ ext = determine_ext(subtitle['url'])
+ if ext == 'php':
+ ext = 'vtt'
+ subtitle['ext'] = ext
ld_info = self._search_json_ld(webpage, video_id, default={})
- return merge_dicts(info, ld_info)
+ title = get_element_by_class('video-page-title', webpage) or \
+ self._og_search_title(webpage)
+ description = get_element_by_attribute('itemprop', 'description', webpage) or \
+ self._html_search_meta(['og:description', 'description'], webpage)
+ return merge_dicts(info, ld_info, {
+ 'title': title,
+ 'thumbnail': get_element_by_attribute('itemprop', 'thumbnailUrl', webpage),
+ 'description': description,
+ 'timestamp': parse_iso8601(get_element_by_attribute('itemprop', 'uploadDate', webpage)),
+ 'location': get_element_by_attribute('itemprop', 'contentLocation', webpage),
+ 'duration': int_or_none(self._search_regex(
+ r'jwsetup\.seclength\s*=\s*(\d+);',
+ webpage, 'duration', fatal=False)),
+ 'view_count': str_to_int(self._search_regex(
+ r"<span[^>]+class='views'[^>]*>([\d,]+)\s+Views</span>",
+ webpage, 'views', fatal=False)),
+ })
# Obsolete
# We first look for clipid, because clipprog always appears before
diff --git a/youtube_dl/extractor/curiositystream.py b/youtube_dl/extractor/curiositystream.py
index e4a7fca6c..48ff30432 100644
--- a/youtube_dl/extractor/curiositystream.py
+++ b/youtube_dl/extractor/curiositystream.py
@@ -25,12 +25,12 @@ class CuriosityStreamBaseIE(InfoExtractor):
raise ExtractorError(
'%s said: %s' % (self.IE_NAME, error), expected=True)
- def _call_api(self, path, video_id):
+ def _call_api(self, path, video_id, query=None):
headers = {}
if self._auth_token:
headers['X-Auth-Token'] = self._auth_token
result = self._download_json(
- self._API_BASE_URL + path, video_id, headers=headers)
+ self._API_BASE_URL + path, video_id, headers=headers, query=query)
self._handle_errors(result)
return result['data']
@@ -52,62 +52,75 @@ class CuriosityStreamIE(CuriosityStreamBaseIE):
_VALID_URL = r'https?://(?:app\.)?curiositystream\.com/video/(?P<id>\d+)'
_TEST = {
'url': 'https://app.curiositystream.com/video/2',
- 'md5': '262bb2f257ff301115f1973540de8983',
'info_dict': {
'id': '2',
'ext': 'mp4',
'title': 'How Did You Develop The Internet?',
'description': 'Vint Cerf, Google\'s Chief Internet Evangelist, describes how he and Bob Kahn created the internet.',
- }
+ },
+ 'params': {
+ 'format': 'bestvideo',
+ # m3u8 download
+ 'skip_download': True,
+ },
}
def _real_extract(self, url):
video_id = self._match_id(url)
- media = self._call_api('media/' + video_id, video_id)
- title = media['title']
formats = []
- for encoding in media.get('encodings', []):
- m3u8_url = encoding.get('master_playlist_url')
- if m3u8_url:
- formats.extend(self._extract_m3u8_formats(
- m3u8_url, video_id, 'mp4', 'm3u8_native',
- m3u8_id='hls', fatal=False))
- encoding_url = encoding.get('url')
- file_url = encoding.get('file_url')
- if not encoding_url and not file_url:
- continue
- f = {
- 'width': int_or_none(encoding.get('width')),
- 'height': int_or_none(encoding.get('height')),
- 'vbr': int_or_none(encoding.get('video_bitrate')),
- 'abr': int_or_none(encoding.get('audio_bitrate')),
- 'filesize': int_or_none(encoding.get('size_in_bytes')),
- 'vcodec': encoding.get('video_codec'),
- 'acodec': encoding.get('audio_codec'),
- 'container': encoding.get('container_type'),
- }
- for f_url in (encoding_url, file_url):
- if not f_url:
+ for encoding_format in ('m3u8', 'mpd'):
+ media = self._call_api('media/' + video_id, video_id, query={
+ 'encodingsNew': 'true',
+ 'encodingsFormat': encoding_format,
+ })
+ for encoding in media.get('encodings', []):
+ playlist_url = encoding.get('master_playlist_url')
+ if encoding_format == 'm3u8':
+ # use `m3u8` entry_protocol until EXT-X-MAP is properly supported by `m3u8_native` entry_protocol
+ formats.extend(self._extract_m3u8_formats(
+ playlist_url, video_id, 'mp4',
+ m3u8_id='hls', fatal=False))
+ elif encoding_format == 'mpd':
+ formats.extend(self._extract_mpd_formats(
+ playlist_url, video_id, mpd_id='dash', fatal=False))
+ encoding_url = encoding.get('url')
+ file_url = encoding.get('file_url')
+ if not encoding_url and not file_url:
continue
- fmt = f.copy()
- rtmp = re.search(r'^(?P<url>rtmpe?://(?P<host>[^/]+)/(?P<app>.+))/(?P<playpath>mp[34]:.+)$', f_url)
- if rtmp:
- fmt.update({
- 'url': rtmp.group('url'),
- 'play_path': rtmp.group('playpath'),
- 'app': rtmp.group('app'),
- 'ext': 'flv',
- 'format_id': 'rtmp',
- })
- else:
- fmt.update({
- 'url': f_url,
- 'format_id': 'http',
- })
- formats.append(fmt)
+ f = {
+ 'width': int_or_none(encoding.get('width')),
+ 'height': int_or_none(encoding.get('height')),
+ 'vbr': int_or_none(encoding.get('video_bitrate')),
+ 'abr': int_or_none(encoding.get('audio_bitrate')),
+ 'filesize': int_or_none(encoding.get('size_in_bytes')),
+ 'vcodec': encoding.get('video_codec'),
+ 'acodec': encoding.get('audio_codec'),
+ 'container': encoding.get('container_type'),
+ }
+ for f_url in (encoding_url, file_url):
+ if not f_url:
+ continue
+ fmt = f.copy()
+ rtmp = re.search(r'^(?P<url>rtmpe?://(?P<host>[^/]+)/(?P<app>.+))/(?P<playpath>mp[34]:.+)$', f_url)
+ if rtmp:
+ fmt.update({
+ 'url': rtmp.group('url'),
+ 'play_path': rtmp.group('playpath'),
+ 'app': rtmp.group('app'),
+ 'ext': 'flv',
+ 'format_id': 'rtmp',
+ })
+ else:
+ fmt.update({
+ 'url': f_url,
+ 'format_id': 'http',
+ })
+ formats.append(fmt)
self._sort_formats(formats)
+ title = media['title']
+
subtitles = {}
for closed_caption in media.get('closed_captions', []):
sub_url = closed_caption.get('file')
@@ -132,7 +145,7 @@ class CuriosityStreamIE(CuriosityStreamBaseIE):
class CuriosityStreamCollectionIE(CuriosityStreamBaseIE):
IE_NAME = 'curiositystream:collection'
- _VALID_URL = r'https?://(?:app\.)?curiositystream\.com/(?:collection|series)/(?P<id>\d+)'
+ _VALID_URL = r'https?://(?:app\.)?curiositystream\.com/(?:collections?|series)/(?P<id>\d+)'
_TESTS = [{
'url': 'https://app.curiositystream.com/collection/2',
'info_dict': {
@@ -140,10 +153,13 @@ class CuriosityStreamCollectionIE(CuriosityStreamBaseIE):
'title': 'Curious Minds: The Internet',
'description': 'How is the internet shaping our lives in the 21st Century?',
},
- 'playlist_mincount': 17,
+ 'playlist_mincount': 16,
}, {
'url': 'https://curiositystream.com/series/2',
'only_matching': True,
+ }, {
+ 'url': 'https://curiositystream.com/collections/36',
+ 'only_matching': True,
}]
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/dispeak.py b/youtube_dl/extractor/dispeak.py
index c345e0274..276fd4b09 100644
--- a/youtube_dl/extractor/dispeak.py
+++ b/youtube_dl/extractor/dispeak.py
@@ -32,6 +32,18 @@ class DigitallySpeakingIE(InfoExtractor):
# From http://www.gdcvault.com/play/1013700/Advanced-Material
'url': 'http://sevt.dispeak.com/ubm/gdc/eur10/xml/11256_1282118587281VNIT.xml',
'only_matching': True,
+ }, {
+ # From https://gdcvault.com/play/1016624, empty speakerVideo
+ 'url': 'https://sevt.dispeak.com/ubm/gdc/online12/xml/201210-822101_1349794556671DDDD.xml',
+ 'info_dict': {
+ 'id': '201210-822101_1349794556671DDDD',
+ 'ext': 'flv',
+ 'title': 'Pre-launch - Preparing to Take the Plunge',
+ },
+ }, {
+ # From http://www.gdcvault.com/play/1014846/Conference-Keynote-Shigeru, empty slideVideo
+ 'url': 'http://events.digitallyspeaking.com/gdc/project25/xml/p25-miyamoto1999_1282467389849HSVB.xml',
+ 'only_matching': True,
}]
def _parse_mp4(self, metadata):
@@ -84,26 +96,20 @@ class DigitallySpeakingIE(InfoExtractor):
'vcodec': 'none',
'format_id': audio.get('code'),
})
- slide_video_path = xpath_text(metadata, './slideVideo', fatal=True)
- formats.append({
- 'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url,
- 'play_path': remove_end(slide_video_path, '.flv'),
- 'ext': 'flv',
- 'format_note': 'slide deck video',
- 'quality': -2,
- 'preference': -2,
- 'format_id': 'slides',
- })
- speaker_video_path = xpath_text(metadata, './speakerVideo', fatal=True)
- formats.append({
- 'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url,
- 'play_path': remove_end(speaker_video_path, '.flv'),
- 'ext': 'flv',
- 'format_note': 'speaker video',
- 'quality': -1,
- 'preference': -1,
- 'format_id': 'speaker',
- })
+ for video_key, format_id, preference in (
+ ('slide', 'slides', -2), ('speaker', 'speaker', -1)):
+ video_path = xpath_text(metadata, './%sVideo' % video_key)
+ if not video_path:
+ continue
+ formats.append({
+ 'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url,
+ 'play_path': remove_end(video_path, '.flv'),
+ 'ext': 'flv',
+ 'format_note': '%s video' % video_key,
+ 'quality': preference,
+ 'preference': preference,
+ 'format_id': format_id,
+ })
return formats
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/dplay.py b/youtube_dl/extractor/dplay.py
index 47501dbe6..bbb199094 100644
--- a/youtube_dl/extractor/dplay.py
+++ b/youtube_dl/extractor/dplay.py
@@ -1,6 +1,7 @@
# coding: utf-8
from __future__ import unicode_literals
+import json
import re
from .common import InfoExtractor
@@ -10,11 +11,13 @@ from ..utils import (
ExtractorError,
float_or_none,
int_or_none,
+ strip_or_none,
unified_timestamp,
)
class DPlayIE(InfoExtractor):
+ _PATH_REGEX = r'/(?P<id>[^/]+/[^/?#]+)'
_VALID_URL = r'''(?x)https?://
(?P<domain>
(?:www\.)?(?P<host>d
@@ -24,7 +27,7 @@ class DPlayIE(InfoExtractor):
)
)|
(?P<subdomain_country>es|it)\.dplay\.com
- )/[^/]+/(?P<id>[^/]+/[^/?#]+)'''
+ )/[^/]+''' + _PATH_REGEX
_TESTS = [{
# non geo restricted, via secure api, unsigned download hls URL
@@ -151,56 +154,79 @@ class DPlayIE(InfoExtractor):
'only_matching': True,
}]
+ def _process_errors(self, e, geo_countries):
+ info = self._parse_json(e.cause.read().decode('utf-8'), None)
+ error = info['errors'][0]
+ error_code = error.get('code')
+ if error_code == 'access.denied.geoblocked':
+ self.raise_geo_restricted(countries=geo_countries)
+ elif error_code in ('access.denied.missingpackage', 'invalid.token'):
+ raise ExtractorError(
+ 'This video is only available for registered users. You may want to use --cookies.', expected=True)
+ raise ExtractorError(info['errors'][0]['detail'], expected=True)
+
+ def _update_disco_api_headers(self, headers, disco_base, display_id, realm):
+ headers['Authorization'] = 'Bearer ' + self._download_json(
+ disco_base + 'token', display_id, 'Downloading token',
+ query={
+ 'realm': realm,
+ })['data']['attributes']['token']
+
+ def _download_video_playback_info(self, disco_base, video_id, headers):
+ streaming = self._download_json(
+ disco_base + 'playback/videoPlaybackInfo/' + video_id,
+ video_id, headers=headers)['data']['attributes']['streaming']
+ streaming_list = []
+ for format_id, format_dict in streaming.items():
+ streaming_list.append({
+ 'type': format_id,
+ 'url': format_dict.get('url'),
+ })
+ return streaming_list
+
def _get_disco_api_info(self, url, display_id, disco_host, realm, country):
geo_countries = [country.upper()]
self._initialize_geo_bypass({
'countries': geo_countries,
})
disco_base = 'https://%s/' % disco_host
- token = self._download_json(
- disco_base + 'token', display_id, 'Downloading token',
- query={
- 'realm': realm,
- })['data']['attributes']['token']
headers = {
'Referer': url,
- 'Authorization': 'Bearer ' + token,
}
- video = self._download_json(
- disco_base + 'content/videos/' + display_id, display_id,
- headers=headers, query={
- 'fields[channel]': 'name',
- 'fields[image]': 'height,src,width',
- 'fields[show]': 'name',
- 'fields[tag]': 'name',
- 'fields[video]': 'description,episodeNumber,name,publishStart,seasonNumber,videoDuration',
- 'include': 'images,primaryChannel,show,tags'
- })
+ self._update_disco_api_headers(headers, disco_base, display_id, realm)
+ try:
+ video = self._download_json(
+ disco_base + 'content/videos/' + display_id, display_id,
+ headers=headers, query={
+ 'fields[channel]': 'name',
+ 'fields[image]': 'height,src,width',
+ 'fields[show]': 'name',
+ 'fields[tag]': 'name',
+ 'fields[video]': 'description,episodeNumber,name,publishStart,seasonNumber,videoDuration',
+ 'include': 'images,primaryChannel,show,tags'
+ })
+ except ExtractorError as e:
+ if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400:
+ self._process_errors(e, geo_countries)
+ raise
video_id = video['data']['id']
info = video['data']['attributes']
title = info['name'].strip()
formats = []
try:
- streaming = self._download_json(
- disco_base + 'playback/videoPlaybackInfo/' + video_id,
- display_id, headers=headers)['data']['attributes']['streaming']
+ streaming = self._download_video_playback_info(
+ disco_base, video_id, headers)
except ExtractorError as e:
if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
- info = self._parse_json(e.cause.read().decode('utf-8'), display_id)
- error = info['errors'][0]
- error_code = error.get('code')
- if error_code == 'access.denied.geoblocked':
- self.raise_geo_restricted(countries=geo_countries)
- elif error_code == 'access.denied.missingpackage':
- self.raise_login_required()
- raise ExtractorError(info['errors'][0]['detail'], expected=True)
+ self._process_errors(e, geo_countries)
raise
- for format_id, format_dict in streaming.items():
+ for format_dict in streaming:
if not isinstance(format_dict, dict):
continue
format_url = format_dict.get('url')
if not format_url:
continue
+ format_id = format_dict.get('type')
ext = determine_ext(format_url)
if format_id == 'dash' or ext == 'mpd':
formats.extend(self._extract_mpd_formats(
@@ -248,7 +274,7 @@ class DPlayIE(InfoExtractor):
'id': video_id,
'display_id': display_id,
'title': title,
- 'description': info.get('description'),
+ 'description': strip_or_none(info.get('description')),
'duration': float_or_none(info.get('videoDuration'), 1000),
'timestamp': unified_timestamp(info.get('publishStart')),
'series': series,
@@ -268,3 +294,76 @@ class DPlayIE(InfoExtractor):
host = 'disco-api.' + domain if domain[0] == 'd' else 'eu2-prod.disco-api.com'
return self._get_disco_api_info(
url, display_id, host, 'dplay' + country, country)
+
+
+class DiscoveryPlusIE(DPlayIE):
+ _VALID_URL = r'https?://(?:www\.)?discoveryplus\.com/video' + DPlayIE._PATH_REGEX
+ _TESTS = [{
+ 'url': 'https://www.discoveryplus.com/video/property-brothers-forever-home/food-and-family',
+ 'info_dict': {
+ 'id': '1140794',
+ 'display_id': 'property-brothers-forever-home/food-and-family',
+ 'ext': 'mp4',
+ 'title': 'Food and Family',
+ 'description': 'The brothers help a Richmond family expand their single-level home.',
+ 'duration': 2583.113,
+ 'timestamp': 1609304400,
+ 'upload_date': '20201230',
+ 'creator': 'HGTV',
+ 'series': 'Property Brothers: Forever Home',
+ 'season_number': 1,
+ 'episode_number': 1,
+ },
+ 'skip': 'Available for Premium users',
+ }]
+
+ def _update_disco_api_headers(self, headers, disco_base, display_id, realm):
+ headers['x-disco-client'] = 'WEB:UNKNOWN:dplus_us:15.0.0'
+
+ def _download_video_playback_info(self, disco_base, video_id, headers):
+ return self._download_json(
+ disco_base + 'playback/v3/videoPlaybackInfo',
+ video_id, headers=headers, data=json.dumps({
+ 'deviceInfo': {
+ 'adBlocker': False,
+ },
+ 'videoId': video_id,
+ 'wisteriaProperties': {
+ 'platform': 'desktop',
+ 'product': 'dplus_us',
+ },
+ }).encode('utf-8'))['data']['attributes']['streaming']
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ return self._get_disco_api_info(
+ url, display_id, 'us1-prod-direct.discoveryplus.com', 'go', 'us')
+
+
+class HGTVDeIE(DPlayIE):
+ _VALID_URL = r'https?://de\.hgtv\.com/sendungen' + DPlayIE._PATH_REGEX
+ _TESTS = [{
+ 'url': 'https://de.hgtv.com/sendungen/tiny-house-klein-aber-oho/wer-braucht-schon-eine-toilette/',
+ 'info_dict': {
+ 'id': '151205',
+ 'display_id': 'tiny-house-klein-aber-oho/wer-braucht-schon-eine-toilette',
+ 'ext': 'mp4',
+ 'title': 'Wer braucht schon eine Toilette',
+ 'description': 'md5:05b40a27e7aed2c9172de34d459134e2',
+ 'duration': 1177.024,
+ 'timestamp': 1595705400,
+ 'upload_date': '20200725',
+ 'creator': 'HGTV',
+ 'series': 'Tiny House - klein, aber oho',
+ 'season_number': 3,
+ 'episode_number': 3,
+ },
+ 'params': {
+ 'format': 'bestvideo',
+ },
+ }]
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ return self._get_disco_api_info(
+ url, display_id, 'eu1-prod.disco-api.com', 'hgtv', 'de')
diff --git a/youtube_dl/extractor/dreisat.py b/youtube_dl/extractor/dreisat.py
index 848d387d1..5a07c18f4 100644
--- a/youtube_dl/extractor/dreisat.py
+++ b/youtube_dl/extractor/dreisat.py
@@ -1,193 +1,43 @@
from __future__ import unicode_literals
-import re
+from .zdf import ZDFIE
-from .common import InfoExtractor
-from ..utils import (
- int_or_none,
- unified_strdate,
- xpath_text,
- determine_ext,
- float_or_none,
- ExtractorError,
-)
-
-class DreiSatIE(InfoExtractor):
+class DreiSatIE(ZDFIE):
IE_NAME = '3sat'
- _GEO_COUNTRIES = ['DE']
- _VALID_URL = r'https?://(?:www\.)?3sat\.de/mediathek/(?:(?:index|mediathek)\.php)?\?(?:(?:mode|display)=[^&]+&)*obj=(?P<id>[0-9]+)'
- _TESTS = [
- {
- 'url': 'http://www.3sat.de/mediathek/index.php?mode=play&obj=45918',
- 'md5': 'be37228896d30a88f315b638900a026e',
- 'info_dict': {
- 'id': '45918',
- 'ext': 'mp4',
- 'title': 'Waidmannsheil',
- 'description': 'md5:cce00ca1d70e21425e72c86a98a56817',
- 'uploader': 'SCHWEIZWEIT',
- 'uploader_id': '100000210',
- 'upload_date': '20140913'
- },
- 'params': {
- 'skip_download': True, # m3u8 downloads
- }
+ _VALID_URL = r'https?://(?:www\.)?3sat\.de/(?:[^/]+/)*(?P<id>[^/?#&]+)\.html'
+ _TESTS = [{
+ # Same as https://www.zdf.de/dokumentation/ab-18/10-wochen-sommer-102.html
+ 'url': 'https://www.3sat.de/film/ab-18/10-wochen-sommer-108.html',
+ 'md5': '0aff3e7bc72c8813f5e0fae333316a1d',
+ 'info_dict': {
+ 'id': '141007_ab18_10wochensommer_film',
+ 'ext': 'mp4',
+ 'title': 'Ab 18! - 10 Wochen Sommer',
+ 'description': 'md5:8253f41dc99ce2c3ff892dac2d65fe26',
+ 'duration': 2660,
+ 'timestamp': 1608604200,
+ 'upload_date': '20201222',
},
- {
- 'url': 'http://www.3sat.de/mediathek/mediathek.php?mode=play&obj=51066',
- 'only_matching': True,
+ }, {
+ 'url': 'https://www.3sat.de/gesellschaft/schweizweit/waidmannsheil-100.html',
+ 'info_dict': {
+ 'id': '140913_sendung_schweizweit',
+ 'ext': 'mp4',
+ 'title': 'Waidmannsheil',
+ 'description': 'md5:cce00ca1d70e21425e72c86a98a56817',
+ 'timestamp': 1410623100,
+ 'upload_date': '20140913'
},
- ]
-
- def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
- param_groups = {}
- for param_group in smil.findall(self._xpath_ns('./head/paramGroup', namespace)):
- group_id = param_group.get(self._xpath_ns(
- 'id', 'http://www.w3.org/XML/1998/namespace'))
- params = {}
- for param in param_group:
- params[param.get('name')] = param.get('value')
- param_groups[group_id] = params
-
- formats = []
- for video in smil.findall(self._xpath_ns('.//video', namespace)):
- src = video.get('src')
- if not src:
- continue
- bitrate = int_or_none(self._search_regex(r'_(\d+)k', src, 'bitrate', None)) or float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
- group_id = video.get('paramGroup')
- param_group = param_groups[group_id]
- for proto in param_group['protocols'].split(','):
- formats.append({
- 'url': '%s://%s' % (proto, param_group['host']),
- 'app': param_group['app'],
- 'play_path': src,
- 'ext': 'flv',
- 'format_id': '%s-%d' % (proto, bitrate),
- 'tbr': bitrate,
- })
- self._sort_formats(formats)
- return formats
-
- def extract_from_xml_url(self, video_id, xml_url):
- doc = self._download_xml(
- xml_url, video_id,
- note='Downloading video info',
- errnote='Failed to download video info')
-
- status_code = xpath_text(doc, './status/statuscode')
- if status_code and status_code != 'ok':
- if status_code == 'notVisibleAnymore':
- message = 'Video %s is not available' % video_id
- else:
- message = '%s returned error: %s' % (self.IE_NAME, status_code)
- raise ExtractorError(message, expected=True)
-
- title = xpath_text(doc, './/information/title', 'title', True)
-
- urls = []
- formats = []
- for fnode in doc.findall('.//formitaeten/formitaet'):
- video_url = xpath_text(fnode, 'url')
- if not video_url or video_url in urls:
- continue
- urls.append(video_url)
-
- is_available = 'http://www.metafilegenerator' not in video_url
- geoloced = 'static_geoloced_online' in video_url
- if not is_available or geoloced:
- continue
-
- format_id = fnode.attrib['basetype']
- format_m = re.match(r'''(?x)
- (?P<vcodec>[^_]+)_(?P<acodec>[^_]+)_(?P<container>[^_]+)_
- (?P<proto>[^_]+)_(?P<index>[^_]+)_(?P<indexproto>[^_]+)
- ''', format_id)
-
- ext = determine_ext(video_url, None) or format_m.group('container')
-
- if ext == 'meta':
- continue
- elif ext == 'smil':
- formats.extend(self._extract_smil_formats(
- video_url, video_id, fatal=False))
- elif ext == 'm3u8':
- # the certificates are misconfigured (see
- # https://github.com/ytdl-org/youtube-dl/issues/8665)
- if video_url.startswith('https://'):
- continue
- formats.extend(self._extract_m3u8_formats(
- video_url, video_id, 'mp4', 'm3u8_native',
- m3u8_id=format_id, fatal=False))
- elif ext == 'f4m':
- formats.extend(self._extract_f4m_formats(
- video_url, video_id, f4m_id=format_id, fatal=False))
- else:
- quality = xpath_text(fnode, './quality')
- if quality:
- format_id += '-' + quality
-
- abr = int_or_none(xpath_text(fnode, './audioBitrate'), 1000)
- vbr = int_or_none(xpath_text(fnode, './videoBitrate'), 1000)
-
- tbr = int_or_none(self._search_regex(
- r'_(\d+)k', video_url, 'bitrate', None))
- if tbr and vbr and not abr:
- abr = tbr - vbr
-
- formats.append({
- 'format_id': format_id,
- 'url': video_url,
- 'ext': ext,
- 'acodec': format_m.group('acodec'),
- 'vcodec': format_m.group('vcodec'),
- 'abr': abr,
- 'vbr': vbr,
- 'tbr': tbr,
- 'width': int_or_none(xpath_text(fnode, './width')),
- 'height': int_or_none(xpath_text(fnode, './height')),
- 'filesize': int_or_none(xpath_text(fnode, './filesize')),
- 'protocol': format_m.group('proto').lower(),
- })
-
- geolocation = xpath_text(doc, './/details/geolocation')
- if not formats and geolocation and geolocation != 'none':
- self.raise_geo_restricted(countries=self._GEO_COUNTRIES)
-
- self._sort_formats(formats)
-
- thumbnails = []
- for node in doc.findall('.//teaserimages/teaserimage'):
- thumbnail_url = node.text
- if not thumbnail_url:
- continue
- thumbnail = {
- 'url': thumbnail_url,
- }
- thumbnail_key = node.get('key')
- if thumbnail_key:
- m = re.match('^([0-9]+)x([0-9]+)$', thumbnail_key)
- if m:
- thumbnail['width'] = int(m.group(1))
- thumbnail['height'] = int(m.group(2))
- thumbnails.append(thumbnail)
-
- upload_date = unified_strdate(xpath_text(doc, './/details/airtime'))
-
- return {
- 'id': video_id,
- 'title': title,
- 'description': xpath_text(doc, './/information/detail'),
- 'duration': int_or_none(xpath_text(doc, './/details/lengthSec')),
- 'thumbnails': thumbnails,
- 'uploader': xpath_text(doc, './/details/originChannelTitle'),
- 'uploader_id': xpath_text(doc, './/details/originChannelId'),
- 'upload_date': upload_date,
- 'formats': formats,
+ 'params': {
+ 'skip_download': True,
}
-
- def _real_extract(self, url):
- video_id = self._match_id(url)
- details_url = 'http://www.3sat.de/mediathek/xmlservice/web/beitragsDetails?id=%s' % video_id
- return self.extract_from_xml_url(video_id, details_url)
+ }, {
+ # Same as https://www.zdf.de/filme/filme-sonstige/der-hauptmann-112.html
+ 'url': 'https://www.3sat.de/film/spielfilm/der-hauptmann-100.html',
+ 'only_matching': True,
+ }, {
+ # Same as https://www.zdf.de/wissen/nano/nano-21-mai-2019-102.html, equal media ids
+ 'url': 'https://www.3sat.de/wissen/nano/nano-21-mai-2019-102.html',
+ 'only_matching': True,
+ }]
diff --git a/youtube_dl/extractor/egghead.py b/youtube_dl/extractor/egghead.py
index df11dc206..9bbd703e0 100644
--- a/youtube_dl/extractor/egghead.py
+++ b/youtube_dl/extractor/egghead.py
@@ -12,26 +12,35 @@ from ..utils import (
)
-class EggheadCourseIE(InfoExtractor):
+class EggheadBaseIE(InfoExtractor):
+ def _call_api(self, path, video_id, resource, fatal=True):
+ return self._download_json(
+ 'https://app.egghead.io/api/v1/' + path,
+ video_id, 'Downloading %s JSON' % resource, fatal=fatal)
+
+
+class EggheadCourseIE(EggheadBaseIE):
IE_DESC = 'egghead.io course'
IE_NAME = 'egghead:course'
- _VALID_URL = r'https://egghead\.io/courses/(?P<id>[^/?#&]+)'
- _TEST = {
+ _VALID_URL = r'https://(?:app\.)?egghead\.io/(?:course|playlist)s/(?P<id>[^/?#&]+)'
+ _TESTS = [{
'url': 'https://egghead.io/courses/professor-frisby-introduces-composable-functional-javascript',
'playlist_count': 29,
'info_dict': {
- 'id': '72',
+ 'id': '432655',
'title': 'Professor Frisby Introduces Composable Functional JavaScript',
'description': 're:(?s)^This course teaches the ubiquitous.*You\'ll start composing functionality before you know it.$',
},
- }
+ }, {
+ 'url': 'https://app.egghead.io/playlists/professor-frisby-introduces-composable-functional-javascript',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
playlist_id = self._match_id(url)
-
- lessons = self._download_json(
- 'https://egghead.io/api/v1/series/%s/lessons' % playlist_id,
- playlist_id, 'Downloading course lessons JSON')
+ series_path = 'series/' + playlist_id
+ lessons = self._call_api(
+ series_path + '/lessons', playlist_id, 'course lessons')
entries = []
for lesson in lessons:
@@ -44,9 +53,8 @@ class EggheadCourseIE(InfoExtractor):
entries.append(self.url_result(
lesson_url, ie=EggheadLessonIE.ie_key(), video_id=lesson_id))
- course = self._download_json(
- 'https://egghead.io/api/v1/series/%s' % playlist_id,
- playlist_id, 'Downloading course JSON', fatal=False) or {}
+ course = self._call_api(
+ series_path, playlist_id, 'course', False) or {}
playlist_id = course.get('id')
if playlist_id:
@@ -57,10 +65,10 @@ class EggheadCourseIE(InfoExtractor):
course.get('description'))
-class EggheadLessonIE(InfoExtractor):
+class EggheadLessonIE(EggheadBaseIE):
IE_DESC = 'egghead.io lesson'
IE_NAME = 'egghead:lesson'
- _VALID_URL = r'https://egghead\.io/(?:api/v1/)?lessons/(?P<id>[^/?#&]+)'
+ _VALID_URL = r'https://(?:app\.)?egghead\.io/(?:api/v1/)?lessons/(?P<id>[^/?#&]+)'
_TESTS = [{
'url': 'https://egghead.io/lessons/javascript-linear-data-flow-with-container-style-types-box',
'info_dict': {
@@ -74,7 +82,7 @@ class EggheadLessonIE(InfoExtractor):
'upload_date': '20161209',
'duration': 304,
'view_count': 0,
- 'tags': ['javascript', 'free'],
+ 'tags': 'count:2',
},
'params': {
'skip_download': True,
@@ -83,13 +91,16 @@ class EggheadLessonIE(InfoExtractor):
}, {
'url': 'https://egghead.io/api/v1/lessons/react-add-redux-to-a-react-application',
'only_matching': True,
+ }, {
+ 'url': 'https://app.egghead.io/lessons/javascript-linear-data-flow-with-container-style-types-box',
+ 'only_matching': True,
}]
def _real_extract(self, url):
display_id = self._match_id(url)
- lesson = self._download_json(
- 'https://egghead.io/api/v1/lessons/%s' % display_id, display_id)
+ lesson = self._call_api(
+ 'lessons/' + display_id, display_id, 'lesson')
lesson_id = compat_str(lesson['id'])
title = lesson['title']
diff --git a/youtube_dl/extractor/eroprofile.py b/youtube_dl/extractor/eroprofile.py
index c08643a17..c460dc7f9 100644
--- a/youtube_dl/extractor/eroprofile.py
+++ b/youtube_dl/extractor/eroprofile.py
@@ -6,7 +6,7 @@ from .common import InfoExtractor
from ..compat import compat_urllib_parse_urlencode
from ..utils import (
ExtractorError,
- unescapeHTML
+ merge_dicts,
)
@@ -24,7 +24,8 @@ class EroProfileIE(InfoExtractor):
'title': 'sexy babe softcore',
'thumbnail': r're:https?://.*\.jpg',
'age_limit': 18,
- }
+ },
+ 'skip': 'Video not found',
}, {
'url': 'http://www.eroprofile.com/m/videos/view/Try-It-On-Pee_cut_2-wmv-4shared-com-file-sharing-download-movie-file',
'md5': '1baa9602ede46ce904c431f5418d8916',
@@ -77,19 +78,15 @@ class EroProfileIE(InfoExtractor):
[r"glbUpdViews\s*\('\d*','(\d+)'", r'p/report/video/(\d+)'],
webpage, 'video id', default=None)
- video_url = unescapeHTML(self._search_regex(
- r'<source src="([^"]+)', webpage, 'video url'))
title = self._html_search_regex(
- r'Title:</th><td>([^<]+)</td>', webpage, 'title')
- thumbnail = self._search_regex(
- r'onclick="showVideoPlayer\(\)"><img src="([^"]+)',
- webpage, 'thumbnail', fatal=False)
+ (r'Title:</th><td>([^<]+)</td>', r'<h1[^>]*>(.+?)</h1>'),
+ webpage, 'title')
+
+ info = self._parse_html5_media_entries(url, webpage, video_id)[0]
- return {
+ return merge_dicts(info, {
'id': video_id,
'display_id': display_id,
- 'url': video_url,
'title': title,
- 'thumbnail': thumbnail,
'age_limit': 18,
- }
+ })
diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index d403a2dbe..4e9954c6a 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -42,7 +42,10 @@ from .aljazeera import AlJazeeraIE
from .alphaporno import AlphaPornoIE
from .amara import AmaraIE
from .amcnetworks import AMCNetworksIE
-from .americastestkitchen import AmericasTestKitchenIE
+from .americastestkitchen import (
+ AmericasTestKitchenIE,
+ AmericasTestKitchenSeasonIE,
+)
from .animeondemand import AnimeOnDemandIE
from .anvato import AnvatoIE
from .aol import AolIE
@@ -69,6 +72,7 @@ from .arte import (
ArteTVEmbedIE,
ArteTVPlaylistIE,
)
+from .arnes import ArnesIE
from .asiancrush import (
AsianCrushIE,
AsianCrushPlaylistIE,
@@ -87,11 +91,13 @@ from .awaan import (
)
from .azmedien import AZMedienIE
from .baidu import BaiduVideoIE
+from .bandaichannel import BandaiChannelIE
from .bandcamp import BandcampIE, BandcampAlbumIE, BandcampWeeklyIE
from .bbc import (
BBCCoUkIE,
BBCCoUkArticleIE,
- BBCCoUkIPlayerPlaylistIE,
+ BBCCoUkIPlayerEpisodesIE,
+ BBCCoUkIPlayerGroupIE,
BBCCoUkPlaylistIE,
BBCIE,
)
@@ -126,7 +132,6 @@ from .bleacherreport import (
BleacherReportIE,
BleacherReportCMSIE,
)
-from .blinkx import BlinkxIE
from .bloomberg import BloombergIE
from .bokecc import BokeCCIE
from .bongacams import BongaCamsIE
@@ -160,6 +165,7 @@ from .canvas import (
CanvasIE,
CanvasEenIE,
VrtNUIE,
+ DagelijkseKostIE,
)
from .carambatv import (
CarambaTVIE,
@@ -184,7 +190,11 @@ from .cbsnews import (
CBSNewsIE,
CBSNewsLiveVideoIE,
)
-from .cbssports import CBSSportsIE
+from .cbssports import (
+ CBSSportsEmbedIE,
+ CBSSportsIE,
+ TwentyFourSevenSportsIE,
+)
from .ccc import (
CCCIE,
CCCPlaylistIE,
@@ -232,11 +242,8 @@ from .cnn import (
)
from .coub import CoubIE
from .comedycentral import (
- ComedyCentralFullEpisodesIE,
ComedyCentralIE,
- ComedyCentralShortnameIE,
ComedyCentralTVIE,
- ToshIE,
)
from .commonmistakes import CommonMistakesIE, UnicodeBOMIE
from .commonprotocols import (
@@ -287,7 +294,11 @@ from .douyutv import (
DouyuShowIE,
DouyuTVIE,
)
-from .dplay import DPlayIE
+from .dplay import (
+ DPlayIE,
+ DiscoveryPlusIE,
+ HGTVDeIE,
+)
from .dreisat import DreiSatIE
from .drbonanza import DRBonanzaIE
from .drtuber import DrTuberIE
@@ -416,6 +427,7 @@ from .gamestar import GameStarIE
from .gaskrank import GaskrankIE
from .gazeta import GazetaIE
from .gdcvault import GDCVaultIE
+from .gedidigital import GediDigitalIE
from .generic import GenericIE
from .gfycat import GfycatIE
from .giantbomb import GiantBombIE
@@ -470,8 +482,8 @@ from .hungama import (
from .hypem import HypemIE
from .ign import (
IGNIE,
- OneUPIE,
- PCMagIE,
+ IGNVideoIE,
+ IGNArticleIE,
)
from .iheart import (
IHeartRadioIE,
@@ -526,7 +538,10 @@ from .karaoketv import KaraoketvIE
from .karrierevideos import KarriereVideosIE
from .keezmovies import KeezMoviesIE
from .ketnet import KetnetIE
-from .khanacademy import KhanAcademyIE
+from .khanacademy import (
+ KhanAcademyIE,
+ KhanAcademyUnitIE,
+)
from .kickstarter import KickStarterIE
from .kinja import KinjaEmbedIE
from .kinopoisk import KinoPoiskIE
@@ -583,7 +598,11 @@ from .limelight import (
LimelightChannelIE,
LimelightChannelListIE,
)
-from .line import LineTVIE
+from .line import (
+ LineTVIE,
+ LineLiveIE,
+ LineLiveChannelIE,
+)
from .linkedin import (
LinkedInLearningIE,
LinkedInLearningCourseIE,
@@ -591,10 +610,6 @@ from .linkedin import (
from .linuxacademy import LinuxAcademyIE
from .litv import LiTVIE
from .livejournal import LiveJournalIE
-from .liveleak import (
- LiveLeakIE,
- LiveLeakEmbedIE,
-)
from .livestream import (
LivestreamIE,
LivestreamOriginalIE,
@@ -620,6 +635,7 @@ from .mangomolo import (
MangomoloLiveIE,
)
from .manyvids import ManyVidsIE
+from .maoritv import MaoriTVIE
from .markiza import (
MarkizaIE,
MarkizaPageIE,
@@ -648,6 +664,11 @@ from .microsoftvirtualacademy import (
MicrosoftVirtualAcademyIE,
MicrosoftVirtualAcademyCourseIE,
)
+from .minds import (
+ MindsIE,
+ MindsChannelIE,
+ MindsGroupIE,
+)
from .ministrygrid import MinistryGridIE
from .minoto import MinotoIE
from .miomio import MioMioIE
@@ -658,7 +679,10 @@ from .mixcloud import (
MixcloudUserIE,
MixcloudPlaylistIE,
)
-from .mlb import MLBIE
+from .mlb import (
+ MLBIE,
+ MLBVideoIE,
+)
from .mnet import MnetIE
from .moevideo import MoeVideoIE
from .mofosex import (
@@ -859,6 +883,11 @@ from .packtpub import (
PacktPubIE,
PacktPubCourseIE,
)
+from .palcomp3 import (
+ PalcoMP3IE,
+ PalcoMP3ArtistIE,
+ PalcoMP3VideoIE,
+)
from .pandoratv import PandoraTVIE
from .parliamentliveuk import ParliamentLiveUKIE
from .patreon import PatreonIE
@@ -892,6 +921,7 @@ from .platzi import (
from .playfm import PlayFMIE
from .playplustv import PlayPlusTVIE
from .plays import PlaysTVIE
+from .playstuff import PlayStuffIE
from .playtvak import PlaytvakIE
from .playvid import PlayvidIE
from .playwire import PlaywireIE
@@ -1016,6 +1046,7 @@ from .safari import (
SafariApiIE,
SafariCourseIE,
)
+from .samplefocus import SampleFocusIE
from .sapo import SapoIE
from .savefrom import SaveFromIE
from .sbs import SBSIE
@@ -1048,6 +1079,11 @@ from .shared import (
VivoIE,
)
from .showroomlive import ShowRoomLiveIE
+from .simplecast import (
+ SimplecastIE,
+ SimplecastEpisodeIE,
+ SimplecastPodcastIE,
+)
from .sina import SinaIE
from .sixplay import SixPlayIE
from .skyit import (
@@ -1113,6 +1149,10 @@ from .stitcher import (
from .sport5 import Sport5IE
from .sportbox import SportBoxIE
from .sportdeutschland import SportDeutschlandIE
+from .spotify import (
+ SpotifyIE,
+ SpotifyShowIE,
+)
from .spreaker import (
SpreakerIE,
SpreakerPageIE,
@@ -1128,6 +1168,11 @@ from .srgssr import (
from .srmediathek import SRMediathekIE
from .stanfordoc import StanfordOpenClassroomIE
from .steam import SteamIE
+from .storyfire import (
+ StoryFireIE,
+ StoryFireUserIE,
+ StoryFireSeriesIE,
+)
from .streamable import StreamableIE
from .streamcloud import StreamcloudIE
from .streamcz import StreamCZIE
@@ -1226,6 +1271,10 @@ from .toutv import TouTvIE
from .toypics import ToypicsUserIE, ToypicsIE
from .traileraddict import TrailerAddictIE
from .trilulilu import TriluliluIE
+from .trovo import (
+ TrovoIE,
+ TrovoVodIE,
+)
from .trunews import TruNewsIE
from .trutv import TruTVIE
from .tube8 import Tube8IE
@@ -1244,6 +1293,7 @@ from .tv2 import (
TV2IE,
TV2ArticleIE,
KatsomoIE,
+ MTVUutisetArticleIE,
)
from .tv2dk import (
TV2DKIE,
@@ -1382,7 +1432,6 @@ from .vidme import (
VidmeUserIE,
VidmeUserLikesIE,
)
-from .vidzi import VidziIE
from .vier import VierIE, VierVideosIE
from .viewlift import (
ViewLiftIE,
@@ -1442,6 +1491,7 @@ from .vrv import (
VRVSeriesIE,
)
from .vshare import VShareIE
+from .vtm import VTMIE
from .medialaan import MedialaanIE
from .vube import VubeIE
from .vuclip import VuClipIE
@@ -1585,5 +1635,10 @@ from .zattoo import (
ZattooLiveIE,
)
from .zdf import ZDFIE, ZDFChannelIE
-from .zingmp3 import ZingMp3IE
+from .zhihu import ZhihuIE
+from .zingmp3 import (
+ ZingMp3IE,
+ ZingMp3AlbumIE,
+)
+from .zoom import ZoomIE
from .zype import ZypeIE
diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py
index cb34c59f5..04650af39 100644
--- a/youtube_dl/extractor/facebook.py
+++ b/youtube_dl/extractor/facebook.py
@@ -521,7 +521,10 @@ class FacebookIE(InfoExtractor):
raise ExtractorError(
'The video is not available, Facebook said: "%s"' % m_msg.group(1),
expected=True)
- elif '>You must log in to continue' in webpage:
+ elif any(p in webpage for p in (
+ '>You must log in to continue',
+ 'id="login_form"',
+ 'id="loginbutton"')):
self.raise_login_required()
if not video_data and '/watchparty/' in url:
diff --git a/youtube_dl/extractor/formula1.py b/youtube_dl/extractor/formula1.py
index fecfc28ae..67662e6de 100644
--- a/youtube_dl/extractor/formula1.py
+++ b/youtube_dl/extractor/formula1.py
@@ -5,29 +5,23 @@ from .common import InfoExtractor
class Formula1IE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?formula1\.com/(?:content/fom-website/)?en/video/\d{4}/\d{1,2}/(?P<id>.+?)\.html'
- _TESTS = [{
- 'url': 'http://www.formula1.com/content/fom-website/en/video/2016/5/Race_highlights_-_Spain_2016.html',
- 'md5': '8c79e54be72078b26b89e0e111c0502b',
+ _VALID_URL = r'https?://(?:www\.)?formula1\.com/en/latest/video\.[^.]+\.(?P<id>\d+)\.html'
+ _TEST = {
+ 'url': 'https://www.formula1.com/en/latest/video.race-highlights-spain-2016.6060988138001.html',
+ 'md5': 'be7d3a8c2f804eb2ab2aa5d941c359f8',
'info_dict': {
- 'id': 'JvYXJpMzE6pArfHWm5ARp5AiUmD-gibV',
+ 'id': '6060988138001',
'ext': 'mp4',
'title': 'Race highlights - Spain 2016',
+ 'timestamp': 1463332814,
+ 'upload_date': '20160515',
+ 'uploader_id': '6057949432001',
},
- 'params': {
- # m3u8 download
- 'skip_download': True,
- },
- 'add_ie': ['Ooyala'],
- }, {
- 'url': 'http://www.formula1.com/en/video/2016/5/Race_highlights_-_Spain_2016.html',
- 'only_matching': True,
- }]
+ 'add_ie': ['BrightcoveNew'],
+ }
+ BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/6057949432001/S1WMrhjlh_default/index.html?videoId=%s'
def _real_extract(self, url):
- display_id = self._match_id(url)
- webpage = self._download_webpage(url, display_id)
- ooyala_embed_code = self._search_regex(
- r'data-videoid="([^"]+)"', webpage, 'ooyala embed code')
+ bc_id = self._match_id(url)
return self.url_result(
- 'ooyala:%s' % ooyala_embed_code, 'Ooyala', ooyala_embed_code)
+ self.BRIGHTCOVE_URL_TEMPLATE % bc_id, 'BrightcoveNew', bc_id)
diff --git a/youtube_dl/extractor/franceculture.py b/youtube_dl/extractor/franceculture.py
index 306b45fc9..14f4cb489 100644
--- a/youtube_dl/extractor/franceculture.py
+++ b/youtube_dl/extractor/franceculture.py
@@ -11,7 +11,7 @@ from ..utils import (
class FranceCultureIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?franceculture\.fr/emissions/(?:[^/]+/)*(?P<id>[^/?#&]+)'
- _TEST = {
+ _TESTS = [{
'url': 'http://www.franceculture.fr/emissions/carnet-nomade/rendez-vous-au-pays-des-geeks',
'info_dict': {
'id': 'rendez-vous-au-pays-des-geeks',
@@ -20,10 +20,14 @@ class FranceCultureIE(InfoExtractor):
'title': 'Rendez-vous au pays des geeks',
'thumbnail': r're:^https?://.*\.jpg$',
'upload_date': '20140301',
- 'timestamp': 1393642916,
+ 'timestamp': 1393700400,
'vcodec': 'none',
}
- }
+ }, {
+ # no thumbnail
+ 'url': 'https://www.franceculture.fr/emissions/la-recherche-montre-en-main/la-recherche-montre-en-main-du-mercredi-10-octobre-2018',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
display_id = self._match_id(url)
@@ -36,19 +40,19 @@ class FranceCultureIE(InfoExtractor):
</h1>|
<div[^>]+class="[^"]*?(?:title-zone-diffusion|heading-zone-(?:wrapper|player-button))[^"]*?"[^>]*>
).*?
- (<button[^>]+data-asset-source="[^"]+"[^>]+>)
+ (<button[^>]+data-(?:url|asset-source)="[^"]+"[^>]+>)
''',
webpage, 'video data'))
- video_url = video_data['data-asset-source']
- title = video_data.get('data-asset-title') or self._og_search_title(webpage)
+ video_url = video_data.get('data-url') or video_data['data-asset-source']
+ title = video_data.get('data-asset-title') or video_data.get('data-diffusion-title') or self._og_search_title(webpage)
description = self._html_search_regex(
r'(?s)<div[^>]+class="intro"[^>]*>.*?<h2>(.+?)</h2>',
webpage, 'description', default=None)
thumbnail = self._search_regex(
r'(?s)<figure[^>]+itemtype="https://schema.org/ImageObject"[^>]*>.*?<img[^>]+(?:data-dejavu-)?src="([^"]+)"',
- webpage, 'thumbnail', fatal=False)
+ webpage, 'thumbnail', default=None)
uploader = self._html_search_regex(
r'(?s)<span class="author">(.*?)</span>',
webpage, 'uploader', default=None)
@@ -64,6 +68,6 @@ class FranceCultureIE(InfoExtractor):
'ext': ext,
'vcodec': 'none' if ext == 'mp3' else None,
'uploader': uploader,
- 'timestamp': int_or_none(video_data.get('data-asset-created-date')),
+ 'timestamp': int_or_none(video_data.get('data-start-time')) or int_or_none(video_data.get('data-asset-created-date')),
'duration': int_or_none(video_data.get('data-duration')),
}
diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py
index 3ca415077..e4ec2e200 100644
--- a/youtube_dl/extractor/francetv.py
+++ b/youtube_dl/extractor/francetv.py
@@ -383,6 +383,10 @@ class FranceTVInfoIE(FranceTVBaseInfoExtractor):
}, {
'url': 'http://france3-regions.francetvinfo.fr/limousin/emissions/jt-1213-limousin',
'only_matching': True,
+ }, {
+ # "<figure id=" pattern (#28792)
+ 'url': 'https://www.francetvinfo.fr/culture/patrimoine/incendie-de-notre-dame-de-paris/notre-dame-de-paris-de-l-incendie-de-la-cathedrale-a-sa-reconstruction_4372291.html',
+ 'only_matching': True,
}]
def _real_extract(self, url):
@@ -399,7 +403,8 @@ class FranceTVInfoIE(FranceTVBaseInfoExtractor):
video_id = self._search_regex(
(r'player\.load[^;]+src:\s*["\']([^"\']+)',
r'id-video=([^@]+@[^"]+)',
- r'<a[^>]+href="(?:https?:)?//videos\.francetv\.fr/video/([^@]+@[^"]+)"'),
+ r'<a[^>]+href="(?:https?:)?//videos\.francetv\.fr/video/([^@]+@[^"]+)"',
+ r'(?:data-id|<figure[^<]+\bid)=["\']([\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'),
webpage, 'video id')
return self._make_url_result(video_id)
diff --git a/youtube_dl/extractor/fujitv.py b/youtube_dl/extractor/fujitv.py
index 39685e075..a02a94374 100644
--- a/youtube_dl/extractor/fujitv.py
+++ b/youtube_dl/extractor/fujitv.py
@@ -17,7 +17,7 @@ class FujiTVFODPlus7IE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
formats = self._extract_m3u8_formats(
- self._BASE_URL + 'abr/pc_html5/%s.m3u8' % video_id, video_id)
+ self._BASE_URL + 'abr/pc_html5/%s.m3u8' % video_id, video_id, 'mp4')
for f in formats:
wh = self._BITRATE_MAP.get(f.get('tbr'))
if wh:
diff --git a/youtube_dl/extractor/funimation.py b/youtube_dl/extractor/funimation.py
index 8bbedca26..d8f1e169a 100644
--- a/youtube_dl/extractor/funimation.py
+++ b/youtube_dl/extractor/funimation.py
@@ -16,7 +16,7 @@ from ..utils import (
class FunimationIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?funimation(?:\.com|now\.uk)/shows/[^/]+/(?P<id>[^/?#&]+)'
+ _VALID_URL = r'https?://(?:www\.)?funimation(?:\.com|now\.uk)/(?:[^/]+/)?shows/[^/]+/(?P<id>[^/?#&]+)'
_NETRC_MACHINE = 'funimation'
_TOKEN = None
@@ -51,6 +51,10 @@ class FunimationIE(InfoExtractor):
}, {
'url': 'https://www.funimationnow.uk/shows/puzzle-dragons-x/drop-impact/simulcast/',
'only_matching': True,
+ }, {
+ # with lang code
+ 'url': 'https://www.funimation.com/en/shows/hacksign/role-play/',
+ 'only_matching': True,
}]
def _login(self):
diff --git a/youtube_dl/extractor/gdcvault.py b/youtube_dl/extractor/gdcvault.py
index 2f555c1d4..acc6478b8 100644
--- a/youtube_dl/extractor/gdcvault.py
+++ b/youtube_dl/extractor/gdcvault.py
@@ -6,6 +6,7 @@ from .common import InfoExtractor
from .kaltura import KalturaIE
from ..utils import (
HEADRequest,
+ remove_start,
sanitized_Request,
smuggle_url,
urlencode_postdata,
@@ -102,6 +103,26 @@ class GDCVaultIE(InfoExtractor):
'format': 'mp4-408',
},
},
+ {
+ # Kaltura embed, whitespace between quote and embedded URL in iframe's src
+ 'url': 'https://www.gdcvault.com/play/1025699',
+ 'info_dict': {
+ 'id': '0_zagynv0a',
+ 'ext': 'mp4',
+ 'title': 'Tech Toolbox',
+ 'upload_date': '20190408',
+ 'uploader_id': 'joe@blazestreaming.com',
+ 'timestamp': 1554764629,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ },
+ {
+ # HTML5 video
+ 'url': 'http://www.gdcvault.com/play/1014846/Conference-Keynote-Shigeru',
+ 'only_matching': True,
+ },
]
def _login(self, webpage_url, display_id):
@@ -175,7 +196,18 @@ class GDCVaultIE(InfoExtractor):
xml_name = self._html_search_regex(
r'<iframe src=".*?\?xml(?:=|URL=xml/)(.+?\.xml).*?".*?</iframe>',
- start_page, 'xml filename')
+ start_page, 'xml filename', default=None)
+ if not xml_name:
+ info = self._parse_html5_media_entries(url, start_page, video_id)[0]
+ info.update({
+ 'title': remove_start(self._search_regex(
+ r'>Session Name:\s*<.*?>\s*<td>(.+?)</td>', start_page,
+ 'title', default=None) or self._og_search_title(
+ start_page, default=None), 'GDC Vault - '),
+ 'id': video_id,
+ 'display_id': display_id,
+ })
+ return info
embed_url = '%s/xml/%s' % (xml_root, xml_name)
ie_key = 'DigitallySpeaking'
diff --git a/youtube_dl/extractor/gedidigital.py b/youtube_dl/extractor/gedidigital.py
new file mode 100644
index 000000000..6c4153b40
--- /dev/null
+++ b/youtube_dl/extractor/gedidigital.py
@@ -0,0 +1,161 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ determine_ext,
+ int_or_none,
+)
+
+
+class GediDigitalIE(InfoExtractor):
+ _VALID_URL = r'''(?x)https?://video\.
+ (?:
+ (?:
+ (?:espresso\.)?repubblica
+ |lastampa
+ |ilsecoloxix
+ )|
+ (?:
+ iltirreno
+ |messaggeroveneto
+ |ilpiccolo
+ |gazzettadimantova
+ |mattinopadova
+ |laprovinciapavese
+ |tribunatreviso
+ |nuovavenezia
+ |gazzettadimodena
+ |lanuovaferrara
+ |corrierealpi
+ |lasentinella
+ )\.gelocal
+ )\.it(?:/[^/]+){2,3}?/(?P<id>\d+)(?:[/?&#]|$)'''
+ _TESTS = [{
+ 'url': 'https://video.lastampa.it/politica/il-paradosso-delle-regionali-la-lega-vince-ma-sembra-aver-perso/121559/121683',
+ 'md5': '84658d7fb9e55a6e57ecc77b73137494',
+ 'info_dict': {
+ 'id': '121559',
+ 'ext': 'mp4',
+ 'title': 'Il paradosso delle Regionali: ecco perché la Lega vince ma sembra aver perso',
+ 'description': 'md5:de7f4d6eaaaf36c153b599b10f8ce7ca',
+ 'thumbnail': r're:^https://www\.repstatic\.it/video/photo/.+?-thumb-full-.+?\.jpg$',
+ 'duration': 125,
+ },
+ }, {
+ 'url': 'https://video.espresso.repubblica.it/embed/tutti-i-video/01-ted-villa/14772/14870&width=640&height=360',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://video.repubblica.it/motori/record-della-pista-a-spa-francorchamps-la-pagani-huayra-roadster-bc-stupisce/367415/367963',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://video.ilsecoloxix.it/sport/cassani-e-i-brividi-azzurri-ai-mondiali-di-imola-qui-mi-sono-innamorato-del-ciclismo-da-ragazzino-incredibile-tornarci-da-ct/66184/66267',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://video.iltirreno.gelocal.it/sport/dentro-la-notizia-ferrari-cosa-succede-a-maranello/141059/142723',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://video.messaggeroveneto.gelocal.it/locale/maria-giovanna-elmi-covid-vaccino/138155/139268',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://video.ilpiccolo.gelocal.it/dossier/big-john/dinosauro-big-john-al-via-le-visite-guidate-a-trieste/135226/135751',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://video.gazzettadimantova.gelocal.it/locale/dal-ponte-visconteo-di-valeggio-l-and-8217sos-dei-ristoratori-aprire-anche-a-cena/137310/137818',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://video.mattinopadova.gelocal.it/dossier/coronavirus-in-veneto/covid-a-vo-un-anno-dopo-un-cuore-tricolore-per-non-dimenticare/138402/138964',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://video.laprovinciapavese.gelocal.it/locale/mede-zona-rossa-via-alle-vaccinazioni-per-gli-over-80/137545/138120',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://video.tribunatreviso.gelocal.it/dossier/coronavirus-in-veneto/ecco-le-prima-vaccinazioni-di-massa-nella-marca/134485/135024',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://video.nuovavenezia.gelocal.it/locale/camion-troppo-alto-per-il-ponte-ferroviario-perde-il-carico/135734/136266',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://video.gazzettadimodena.gelocal.it/locale/modena-scoperta-la-proteina-che-predice-il-livello-di-gravita-del-covid/139109/139796',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://video.lanuovaferrara.gelocal.it/locale/due-bombole-di-gpl-aperte-e-abbandonate-i-vigili-bruciano-il-gas/134391/134957',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://video.corrierealpi.gelocal.it/dossier/cortina-2021-i-mondiali-di-sci-alpino/mondiali-di-sci-il-timelapse-sulla-splendida-olympia/133760/134331',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://video.lasentinella.gelocal.it/locale/vestigne-centra-un-auto-e-si-ribalta/138931/139466',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://video.espresso.repubblica.it/tutti-i-video/01-ted-villa/14772',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+ title = self._html_search_meta(
+ ['twitter:title', 'og:title'], webpage, fatal=True)
+ player_data = re.findall(
+ r"PlayerFactory\.setParam\('(?P<type>format|param)',\s*'(?P<name>[^']+)',\s*'(?P<val>[^']+)'\);",
+ webpage)
+
+ formats = []
+ duration = thumb = None
+ for t, n, v in player_data:
+ if t == 'format':
+ if n in ('video-hds-vod-ec', 'video-hls-vod-ec', 'video-viralize', 'video-youtube-pfp'):
+ continue
+ elif n.endswith('-vod-ak'):
+ formats.extend(self._extract_akamai_formats(
+ v, video_id, {'http': 'media.gedidigital.it'}))
+ else:
+ ext = determine_ext(v)
+ if ext == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ v, video_id, 'mp4', 'm3u8_native', m3u8_id=n, fatal=False))
+ continue
+ f = {
+ 'format_id': n,
+ 'url': v,
+ }
+ if ext == 'mp3':
+ abr = int_or_none(self._search_regex(
+ r'-mp3-audio-(\d+)', v, 'abr', default=None))
+ f.update({
+ 'abr': abr,
+ 'tbr': abr,
+ 'vcodec': 'none'
+ })
+ else:
+ mobj = re.match(r'^video-rrtv-(\d+)(?:-(\d+))?$', n)
+ if mobj:
+ f.update({
+ 'height': int(mobj.group(1)),
+ 'vbr': int_or_none(mobj.group(2)),
+ })
+ if not f.get('vbr'):
+ f['vbr'] = int_or_none(self._search_regex(
+ r'-video-rrtv-(\d+)', v, 'abr', default=None))
+ formats.append(f)
+ elif t == 'param':
+ if n in ['image_full', 'image']:
+ thumb = v
+ elif n == 'videoDuration':
+ duration = int_or_none(v)
+
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': self._html_search_meta(
+ ['twitter:description', 'og:description', 'description'], webpage),
+ 'thumbnail': thumb or self._og_search_thumbnail(webpage),
+ 'formats': formats,
+ 'duration': duration,
+ }
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index 780971a92..a9c064105 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -84,7 +84,6 @@ from .jwplatform import JWPlatformIE
from .digiteka import DigitekaIE
from .arkena import ArkenaIE
from .instagram import InstagramIE
-from .liveleak import LiveLeakIE
from .threeqsdn import ThreeQSDNIE
from .theplatform import ThePlatformIE
from .kaltura import KalturaIE
@@ -126,8 +125,11 @@ from .viqeo import ViqeoIE
from .expressen import ExpressenIE
from .zype import ZypeIE
from .odnoklassniki import OdnoklassnikiIE
+from .vk import VKIE
from .kinja import KinjaEmbedIE
from .arcpublishing import ArcPublishingIE
+from .medialaan import MedialaanIE
+from .simplecast import SimplecastIE
class GenericIE(InfoExtractor):
@@ -1626,31 +1628,6 @@ class GenericIE(InfoExtractor):
'upload_date': '20160409',
},
},
- # LiveLeak embed
- {
- 'url': 'http://www.wykop.pl/link/3088787/',
- 'md5': '7619da8c820e835bef21a1efa2a0fc71',
- 'info_dict': {
- 'id': '874_1459135191',
- 'ext': 'mp4',
- 'title': 'Man shows poor quality of new apartment building',
- 'description': 'The wall is like a sand pile.',
- 'uploader': 'Lake8737',
- },
- 'add_ie': [LiveLeakIE.ie_key()],
- },
- # Another LiveLeak embed pattern (#13336)
- {
- 'url': 'https://milo.yiannopoulos.net/2017/06/concealed-carry-robbery/',
- 'info_dict': {
- 'id': '2eb_1496309988',
- 'ext': 'mp4',
- 'title': 'Thief robs place where everyone was armed',
- 'description': 'md5:694d73ee79e535953cf2488562288eee',
- 'uploader': 'brazilwtf',
- },
- 'add_ie': [LiveLeakIE.ie_key()],
- },
# Duplicated embedded video URLs
{
'url': 'http://www.hudl.com/athlete/2538180/highlights/149298443',
@@ -2223,6 +2200,34 @@ class GenericIE(InfoExtractor):
'duration': 1581,
},
},
+ {
+ # MyChannels SDK embed
+ # https://www.24kitchen.nl/populair/deskundige-dit-waarom-sommigen-gevoelig-zijn-voor-voedselallergieen
+ 'url': 'https://www.demorgen.be/nieuws/burgemeester-rotterdam-richt-zich-in-videoboodschap-tot-relschoppers-voelt-het-goed~b0bcfd741/',
+ 'md5': '90c0699c37006ef18e198c032d81739c',
+ 'info_dict': {
+ 'id': '194165',
+ 'ext': 'mp4',
+ 'title': 'Burgemeester Aboutaleb spreekt relschoppers toe',
+ 'timestamp': 1611740340,
+ 'upload_date': '20210127',
+ 'duration': 159,
+ },
+ },
+ {
+ # Simplecast player embed
+ 'url': 'https://www.bio.org/podcast',
+ 'info_dict': {
+ 'id': 'podcast',
+ 'title': 'I AM BIO Podcast | BIO',
+ },
+ 'playlist_mincount': 52,
+ },
+ {
+ # Sibnet embed (https://help.sibnet.ru/?sibnet_video_embed)
+ 'url': 'https://phpbb3.x-tk.ru/bbcode-video-sibnet-t24.html',
+ 'only_matching': True,
+ },
]
def report_following_redirect(self, new_url):
@@ -2462,6 +2467,9 @@ class GenericIE(InfoExtractor):
webpage = self._webpage_read_content(
full_response, url, video_id, prefix=first_bytes)
+ if '<title>DPG Media Privacy Gate</title>' in webpage:
+ webpage = self._download_webpage(url, video_id)
+
self.report_extraction(video_id)
# Is it an RSS feed, a SMIL file, an XSPF playlist or a MPD manifest?
@@ -2593,6 +2601,11 @@ class GenericIE(InfoExtractor):
if arc_urls:
return self.playlist_from_matches(arc_urls, video_id, video_title, ie=ArcPublishingIE.ie_key())
+ mychannels_urls = MedialaanIE._extract_urls(webpage)
+ if mychannels_urls:
+ return self.playlist_from_matches(
+ mychannels_urls, video_id, video_title, ie=MedialaanIE.ie_key())
+
# Look for embedded rtl.nl player
matches = re.findall(
r'<iframe[^>]+?src="((?:https?:)?//(?:(?:www|static)\.)?rtl\.nl/(?:system/videoplayer/[^"]+(?:video_)?)?embed[^"]+)"',
@@ -2744,6 +2757,11 @@ class GenericIE(InfoExtractor):
if odnoklassniki_url:
return self.url_result(odnoklassniki_url, OdnoklassnikiIE.ie_key())
+ # Look for sibnet embedded player
+ sibnet_urls = VKIE._extract_sibnet_urls(webpage)
+ if sibnet_urls:
+ return self.playlist_from_matches(sibnet_urls, video_id, video_title)
+
# Look for embedded ivi player
mobj = re.search(r'<embed[^>]+?src=(["\'])(?P<url>https?://(?:www\.)?ivi\.ru/video/player.+?)\1', webpage)
if mobj is not None:
@@ -2769,6 +2787,12 @@ class GenericIE(InfoExtractor):
return self.playlist_from_matches(
matches, video_id, video_title, getter=unescapeHTML, ie='FunnyOrDie')
+ # Look for Simplecast embeds
+ simplecast_urls = SimplecastIE._extract_urls(webpage)
+ if simplecast_urls:
+ return self.playlist_from_matches(
+ simplecast_urls, video_id, video_title)
+
# Look for BBC iPlayer embed
matches = re.findall(r'setPlaylist\("(https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)', webpage)
if matches:
@@ -2914,7 +2938,7 @@ class GenericIE(InfoExtractor):
webpage)
if not mobj:
mobj = re.search(
- r'data-video-link=["\'](?P<url>http://m.mlb.com/video/[^"\']+)',
+ r'data-video-link=["\'](?P<url>http://m\.mlb\.com/video/[^"\']+)',
webpage)
if mobj is not None:
return self.url_result(mobj.group('url'), 'MLB')
@@ -3129,11 +3153,6 @@ class GenericIE(InfoExtractor):
return self.url_result(
self._proto_relative_url(instagram_embed_url), InstagramIE.ie_key())
- # Look for LiveLeak embeds
- liveleak_urls = LiveLeakIE._extract_urls(webpage)
- if liveleak_urls:
- return self.playlist_from_matches(liveleak_urls, video_id, video_title)
-
# Look for 3Q SDN embeds
threeqsdn_url = ThreeQSDNIE._extract_url(webpage)
if threeqsdn_url:
@@ -3361,6 +3380,9 @@ class GenericIE(InfoExtractor):
'url': src,
'ext': (mimetype2ext(src_type)
or ext if ext in KNOWN_EXTENSIONS else 'mp4'),
+ 'http_headers': {
+ 'Referer': full_response.geturl(),
+ },
})
if formats:
self._sort_formats(formats)
@@ -3429,7 +3451,7 @@ class GenericIE(InfoExtractor):
m_video_type = re.findall(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
# We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
if m_video_type is not None:
- found = filter_video(re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage))
+ found = filter_video(re.findall(r'<meta.*?property="og:(?:video|audio)".*?content="(.*?)"', webpage))
if not found:
REDIRECT_REGEX = r'[0-9]{,2};\s*(?:URL|url)=\'?([^\'"]+)'
found = re.search(
diff --git a/youtube_dl/extractor/go.py b/youtube_dl/extractor/go.py
index 0d731e90a..878ba14e6 100644
--- a/youtube_dl/extractor/go.py
+++ b/youtube_dl/extractor/go.py
@@ -4,10 +4,12 @@ from __future__ import unicode_literals
import re
from .adobepass import AdobePassIE
+from ..compat import compat_str
from ..utils import (
int_or_none,
determine_ext,
parse_age_limit,
+ try_get,
urlencode_postdata,
ExtractorError,
)
@@ -117,6 +119,18 @@ class GoIE(AdobePassIE):
'skip_download': True,
},
}, {
+ 'url': 'https://abc.com/shows/modern-family/episode-guide/season-01/101-pilot',
+ 'info_dict': {
+ 'id': 'VDKA22600213',
+ 'ext': 'mp4',
+ 'title': 'Pilot',
+ 'description': 'md5:74306df917cfc199d76d061d66bebdb4',
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }, {
'url': 'http://abc.go.com/shows/the-catch/episode-guide/season-01/10-the-wedding',
'only_matching': True,
}, {
@@ -149,14 +163,30 @@ class GoIE(AdobePassIE):
brand = site_info.get('brand')
if not video_id or not site_info:
webpage = self._download_webpage(url, display_id or video_id)
- video_id = self._search_regex(
- (
- # There may be inner quotes, e.g. data-video-id="'VDKA3609139'"
- # from http://freeform.go.com/shows/shadowhunters/episodes/season-2/1-this-guilty-blood
- r'data-video-id=["\']*(VDKA\w+)',
- # https://abc.com/shows/the-rookie/episode-guide/season-02/03-the-bet
- r'\b(?:video)?id["\']\s*:\s*["\'](VDKA\w+)'
- ), webpage, 'video id', default=video_id)
+ data = self._parse_json(
+ self._search_regex(
+ r'["\']__abc_com__["\']\s*\]\s*=\s*({.+?})\s*;', webpage,
+ 'data', default='{}'),
+ display_id or video_id, fatal=False)
+ # https://abc.com/shows/modern-family/episode-guide/season-01/101-pilot
+ layout = try_get(data, lambda x: x['page']['content']['video']['layout'], dict)
+ video_id = None
+ if layout:
+ video_id = try_get(
+ layout,
+ (lambda x: x['videoid'], lambda x: x['video']['id']),
+ compat_str)
+ if not video_id:
+ video_id = self._search_regex(
+ (
+ # There may be inner quotes, e.g. data-video-id="'VDKA3609139'"
+ # from http://freeform.go.com/shows/shadowhunters/episodes/season-2/1-this-guilty-blood
+ r'data-video-id=["\']*(VDKA\w+)',
+ # page.analytics.videoIdCode
+ r'\bvideoIdCode["\']\s*:\s*["\']((?:vdka|VDKA)\w+)',
+ # https://abc.com/shows/the-rookie/episode-guide/season-02/03-the-bet
+ r'\b(?:video)?id["\']\s*:\s*["\'](VDKA\w+)'
+ ), webpage, 'video id', default=video_id)
if not site_info:
brand = self._search_regex(
(r'data-brand=\s*["\']\s*(\d+)',
diff --git a/youtube_dl/extractor/googledrive.py b/youtube_dl/extractor/googledrive.py
index de8c80e36..3f2de00f1 100644
--- a/youtube_dl/extractor/googledrive.py
+++ b/youtube_dl/extractor/googledrive.py
@@ -7,6 +7,7 @@ from ..compat import compat_parse_qs
from ..utils import (
determine_ext,
ExtractorError,
+ get_element_by_class,
int_or_none,
lowercase_escape,
try_get,
@@ -237,7 +238,7 @@ class GoogleDriveIE(InfoExtractor):
if confirmation_webpage:
confirm = self._search_regex(
r'confirm=([^&"\']+)', confirmation_webpage,
- 'confirmation code', fatal=False)
+ 'confirmation code', default=None)
if confirm:
confirmed_source_url = update_url_query(source_url, {
'confirm': confirm,
@@ -245,6 +246,11 @@ class GoogleDriveIE(InfoExtractor):
urlh = request_source_file(confirmed_source_url, 'confirmed source')
if urlh and urlh.headers.get('Content-Disposition'):
add_source_format(urlh)
+ else:
+ self.report_warning(
+ get_element_by_class('uc-error-subcaption', confirmation_webpage)
+ or get_element_by_class('uc-error-caption', confirmation_webpage)
+ or 'unable to extract confirmation code')
if not formats and reason:
raise ExtractorError(reason, expected=True)
diff --git a/youtube_dl/extractor/ign.py b/youtube_dl/extractor/ign.py
index a96ea8010..0d9f50ed2 100644
--- a/youtube_dl/extractor/ign.py
+++ b/youtube_dl/extractor/ign.py
@@ -3,230 +3,255 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..compat import (
+ compat_parse_qs,
+ compat_urllib_parse_urlparse,
+)
from ..utils import (
+ HEADRequest,
+ determine_ext,
int_or_none,
parse_iso8601,
+ strip_or_none,
+ try_get,
)
-class IGNIE(InfoExtractor):
+class IGNBaseIE(InfoExtractor):
+ def _call_api(self, slug):
+ return self._download_json(
+ 'http://apis.ign.com/{0}/v3/{0}s/slug/{1}'.format(self._PAGE_TYPE, slug), slug)
+
+
+class IGNIE(IGNBaseIE):
"""
Extractor for some of the IGN sites, like www.ign.com, es.ign.com de.ign.com.
Some videos of it.ign.com are also supported
"""
- _VALID_URL = r'https?://.+?\.ign\.com/(?:[^/]+/)?(?P<type>videos|show_videos|articles|feature|(?:[^/]+/\d+/video))(/.+)?/(?P<name_or_id>.+)'
+ _VALID_URL = r'https?://(?:.+?\.ign|www\.pcmag)\.com/videos/(?:\d{4}/\d{2}/\d{2}/)?(?P<id>[^/?&#]+)'
IE_NAME = 'ign.com'
+ _PAGE_TYPE = 'video'
- _API_URL_TEMPLATE = 'http://apis.ign.com/video/v3/videos/%s'
- _EMBED_RE = r'<iframe[^>]+?["\']((?:https?:)?//.+?\.ign\.com.+?/embed.+?)["\']'
-
- _TESTS = [
- {
- 'url': 'http://www.ign.com/videos/2013/06/05/the-last-of-us-review',
- 'md5': 'febda82c4bafecd2d44b6e1a18a595f8',
- 'info_dict': {
- 'id': '8f862beef863986b2785559b9e1aa599',
- 'ext': 'mp4',
- 'title': 'The Last of Us Review',
- 'description': 'md5:c8946d4260a4d43a00d5ae8ed998870c',
- 'timestamp': 1370440800,
- 'upload_date': '20130605',
- 'uploader_id': 'cberidon@ign.com',
- }
- },
- {
- 'url': 'http://me.ign.com/en/feature/15775/100-little-things-in-gta-5-that-will-blow-your-mind',
- 'info_dict': {
- 'id': '100-little-things-in-gta-5-that-will-blow-your-mind',
- },
- 'playlist': [
- {
- 'info_dict': {
- 'id': '5ebbd138523268b93c9141af17bec937',
- 'ext': 'mp4',
- 'title': 'GTA 5 Video Review',
- 'description': 'Rockstar drops the mic on this generation of games. Watch our review of the masterly Grand Theft Auto V.',
- 'timestamp': 1379339880,
- 'upload_date': '20130916',
- 'uploader_id': 'danieljkrupa@gmail.com',
- },
- },
- {
- 'info_dict': {
- 'id': '638672ee848ae4ff108df2a296418ee2',
- 'ext': 'mp4',
- 'title': '26 Twisted Moments from GTA 5 in Slow Motion',
- 'description': 'The twisted beauty of GTA 5 in stunning slow motion.',
- 'timestamp': 1386878820,
- 'upload_date': '20131212',
- 'uploader_id': 'togilvie@ign.com',
- },
- },
- ],
- 'params': {
- 'skip_download': True,
- },
- },
- {
- 'url': 'http://www.ign.com/articles/2014/08/15/rewind-theater-wild-trailer-gamescom-2014?watch',
- 'md5': '618fedb9c901fd086f6f093564ef8558',
- 'info_dict': {
- 'id': '078fdd005f6d3c02f63d795faa1b984f',
- 'ext': 'mp4',
- 'title': 'Rewind Theater - Wild Trailer Gamescom 2014',
- 'description': 'Brian and Jared explore Michel Ancel\'s captivating new preview.',
- 'timestamp': 1408047180,
- 'upload_date': '20140814',
- 'uploader_id': 'jamesduggan1990@gmail.com',
- },
- },
- {
- 'url': 'http://me.ign.com/en/videos/112203/video/how-hitman-aims-to-be-different-than-every-other-s',
- 'only_matching': True,
- },
- {
- 'url': 'http://me.ign.com/ar/angry-birds-2/106533/video/lrd-ldyy-lwl-lfylm-angry-birds',
- 'only_matching': True,
- },
- {
- # videoId pattern
- 'url': 'http://www.ign.com/articles/2017/06/08/new-ducktales-short-donalds-birthday-doesnt-go-as-planned',
- 'only_matching': True,
- },
- ]
-
- def _find_video_id(self, webpage):
- res_id = [
- r'"video_id"\s*:\s*"(.*?)"',
- r'class="hero-poster[^"]*?"[^>]*id="(.+?)"',
- r'data-video-id="(.+?)"',
- r'<object id="vid_(.+?)"',
- r'<meta name="og:image" content=".*/(.+?)-(.+?)/.+.jpg"',
- r'videoId&quot;\s*:\s*&quot;(.+?)&quot;',
- r'videoId["\']\s*:\s*["\']([^"\']+?)["\']',
- ]
- return self._search_regex(res_id, webpage, 'video id', default=None)
+ _TESTS = [{
+ 'url': 'http://www.ign.com/videos/2013/06/05/the-last-of-us-review',
+ 'md5': 'd2e1586d9987d40fad7867bf96a018ea',
+ 'info_dict': {
+ 'id': '8f862beef863986b2785559b9e1aa599',
+ 'ext': 'mp4',
+ 'title': 'The Last of Us Review',
+ 'description': 'md5:c8946d4260a4d43a00d5ae8ed998870c',
+ 'timestamp': 1370440800,
+ 'upload_date': '20130605',
+ 'tags': 'count:9',
+ }
+ }, {
+ 'url': 'http://www.pcmag.com/videos/2015/01/06/010615-whats-new-now-is-gogo-snooping-on-your-data',
+ 'md5': 'f1581a6fe8c5121be5b807684aeac3f6',
+ 'info_dict': {
+ 'id': 'ee10d774b508c9b8ec07e763b9125b91',
+ 'ext': 'mp4',
+ 'title': 'What\'s New Now: Is GoGo Snooping on Your Data?',
+ 'description': 'md5:817a20299de610bd56f13175386da6fa',
+ 'timestamp': 1420571160,
+ 'upload_date': '20150106',
+ 'tags': 'count:4',
+ }
+ }, {
+ 'url': 'https://www.ign.com/videos/is-a-resident-evil-4-remake-on-the-way-ign-daily-fix',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- name_or_id = mobj.group('name_or_id')
- page_type = mobj.group('type')
- webpage = self._download_webpage(url, name_or_id)
- if page_type != 'video':
- multiple_urls = re.findall(
- r'<param name="flashvars"[^>]*value="[^"]*?url=(https?://www\.ign\.com/videos/.*?)["&]',
- webpage)
- if multiple_urls:
- entries = [self.url_result(u, ie='IGN') for u in multiple_urls]
- return {
- '_type': 'playlist',
- 'id': name_or_id,
- 'entries': entries,
- }
-
- video_id = self._find_video_id(webpage)
- if not video_id:
- return self.url_result(self._search_regex(
- self._EMBED_RE, webpage, 'embed url'))
- return self._get_video_info(video_id)
-
- def _get_video_info(self, video_id):
- api_data = self._download_json(
- self._API_URL_TEMPLATE % video_id, video_id)
+ display_id = self._match_id(url)
+ video = self._call_api(display_id)
+ video_id = video['videoId']
+ metadata = video['metadata']
+ title = metadata.get('longTitle') or metadata.get('title') or metadata['name']
formats = []
- m3u8_url = api_data['refs'].get('m3uUrl')
+ refs = video.get('refs') or {}
+
+ m3u8_url = refs.get('m3uUrl')
if m3u8_url:
formats.extend(self._extract_m3u8_formats(
m3u8_url, video_id, 'mp4', 'm3u8_native',
m3u8_id='hls', fatal=False))
- f4m_url = api_data['refs'].get('f4mUrl')
+
+ f4m_url = refs.get('f4mUrl')
if f4m_url:
formats.extend(self._extract_f4m_formats(
f4m_url, video_id, f4m_id='hds', fatal=False))
- for asset in api_data['assets']:
+
+ for asset in (video.get('assets') or []):
+ asset_url = asset.get('url')
+ if not asset_url:
+ continue
formats.append({
- 'url': asset['url'],
- 'tbr': asset.get('actual_bitrate_kbps'),
- 'fps': asset.get('frame_rate'),
+ 'url': asset_url,
+ 'tbr': int_or_none(asset.get('bitrate'), 1000),
+ 'fps': int_or_none(asset.get('frame_rate')),
'height': int_or_none(asset.get('height')),
'width': int_or_none(asset.get('width')),
})
+
+ mezzanine_url = try_get(video, lambda x: x['system']['mezzanineUrl'])
+ if mezzanine_url:
+ formats.append({
+ 'ext': determine_ext(mezzanine_url, 'mp4'),
+ 'format_id': 'mezzanine',
+ 'preference': 1,
+ 'url': mezzanine_url,
+ })
+
self._sort_formats(formats)
- thumbnails = [{
- 'url': thumbnail['url']
- } for thumbnail in api_data.get('thumbnails', [])]
+ thumbnails = []
+ for thumbnail in (video.get('thumbnails') or []):
+ thumbnail_url = thumbnail.get('url')
+ if not thumbnail_url:
+ continue
+ thumbnails.append({
+ 'url': thumbnail_url,
+ })
- metadata = api_data['metadata']
+ tags = []
+ for tag in (video.get('tags') or []):
+ display_name = tag.get('displayName')
+ if not display_name:
+ continue
+ tags.append(display_name)
return {
- 'id': api_data.get('videoId') or video_id,
- 'title': metadata.get('longTitle') or metadata.get('name') or metadata.get['title'],
- 'description': metadata.get('description'),
+ 'id': video_id,
+ 'title': title,
+ 'description': strip_or_none(metadata.get('description')),
'timestamp': parse_iso8601(metadata.get('publishDate')),
'duration': int_or_none(metadata.get('duration')),
- 'display_id': metadata.get('slug') or video_id,
- 'uploader_id': metadata.get('creator'),
+ 'display_id': display_id,
'thumbnails': thumbnails,
'formats': formats,
+ 'tags': tags,
}
-class OneUPIE(IGNIE):
- _VALID_URL = r'https?://gamevideos\.1up\.com/(?P<type>video)/id/(?P<name_or_id>.+)\.html'
- IE_NAME = '1up.com'
-
+class IGNVideoIE(InfoExtractor):
+ _VALID_URL = r'https?://.+?\.ign\.com/(?:[a-z]{2}/)?[^/]+/(?P<id>\d+)/(?:video|trailer)/'
_TESTS = [{
- 'url': 'http://gamevideos.1up.com/video/id/34976.html',
- 'md5': 'c9cc69e07acb675c31a16719f909e347',
+ 'url': 'http://me.ign.com/en/videos/112203/video/how-hitman-aims-to-be-different-than-every-other-s',
+ 'md5': 'dd9aca7ed2657c4e118d8b261e5e9de1',
'info_dict': {
- 'id': '34976',
+ 'id': 'e9be7ea899a9bbfc0674accc22a36cc8',
'ext': 'mp4',
- 'title': 'Sniper Elite V2 - Trailer',
- 'description': 'md5:bf0516c5ee32a3217aa703e9b1bc7826',
- 'timestamp': 1313099220,
- 'upload_date': '20110811',
- 'uploader_id': 'IGN',
+ 'title': 'How Hitman Aims to Be Different Than Every Other Stealth Game - NYCC 2015',
+ 'description': 'Taking out assassination targets in Hitman has never been more stylish.',
+ 'timestamp': 1444665600,
+ 'upload_date': '20151012',
}
+ }, {
+ 'url': 'http://me.ign.com/ar/angry-birds-2/106533/video/lrd-ldyy-lwl-lfylm-angry-birds',
+ 'only_matching': True,
+ }, {
+ # Youtube embed
+ 'url': 'https://me.ign.com/ar/ratchet-clank-rift-apart/144327/trailer/embed',
+ 'only_matching': True,
+ }, {
+ # Twitter embed
+ 'url': 'http://adria.ign.com/sherlock-season-4/9687/trailer/embed',
+ 'only_matching': True,
+ }, {
+ # Vimeo embed
+ 'url': 'https://kr.ign.com/bic-2018/3307/trailer/embed',
+ 'only_matching': True,
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- result = super(OneUPIE, self)._real_extract(url)
- result['id'] = mobj.group('name_or_id')
- return result
-
-
-class PCMagIE(IGNIE):
- _VALID_URL = r'https?://(?:www\.)?pcmag\.com/(?P<type>videos|article2)(/.+)?/(?P<name_or_id>.+)'
- IE_NAME = 'pcmag'
+ video_id = self._match_id(url)
+ req = HEADRequest(url.rsplit('/', 1)[0] + '/embed')
+ url = self._request_webpage(req, video_id).geturl()
+ ign_url = compat_parse_qs(
+ compat_urllib_parse_urlparse(url).query).get('url', [None])[0]
+ if ign_url:
+ return self.url_result(ign_url, IGNIE.ie_key())
+ return self.url_result(url)
- _EMBED_RE = r'iframe\.setAttribute\("src",\s*__util.objToUrlString\("http://widgets\.ign\.com/video/embed/content\.html?[^"]*url=([^"]+)["&]'
+class IGNArticleIE(IGNBaseIE):
+ _VALID_URL = r'https?://.+?\.ign\.com/(?:articles(?:/\d{4}/\d{2}/\d{2})?|(?:[a-z]{2}/)?feature/\d+)/(?P<id>[^/?&#]+)'
+ _PAGE_TYPE = 'article'
_TESTS = [{
- 'url': 'http://www.pcmag.com/videos/2015/01/06/010615-whats-new-now-is-gogo-snooping-on-your-data',
- 'md5': '212d6154fd0361a2781075f1febbe9ad',
+ 'url': 'http://me.ign.com/en/feature/15775/100-little-things-in-gta-5-that-will-blow-your-mind',
'info_dict': {
- 'id': 'ee10d774b508c9b8ec07e763b9125b91',
- 'ext': 'mp4',
- 'title': '010615_What\'s New Now: Is GoGo Snooping on Your Data?',
- 'description': 'md5:a7071ae64d2f68cc821c729d4ded6bb3',
- 'timestamp': 1420571160,
- 'upload_date': '20150106',
- 'uploader_id': 'cozzipix@gmail.com',
- }
+ 'id': '524497489e4e8ff5848ece34',
+ 'title': '100 Little Things in GTA 5 That Will Blow Your Mind',
+ },
+ 'playlist': [
+ {
+ 'info_dict': {
+ 'id': '5ebbd138523268b93c9141af17bec937',
+ 'ext': 'mp4',
+ 'title': 'GTA 5 Video Review',
+ 'description': 'Rockstar drops the mic on this generation of games. Watch our review of the masterly Grand Theft Auto V.',
+ 'timestamp': 1379339880,
+ 'upload_date': '20130916',
+ },
+ },
+ {
+ 'info_dict': {
+ 'id': '638672ee848ae4ff108df2a296418ee2',
+ 'ext': 'mp4',
+ 'title': '26 Twisted Moments from GTA 5 in Slow Motion',
+ 'description': 'The twisted beauty of GTA 5 in stunning slow motion.',
+ 'timestamp': 1386878820,
+ 'upload_date': '20131212',
+ },
+ },
+ ],
+ 'params': {
+ 'playlist_items': '2-3',
+ 'skip_download': True,
+ },
}, {
- 'url': 'http://www.pcmag.com/article2/0,2817,2470156,00.asp',
- 'md5': '94130c1ca07ba0adb6088350681f16c1',
+ 'url': 'http://www.ign.com/articles/2014/08/15/rewind-theater-wild-trailer-gamescom-2014?watch',
'info_dict': {
- 'id': '042e560ba94823d43afcb12ddf7142ca',
- 'ext': 'mp4',
- 'title': 'HTC\'s Weird New Re Camera - What\'s New Now',
- 'description': 'md5:53433c45df96d2ea5d0fda18be2ca908',
- 'timestamp': 1412953920,
- 'upload_date': '20141010',
- 'uploader_id': 'chris_snyder@pcmag.com',
- }
+ 'id': '53ee806780a81ec46e0790f8',
+ 'title': 'Rewind Theater - Wild Trailer Gamescom 2014',
+ },
+ 'playlist_count': 2,
+ }, {
+ # videoId pattern
+ 'url': 'http://www.ign.com/articles/2017/06/08/new-ducktales-short-donalds-birthday-doesnt-go-as-planned',
+ 'only_matching': True,
+ }, {
+ # Youtube embed
+ 'url': 'https://www.ign.com/articles/2021-mvp-named-in-puppy-bowl-xvii',
+ 'only_matching': True,
+ }, {
+ # IMDB embed
+ 'url': 'https://www.ign.com/articles/2014/08/07/sons-of-anarchy-final-season-trailer',
+ 'only_matching': True,
+ }, {
+ # Facebook embed
+ 'url': 'https://www.ign.com/articles/2017/09/20/marvels-the-punisher-watch-the-new-trailer-for-the-netflix-series',
+ 'only_matching': True,
+ }, {
+ # Brightcove embed
+ 'url': 'https://www.ign.com/articles/2016/01/16/supergirl-goes-flying-with-martian-manhunter-in-new-clip',
+ 'only_matching': True,
}]
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ article = self._call_api(display_id)
+
+ def entries():
+ media_url = try_get(article, lambda x: x['mediaRelations'][0]['media']['metadata']['url'])
+ if media_url:
+ yield self.url_result(media_url, IGNIE.ie_key())
+ for content in (article.get('content') or []):
+ for video_url in re.findall(r'(?:\[(?:ignvideo\s+url|youtube\s+clip_id)|<iframe[^>]+src)="([^"]+)"', content):
+ yield self.url_result(video_url)
+
+ return self.playlist_result(
+ entries(), article.get('articleId'),
+ strip_or_none(try_get(article, lambda x: x['metadata']['headline'])))
diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py
index 1eeddc3b6..12e10143c 100644
--- a/youtube_dl/extractor/instagram.py
+++ b/youtube_dl/extractor/instagram.py
@@ -12,6 +12,7 @@ from ..compat import (
)
from ..utils import (
ExtractorError,
+ float_or_none,
get_element_by_attribute,
int_or_none,
lowercase_escape,
@@ -32,6 +33,7 @@ class InstagramIE(InfoExtractor):
'title': 'Video by naomipq',
'description': 'md5:1f17f0ab29bd6fe2bfad705f58de3cb8',
'thumbnail': r're:^https?://.*\.jpg',
+ 'duration': 0,
'timestamp': 1371748545,
'upload_date': '20130620',
'uploader_id': 'naomipq',
@@ -48,6 +50,7 @@ class InstagramIE(InfoExtractor):
'ext': 'mp4',
'title': 'Video by britneyspears',
'thumbnail': r're:^https?://.*\.jpg',
+ 'duration': 0,
'timestamp': 1453760977,
'upload_date': '20160125',
'uploader_id': 'britneyspears',
@@ -87,6 +90,24 @@ class InstagramIE(InfoExtractor):
'description': 'md5:0f9203fc6a2ce4d228da5754bcf54957',
},
}, {
+ # IGTV
+ 'url': 'https://www.instagram.com/tv/BkfuX9UB-eK/',
+ 'info_dict': {
+ 'id': 'BkfuX9UB-eK',
+ 'ext': 'mp4',
+ 'title': 'Fingerboarding Tricks with @cass.fb',
+ 'thumbnail': r're:^https?://.*\.jpg',
+ 'duration': 53.83,
+ 'timestamp': 1530032919,
+ 'upload_date': '20180626',
+ 'uploader_id': 'instagram',
+ 'uploader': 'Instagram',
+ 'like_count': int,
+ 'comment_count': int,
+ 'comments': list,
+ 'description': 'Meet Cass Hirst (@cass.fb), a fingerboarding pro who can perform tiny ollies and kickflips while blindfolded.',
+ }
+ }, {
'url': 'https://instagram.com/p/-Cmh1cukG2/',
'only_matching': True,
}, {
@@ -159,7 +180,9 @@ class InstagramIE(InfoExtractor):
description = try_get(
media, lambda x: x['edge_media_to_caption']['edges'][0]['node']['text'],
compat_str) or media.get('caption')
+ title = media.get('title')
thumbnail = media.get('display_src') or media.get('display_url')
+ duration = float_or_none(media.get('video_duration'))
timestamp = int_or_none(media.get('taken_at_timestamp') or media.get('date'))
uploader = media.get('owner', {}).get('full_name')
uploader_id = media.get('owner', {}).get('username')
@@ -200,9 +223,10 @@ class InstagramIE(InfoExtractor):
continue
entries.append({
'id': node.get('shortcode') or node['id'],
- 'title': 'Video %d' % edge_num,
+ 'title': node.get('title') or 'Video %d' % edge_num,
'url': node_video_url,
'thumbnail': node.get('display_url'),
+ 'duration': float_or_none(node.get('video_duration')),
'width': int_or_none(try_get(node, lambda x: x['dimensions']['width'])),
'height': int_or_none(try_get(node, lambda x: x['dimensions']['height'])),
'view_count': int_or_none(node.get('video_view_count')),
@@ -239,8 +263,9 @@ class InstagramIE(InfoExtractor):
'id': video_id,
'formats': formats,
'ext': 'mp4',
- 'title': 'Video by %s' % uploader_id,
+ 'title': title or 'Video by %s' % uploader_id,
'description': description,
+ 'duration': duration,
'thumbnail': thumbnail,
'timestamp': timestamp,
'uploader_id': uploader_id,
diff --git a/youtube_dl/extractor/jamendo.py b/youtube_dl/extractor/jamendo.py
index 490efa8fb..1db7c64af 100644
--- a/youtube_dl/extractor/jamendo.py
+++ b/youtube_dl/extractor/jamendo.py
@@ -29,34 +29,51 @@ class JamendoIE(InfoExtractor):
'id': '196219',
'display_id': 'stories-from-emona-i',
'ext': 'flac',
- 'title': 'Maya Filipič - Stories from Emona I',
- 'artist': 'Maya Filipič',
+ # 'title': 'Maya Filipič - Stories from Emona I',
+ 'title': 'Stories from Emona I',
+ # 'artist': 'Maya Filipič',
'track': 'Stories from Emona I',
'duration': 210,
'thumbnail': r're:^https?://.*\.jpg',
'timestamp': 1217438117,
'upload_date': '20080730',
+ 'license': 'by-nc-nd',
+ 'view_count': int,
+ 'like_count': int,
+ 'average_rating': int,
+ 'tags': ['piano', 'peaceful', 'newage', 'strings', 'upbeat'],
}
}, {
'url': 'https://licensing.jamendo.com/en/track/1496667/energetic-rock',
'only_matching': True,
}]
+ def _call_api(self, resource, resource_id):
+ path = '/api/%ss' % resource
+ rand = compat_str(random.random())
+ return self._download_json(
+ 'https://www.jamendo.com' + path, resource_id, query={
+ 'id[]': resource_id,
+ }, headers={
+ 'X-Jam-Call': '$%s*%s~' % (hashlib.sha1((path + rand).encode()).hexdigest(), rand)
+ })[0]
+
def _real_extract(self, url):
track_id, display_id = self._VALID_URL_RE.match(url).groups()
- webpage = self._download_webpage(
- 'https://www.jamendo.com/track/' + track_id, track_id)
- models = self._parse_json(self._html_search_regex(
- r"data-bundled-models='([^']+)",
- webpage, 'bundled models'), track_id)
- track = models['track']['models'][0]
+ # webpage = self._download_webpage(
+ # 'https://www.jamendo.com/track/' + track_id, track_id)
+ # models = self._parse_json(self._html_search_regex(
+ # r"data-bundled-models='([^']+)",
+ # webpage, 'bundled models'), track_id)
+ # track = models['track']['models'][0]
+ track = self._call_api('track', track_id)
title = track_name = track['name']
- get_model = lambda x: try_get(models, lambda y: y[x]['models'][0], dict) or {}
- artist = get_model('artist')
- artist_name = artist.get('name')
- if artist_name:
- title = '%s - %s' % (artist_name, title)
- album = get_model('album')
+ # get_model = lambda x: try_get(models, lambda y: y[x]['models'][0], dict) or {}
+ # artist = get_model('artist')
+ # artist_name = artist.get('name')
+ # if artist_name:
+ # title = '%s - %s' % (artist_name, title)
+ # album = get_model('album')
formats = [{
'url': 'https://%s.jamendo.com/?trackid=%s&format=%s&from=app-97dab294'
@@ -74,7 +91,7 @@ class JamendoIE(InfoExtractor):
urls = []
thumbnails = []
- for _, covers in track.get('cover', {}).items():
+ for covers in (track.get('cover') or {}).values():
for cover_id, cover_url in covers.items():
if not cover_url or cover_url in urls:
continue
@@ -88,13 +105,14 @@ class JamendoIE(InfoExtractor):
})
tags = []
- for tag in track.get('tags', []):
+ for tag in (track.get('tags') or []):
tag_name = tag.get('name')
if not tag_name:
continue
tags.append(tag_name)
stats = track.get('stats') or {}
+ license = track.get('licenseCC') or []
return {
'id': track_id,
@@ -103,11 +121,11 @@ class JamendoIE(InfoExtractor):
'title': title,
'description': track.get('description'),
'duration': int_or_none(track.get('duration')),
- 'artist': artist_name,
+ # 'artist': artist_name,
'track': track_name,
- 'album': album.get('name'),
+ # 'album': album.get('name'),
'formats': formats,
- 'license': '-'.join(track.get('licenseCC', [])) or None,
+ 'license': '-'.join(license) if license else None,
'timestamp': int_or_none(track.get('dateCreated')),
'view_count': int_or_none(stats.get('listenedAll')),
'like_count': int_or_none(stats.get('favorited')),
@@ -116,9 +134,9 @@ class JamendoIE(InfoExtractor):
}
-class JamendoAlbumIE(InfoExtractor):
+class JamendoAlbumIE(JamendoIE):
_VALID_URL = r'https?://(?:www\.)?jamendo\.com/album/(?P<id>[0-9]+)'
- _TEST = {
+ _TESTS = [{
'url': 'https://www.jamendo.com/album/121486/duck-on-cover',
'info_dict': {
'id': '121486',
@@ -151,17 +169,7 @@ class JamendoAlbumIE(InfoExtractor):
'params': {
'playlistend': 2
}
- }
-
- def _call_api(self, resource, resource_id):
- path = '/api/%ss' % resource
- rand = compat_str(random.random())
- return self._download_json(
- 'https://www.jamendo.com' + path, resource_id, query={
- 'id[]': resource_id,
- }, headers={
- 'X-Jam-Call': '$%s*%s~' % (hashlib.sha1((path + rand).encode()).hexdigest(), rand)
- })[0]
+ }]
def _real_extract(self, url):
album_id = self._match_id(url)
@@ -169,7 +177,7 @@ class JamendoAlbumIE(InfoExtractor):
album_name = album.get('name')
entries = []
- for track in album.get('tracks', []):
+ for track in (album.get('tracks') or []):
track_id = track.get('id')
if not track_id:
continue
diff --git a/youtube_dl/extractor/kakao.py b/youtube_dl/extractor/kakao.py
index 32935bb28..31ce7a85c 100644
--- a/youtube_dl/extractor/kakao.py
+++ b/youtube_dl/extractor/kakao.py
@@ -3,10 +3,13 @@
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..compat import compat_str
+from ..compat import compat_HTTPError
from ..utils import (
+ ExtractorError,
int_or_none,
+ str_or_none,
strip_or_none,
+ try_get,
unified_timestamp,
update_url_query,
)
@@ -23,7 +26,7 @@ class KakaoIE(InfoExtractor):
'id': '301965083',
'ext': 'mp4',
'title': '乃木坂46 バナナマン 「3期生紹介コーナーが始動!顔高低差GPも!」 『乃木坂工事中』',
- 'uploader_id': 2671005,
+ 'uploader_id': '2671005',
'uploader': '그랑그랑이',
'timestamp': 1488160199,
'upload_date': '20170227',
@@ -36,11 +39,15 @@ class KakaoIE(InfoExtractor):
'ext': 'mp4',
'description': '러블리즈 - Destiny (나의 지구) (Lovelyz - Destiny)\r\n\r\n[쇼! 음악중심] 20160611, 507회',
'title': '러블리즈 - Destiny (나의 지구) (Lovelyz - Destiny)',
- 'uploader_id': 2653210,
+ 'uploader_id': '2653210',
'uploader': '쇼! 음악중심',
'timestamp': 1485684628,
'upload_date': '20170129',
}
+ }, {
+ # geo restricted
+ 'url': 'https://tv.kakao.com/channel/3643855/cliplink/412069491',
+ 'only_matching': True,
}]
def _real_extract(self, url):
@@ -68,8 +75,7 @@ class KakaoIE(InfoExtractor):
'fields': ','.join([
'-*', 'tid', 'clipLink', 'displayTitle', 'clip', 'title',
'description', 'channelId', 'createTime', 'duration', 'playCount',
- 'likeCount', 'commentCount', 'tagList', 'channel', 'name',
- 'clipChapterThumbnailList', 'thumbnailUrl', 'timeInSec', 'isDefault',
+ 'likeCount', 'commentCount', 'tagList', 'channel', 'name', 'thumbnailUrl',
'videoOutputList', 'width', 'height', 'kbps', 'profile', 'label'])
}
@@ -82,24 +88,28 @@ class KakaoIE(InfoExtractor):
title = clip.get('title') or clip_link.get('displayTitle')
- query['tid'] = impress.get('tid', '')
+ query.update({
+ 'fields': '-*,code,message,url',
+ 'tid': impress.get('tid') or '',
+ })
formats = []
- for fmt in clip.get('videoOutputList', []):
+ for fmt in (clip.get('videoOutputList') or []):
try:
profile_name = fmt['profile']
if profile_name == 'AUDIO':
continue
- query.update({
- 'profile': profile_name,
- 'fields': '-*,url',
- })
- fmt_url_json = self._download_json(
- api_base + 'raw/videolocation', display_id,
- 'Downloading video URL for profile %s' % profile_name,
- query=query, headers=player_header, fatal=False)
-
- if fmt_url_json is None:
+ query['profile'] = profile_name
+ try:
+ fmt_url_json = self._download_json(
+ api_base + 'raw/videolocation', display_id,
+ 'Downloading video URL for profile %s' % profile_name,
+ query=query, headers=player_header)
+ except ExtractorError as e:
+ if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
+ resp = self._parse_json(e.cause.read().decode(), video_id)
+ if resp.get('code') == 'GeoBlocked':
+ self.raise_geo_restricted()
continue
fmt_url = fmt_url_json['url']
@@ -116,27 +126,13 @@ class KakaoIE(InfoExtractor):
pass
self._sort_formats(formats)
- thumbs = []
- for thumb in clip.get('clipChapterThumbnailList', []):
- thumbs.append({
- 'url': thumb.get('thumbnailUrl'),
- 'id': compat_str(thumb.get('timeInSec')),
- 'preference': -1 if thumb.get('isDefault') else 0
- })
- top_thumbnail = clip.get('thumbnailUrl')
- if top_thumbnail:
- thumbs.append({
- 'url': top_thumbnail,
- 'preference': 10,
- })
-
return {
'id': display_id,
'title': title,
'description': strip_or_none(clip.get('description')),
- 'uploader': clip_link.get('channel', {}).get('name'),
- 'uploader_id': clip_link.get('channelId'),
- 'thumbnails': thumbs,
+ 'uploader': try_get(clip_link, lambda x: x['channel']['name']),
+ 'uploader_id': str_or_none(clip_link.get('channelId')),
+ 'thumbnail': clip.get('thumbnailUrl'),
'timestamp': unified_timestamp(clip_link.get('createTime')),
'duration': int_or_none(clip.get('duration')),
'view_count': int_or_none(clip.get('playCount')),
diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py
index 49d13460d..c731612c4 100644
--- a/youtube_dl/extractor/kaltura.py
+++ b/youtube_dl/extractor/kaltura.py
@@ -120,7 +120,7 @@ class KalturaIE(InfoExtractor):
def _extract_urls(webpage):
# Embed codes: https://knowledge.kaltura.com/embedding-kaltura-media-players-your-site
finditer = (
- re.finditer(
+ list(re.finditer(
r"""(?xs)
kWidget\.(?:thumb)?[Ee]mbed\(
\{.*?
@@ -128,8 +128,8 @@ class KalturaIE(InfoExtractor):
(?P<q2>['"])_?(?P<partner_id>(?:(?!(?P=q2)).)+)(?P=q2),.*?
(?P<q3>['"])entry_?[Ii]d(?P=q3)\s*:\s*
(?P<q4>['"])(?P<id>(?:(?!(?P=q4)).)+)(?P=q4)(?:,|\s*\})
- """, webpage)
- or re.finditer(
+ """, webpage))
+ or list(re.finditer(
r'''(?xs)
(?P<q1>["'])
(?:https?:)?//cdnapi(?:sec)?\.kaltura\.com(?::\d+)?/(?:(?!(?P=q1)).)*\b(?:p|partner_id)/(?P<partner_id>\d+)(?:(?!(?P=q1)).)*
@@ -142,16 +142,16 @@ class KalturaIE(InfoExtractor):
\[\s*(?P<q2_1>["'])entry_?[Ii]d(?P=q2_1)\s*\]\s*=\s*
)
(?P<q3>["'])(?P<id>(?:(?!(?P=q3)).)+)(?P=q3)
- ''', webpage)
- or re.finditer(
+ ''', webpage))
+ or list(re.finditer(
r'''(?xs)
- <(?:iframe[^>]+src|meta[^>]+\bcontent)=(?P<q1>["'])
+ <(?:iframe[^>]+src|meta[^>]+\bcontent)=(?P<q1>["'])\s*
(?:https?:)?//(?:(?:www|cdnapi(?:sec)?)\.)?kaltura\.com/(?:(?!(?P=q1)).)*\b(?:p|partner_id)/(?P<partner_id>\d+)
(?:(?!(?P=q1)).)*
[?&;]entry_id=(?P<id>(?:(?!(?P=q1))[^&])+)
(?:(?!(?P=q1)).)*
(?P=q1)
- ''', webpage)
+ ''', webpage))
)
urls = []
for mobj in finditer:
diff --git a/youtube_dl/extractor/khanacademy.py b/youtube_dl/extractor/khanacademy.py
index 61739efa7..87e520378 100644
--- a/youtube_dl/extractor/khanacademy.py
+++ b/youtube_dl/extractor/khanacademy.py
@@ -1,82 +1,107 @@
from __future__ import unicode_literals
-import re
+import json
from .common import InfoExtractor
from ..utils import (
- unified_strdate,
+ int_or_none,
+ parse_iso8601,
+ try_get,
)
-class KhanAcademyIE(InfoExtractor):
- _VALID_URL = r'^https?://(?:(?:www|api)\.)?khanacademy\.org/(?P<key>[^/]+)/(?:[^/]+/){,2}(?P<id>[^?#/]+)(?:$|[?#])'
- IE_NAME = 'KhanAcademy'
+class KhanAcademyBaseIE(InfoExtractor):
+ _VALID_URL_TEMPL = r'https?://(?:www\.)?khanacademy\.org/(?P<id>(?:[^/]+/){%s}%s[^?#/&]+)'
- _TESTS = [{
- 'url': 'http://www.khanacademy.org/video/one-time-pad',
- 'md5': '7b391cce85e758fb94f763ddc1bbb979',
+ def _parse_video(self, video):
+ return {
+ '_type': 'url_transparent',
+ 'url': video['youtubeId'],
+ 'id': video.get('slug'),
+ 'title': video.get('title'),
+ 'thumbnail': video.get('imageUrl') or video.get('thumbnailUrl'),
+ 'duration': int_or_none(video.get('duration')),
+ 'description': video.get('description'),
+ 'ie_key': 'Youtube',
+ }
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ component_props = self._parse_json(self._download_json(
+ 'https://www.khanacademy.org/api/internal/graphql',
+ display_id, query={
+ 'hash': 1604303425,
+ 'variables': json.dumps({
+ 'path': display_id,
+ 'queryParams': '',
+ }),
+ })['data']['contentJson'], display_id)['componentProps']
+ return self._parse_component_props(component_props)
+
+
+class KhanAcademyIE(KhanAcademyBaseIE):
+ IE_NAME = 'khanacademy'
+ _VALID_URL = KhanAcademyBaseIE._VALID_URL_TEMPL % ('4', 'v/')
+ _TEST = {
+ 'url': 'https://www.khanacademy.org/computing/computer-science/cryptography/crypt/v/one-time-pad',
+ 'md5': '9c84b7b06f9ebb80d22a5c8dedefb9a0',
'info_dict': {
- 'id': 'one-time-pad',
- 'ext': 'webm',
+ 'id': 'FlIG3TvQCBQ',
+ 'ext': 'mp4',
'title': 'The one-time pad',
'description': 'The perfect cipher',
'duration': 176,
'uploader': 'Brit Cruise',
'uploader_id': 'khanacademy',
'upload_date': '20120411',
+ 'timestamp': 1334170113,
+ 'license': 'cc-by-nc-sa',
},
'add_ie': ['Youtube'],
- }, {
- 'url': 'https://www.khanacademy.org/math/applied-math/cryptography',
+ }
+
+ def _parse_component_props(self, component_props):
+ video = component_props['tutorialPageData']['contentModel']
+ info = self._parse_video(video)
+ author_names = video.get('authorNames')
+ info.update({
+ 'uploader': ', '.join(author_names) if author_names else None,
+ 'timestamp': parse_iso8601(video.get('dateAdded')),
+ 'license': video.get('kaUserLicense'),
+ })
+ return info
+
+
+class KhanAcademyUnitIE(KhanAcademyBaseIE):
+ IE_NAME = 'khanacademy:unit'
+ _VALID_URL = (KhanAcademyBaseIE._VALID_URL_TEMPL % ('2', '')) + '/?(?:[?#&]|$)'
+ _TEST = {
+ 'url': 'https://www.khanacademy.org/computing/computer-science/cryptography',
'info_dict': {
'id': 'cryptography',
- 'title': 'Journey into cryptography',
+ 'title': 'Cryptography',
'description': 'How have humans protected their secret messages through history? What has changed today?',
},
- 'playlist_mincount': 3,
- }]
-
- def _real_extract(self, url):
- m = re.match(self._VALID_URL, url)
- video_id = m.group('id')
+ 'playlist_mincount': 31,
+ }
- if m.group('key') == 'video':
- data = self._download_json(
- 'http://api.khanacademy.org/api/v1/videos/' + video_id,
- video_id, 'Downloading video info')
+ def _parse_component_props(self, component_props):
+ curation = component_props['curation']
- upload_date = unified_strdate(data['date_added'])
- uploader = ', '.join(data['author_names'])
- return {
- '_type': 'url_transparent',
- 'url': data['url'],
- 'id': video_id,
- 'title': data['title'],
- 'thumbnail': data['image_url'],
- 'duration': data['duration'],
- 'description': data['description'],
- 'uploader': uploader,
- 'upload_date': upload_date,
+ entries = []
+ tutorials = try_get(curation, lambda x: x['tabs'][0]['modules'][0]['tutorials'], list) or []
+ for tutorial_number, tutorial in enumerate(tutorials, 1):
+ chapter_info = {
+ 'chapter': tutorial.get('title'),
+ 'chapter_number': tutorial_number,
+ 'chapter_id': tutorial.get('id'),
}
- else:
- # topic
- data = self._download_json(
- 'http://api.khanacademy.org/api/v1/topic/' + video_id,
- video_id, 'Downloading topic info')
+ for content_item in (tutorial.get('contentItems') or []):
+ if content_item.get('kind') == 'Video':
+ info = self._parse_video(content_item)
+ info.update(chapter_info)
+ entries.append(info)
- entries = [
- {
- '_type': 'url',
- 'url': c['url'],
- 'id': c['id'],
- 'title': c['title'],
- }
- for c in data['children'] if c['kind'] in ('Video', 'Topic')]
-
- return {
- '_type': 'playlist',
- 'id': video_id,
- 'title': data['title'],
- 'description': data['description'],
- 'entries': entries,
- }
+ return self.playlist_result(
+ entries, curation.get('unit'), curation.get('title'),
+ curation.get('description'))
diff --git a/youtube_dl/extractor/lbry.py b/youtube_dl/extractor/lbry.py
index 41cc245eb..cfd6b8393 100644
--- a/youtube_dl/extractor/lbry.py
+++ b/youtube_dl/extractor/lbry.py
@@ -5,7 +5,12 @@ import functools
import json
from .common import InfoExtractor
-from ..compat import compat_str
+from ..compat import (
+ compat_parse_qs,
+ compat_str,
+ compat_urllib_parse_unquote,
+ compat_urllib_parse_urlparse,
+)
from ..utils import (
determine_ext,
ExtractorError,
@@ -57,6 +62,7 @@ class LBRYBaseIE(InfoExtractor):
'description': stream_value.get('description'),
'license': stream_value.get('license'),
'timestamp': int_or_none(stream.get('timestamp')),
+ 'release_timestamp': int_or_none(stream_value.get('release_time')),
'tags': stream_value.get('tags'),
'duration': int_or_none(media.get('duration')),
'channel': try_get(signing_channel, lambda x: x['value']['title']),
@@ -89,6 +95,8 @@ class LBRYIE(LBRYBaseIE):
'description': 'md5:f6cb5c704b332d37f5119313c2c98f51',
'timestamp': 1595694354,
'upload_date': '20200725',
+ 'release_timestamp': 1595340697,
+ 'release_date': '20200721',
'width': 1280,
'height': 720,
}
@@ -103,6 +111,8 @@ class LBRYIE(LBRYBaseIE):
'description': 'md5:661ac4f1db09f31728931d7b88807a61',
'timestamp': 1591312601,
'upload_date': '20200604',
+ 'release_timestamp': 1591312421,
+ 'release_date': '20200604',
'tags': list,
'duration': 2570,
'channel': 'The LBRY Foundation',
@@ -111,6 +121,26 @@ class LBRYIE(LBRYBaseIE):
'vcodec': 'none',
}
}, {
+ # HLS
+ 'url': 'https://odysee.com/@gardeningincanada:b/plants-i-will-never-grow-again.-the:e',
+ 'md5': 'fc82f45ea54915b1495dd7cb5cc1289f',
+ 'info_dict': {
+ 'id': 'e51671357333fe22ae88aad320bde2f6f96b1410',
+ 'ext': 'mp4',
+ 'title': 'PLANTS I WILL NEVER GROW AGAIN. THE BLACK LIST PLANTS FOR A CANADIAN GARDEN | Gardening in Canada 🍁',
+ 'description': 'md5:9c539c6a03fb843956de61a4d5288d5e',
+ 'timestamp': 1618254123,
+ 'upload_date': '20210412',
+ 'release_timestamp': 1618254002,
+ 'release_date': '20210412',
+ 'tags': list,
+ 'duration': 554,
+ 'channel': 'Gardening In Canada',
+ 'channel_id': 'b8be0e93b423dad221abe29545fbe8ec36e806bc',
+ 'channel_url': 'https://odysee.com/@gardeningincanada:b8be0e93b423dad221abe29545fbe8ec36e806bc',
+ 'formats': 'mincount:3',
+ }
+ }, {
'url': 'https://odysee.com/@BrodieRobertson:5/apple-is-tracking-everything-you-do-on:e',
'only_matching': True,
}, {
@@ -131,6 +161,9 @@ class LBRYIE(LBRYBaseIE):
}, {
'url': 'https://lbry.tv/$/download/Episode-1/e7d93d772bd87e2b62d5ab993c1c3ced86ebb396',
'only_matching': True,
+ }, {
+ 'url': 'https://lbry.tv/@lacajadepandora:a/TRUMP-EST%C3%81-BIEN-PUESTO-con-Pilar-Baselga,-Carlos-Senra,-Luis-Palacios-(720p_30fps_H264-192kbit_AAC):1',
+ 'only_matching': True,
}]
def _real_extract(self, url):
@@ -139,6 +172,7 @@ class LBRYIE(LBRYBaseIE):
display_id = display_id.split('/', 2)[-1].replace('/', ':')
else:
display_id = display_id.replace(':', '#')
+ display_id = compat_urllib_parse_unquote(display_id)
uri = 'lbry://' + display_id
result = self._resolve_url(uri, display_id, 'stream')
result_value = result['value']
@@ -149,10 +183,18 @@ class LBRYIE(LBRYBaseIE):
streaming_url = self._call_api_proxy(
'get', claim_id, {'uri': uri}, 'streaming url')['streaming_url']
info = self._parse_stream(result, url)
+ urlh = self._request_webpage(
+ streaming_url, display_id, note='Downloading streaming redirect url info')
+ if determine_ext(urlh.geturl()) == 'm3u8':
+ info['formats'] = self._extract_m3u8_formats(
+ urlh.geturl(), display_id, 'mp4', entry_protocol='m3u8_native',
+ m3u8_id='hls')
+ self._sort_formats(info['formats'])
+ else:
+ info['url'] = streaming_url
info.update({
'id': claim_id,
'title': title,
- 'url': streaming_url,
})
return info
@@ -174,17 +216,18 @@ class LBRYChannelIE(LBRYBaseIE):
}]
_PAGE_SIZE = 50
- def _fetch_page(self, claim_id, url, page):
+ def _fetch_page(self, claim_id, url, params, page):
page += 1
+ page_params = {
+ 'channel_ids': [claim_id],
+ 'claim_type': 'stream',
+ 'no_totals': True,
+ 'page': page,
+ 'page_size': self._PAGE_SIZE,
+ }
+ page_params.update(params)
result = self._call_api_proxy(
- 'claim_search', claim_id, {
- 'channel_ids': [claim_id],
- 'claim_type': 'stream',
- 'no_totals': True,
- 'page': page,
- 'page_size': self._PAGE_SIZE,
- 'stream_types': self._SUPPORTED_STREAM_TYPES,
- }, 'page %d' % page)
+ 'claim_search', claim_id, page_params, 'page %d' % page)
for item in (result.get('items') or []):
stream_claim_name = item.get('name')
stream_claim_id = item.get('claim_id')
@@ -205,8 +248,31 @@ class LBRYChannelIE(LBRYBaseIE):
result = self._resolve_url(
'lbry://' + display_id, display_id, 'channel')
claim_id = result['claim_id']
+ qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
+ content = qs.get('content', [None])[0]
+ params = {
+ 'fee_amount': qs.get('fee_amount', ['>=0'])[0],
+ 'order_by': {
+ 'new': ['release_time'],
+ 'top': ['effective_amount'],
+ 'trending': ['trending_group', 'trending_mixed'],
+ }[qs.get('order', ['new'])[0]],
+ 'stream_types': [content] if content in ['audio', 'video'] else self._SUPPORTED_STREAM_TYPES,
+ }
+ duration = qs.get('duration', [None])[0]
+ if duration:
+ params['duration'] = {
+ 'long': '>=1200',
+ 'short': '<=240',
+ }[duration]
+ language = qs.get('language', ['all'])[0]
+ if language != 'all':
+ languages = [language]
+ if language == 'en':
+ languages.append('none')
+ params['any_languages'] = languages
entries = OnDemandPagedList(
- functools.partial(self._fetch_page, claim_id, url),
+ functools.partial(self._fetch_page, claim_id, url, params),
self._PAGE_SIZE)
result_value = result.get('value') or {}
return self.playlist_result(
diff --git a/youtube_dl/extractor/line.py b/youtube_dl/extractor/line.py
index 7f5fa446e..2526daa77 100644
--- a/youtube_dl/extractor/line.py
+++ b/youtube_dl/extractor/line.py
@@ -4,7 +4,13 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import js_to_json
+from ..compat import compat_str
+from ..utils import (
+ ExtractorError,
+ int_or_none,
+ js_to_json,
+ str_or_none,
+)
class LineTVIE(InfoExtractor):
@@ -88,3 +94,137 @@ class LineTVIE(InfoExtractor):
for thumbnail in video_info.get('thumbnails', {}).get('list', [])],
'view_count': video_info.get('meta', {}).get('count'),
}
+
+
+class LineLiveBaseIE(InfoExtractor):
+ _API_BASE_URL = 'https://live-api.line-apps.com/web/v4.0/channel/'
+
+ def _parse_broadcast_item(self, item):
+ broadcast_id = compat_str(item['id'])
+ title = item['title']
+ is_live = item.get('isBroadcastingNow')
+
+ thumbnails = []
+ for thumbnail_id, thumbnail_url in (item.get('thumbnailURLs') or {}).items():
+ if not thumbnail_url:
+ continue
+ thumbnails.append({
+ 'id': thumbnail_id,
+ 'url': thumbnail_url,
+ })
+
+ channel = item.get('channel') or {}
+ channel_id = str_or_none(channel.get('id'))
+
+ return {
+ 'id': broadcast_id,
+ 'title': self._live_title(title) if is_live else title,
+ 'thumbnails': thumbnails,
+ 'timestamp': int_or_none(item.get('createdAt')),
+ 'channel': channel.get('name'),
+ 'channel_id': channel_id,
+ 'channel_url': 'https://live.line.me/channels/' + channel_id if channel_id else None,
+ 'duration': int_or_none(item.get('archiveDuration')),
+ 'view_count': int_or_none(item.get('viewerCount')),
+ 'comment_count': int_or_none(item.get('chatCount')),
+ 'is_live': is_live,
+ }
+
+
+class LineLiveIE(LineLiveBaseIE):
+ _VALID_URL = r'https?://live\.line\.me/channels/(?P<channel_id>\d+)/broadcast/(?P<id>\d+)'
+ _TESTS = [{
+ 'url': 'https://live.line.me/channels/4867368/broadcast/16331360',
+ 'md5': 'bc931f26bf1d4f971e3b0982b3fab4a3',
+ 'info_dict': {
+ 'id': '16331360',
+ 'title': '振りコピ講座😙😙😙',
+ 'ext': 'mp4',
+ 'timestamp': 1617095132,
+ 'upload_date': '20210330',
+ 'channel': '白川ゆめか',
+ 'channel_id': '4867368',
+ 'view_count': int,
+ 'comment_count': int,
+ 'is_live': False,
+ }
+ }, {
+ # archiveStatus == 'DELETED'
+ 'url': 'https://live.line.me/channels/4778159/broadcast/16378488',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ channel_id, broadcast_id = re.match(self._VALID_URL, url).groups()
+ broadcast = self._download_json(
+ self._API_BASE_URL + '%s/broadcast/%s' % (channel_id, broadcast_id),
+ broadcast_id)
+ item = broadcast['item']
+ info = self._parse_broadcast_item(item)
+ protocol = 'm3u8' if info['is_live'] else 'm3u8_native'
+ formats = []
+ for k, v in (broadcast.get(('live' if info['is_live'] else 'archived') + 'HLSURLs') or {}).items():
+ if not v:
+ continue
+ if k == 'abr':
+ formats.extend(self._extract_m3u8_formats(
+ v, broadcast_id, 'mp4', protocol,
+ m3u8_id='hls', fatal=False))
+ continue
+ f = {
+ 'ext': 'mp4',
+ 'format_id': 'hls-' + k,
+ 'protocol': protocol,
+ 'url': v,
+ }
+ if not k.isdigit():
+ f['vcodec'] = 'none'
+ formats.append(f)
+ if not formats:
+ archive_status = item.get('archiveStatus')
+ if archive_status != 'ARCHIVED':
+ raise ExtractorError('this video has been ' + archive_status.lower(), expected=True)
+ self._sort_formats(formats)
+ info['formats'] = formats
+ return info
+
+
+class LineLiveChannelIE(LineLiveBaseIE):
+ _VALID_URL = r'https?://live\.line\.me/channels/(?P<id>\d+)(?!/broadcast/\d+)(?:[/?&#]|$)'
+ _TEST = {
+ 'url': 'https://live.line.me/channels/5893542',
+ 'info_dict': {
+ 'id': '5893542',
+ 'title': 'いくらちゃん',
+ 'description': 'md5:c3a4af801f43b2fac0b02294976580be',
+ },
+ 'playlist_mincount': 29
+ }
+
+ def _archived_broadcasts_entries(self, archived_broadcasts, channel_id):
+ while True:
+ for row in (archived_broadcasts.get('rows') or []):
+ share_url = str_or_none(row.get('shareURL'))
+ if not share_url:
+ continue
+ info = self._parse_broadcast_item(row)
+ info.update({
+ '_type': 'url',
+ 'url': share_url,
+ 'ie_key': LineLiveIE.ie_key(),
+ })
+ yield info
+ if not archived_broadcasts.get('hasNextPage'):
+ return
+ archived_broadcasts = self._download_json(
+ self._API_BASE_URL + channel_id + '/archived_broadcasts',
+ channel_id, query={
+ 'lastId': info['id'],
+ })
+
+ def _real_extract(self, url):
+ channel_id = self._match_id(url)
+ channel = self._download_json(self._API_BASE_URL + channel_id, channel_id)
+ return self.playlist_result(
+ self._archived_broadcasts_entries(channel.get('archivedBroadcasts') or {}, channel_id),
+ channel_id, channel.get('title'), channel.get('information'))
diff --git a/youtube_dl/extractor/liveleak.py b/youtube_dl/extractor/liveleak.py
deleted file mode 100644
index 4ac437c8b..000000000
--- a/youtube_dl/extractor/liveleak.py
+++ /dev/null
@@ -1,191 +0,0 @@
-from __future__ import unicode_literals
-
-import re
-
-from .common import InfoExtractor
-from ..utils import int_or_none
-
-
-class LiveLeakIE(InfoExtractor):
- _VALID_URL = r'https?://(?:\w+\.)?liveleak\.com/view\?.*?\b[it]=(?P<id>[\w_]+)'
- _TESTS = [{
- 'url': 'http://www.liveleak.com/view?i=757_1364311680',
- 'md5': '0813c2430bea7a46bf13acf3406992f4',
- 'info_dict': {
- 'id': '757_1364311680',
- 'ext': 'mp4',
- 'description': 'extremely bad day for this guy..!',
- 'uploader': 'ljfriel2',
- 'title': 'Most unlucky car accident',
- 'thumbnail': r're:^https?://.*\.jpg$'
- }
- }, {
- 'url': 'http://www.liveleak.com/view?i=f93_1390833151',
- 'md5': 'd3f1367d14cc3c15bf24fbfbe04b9abf',
- 'info_dict': {
- 'id': 'f93_1390833151',
- 'ext': 'mp4',
- 'description': 'German Television Channel NDR does an exclusive interview with Edward Snowden.\r\nUploaded on LiveLeak cause German Television thinks the rest of the world isn\'t intereseted in Edward Snowden.',
- 'uploader': 'ARD_Stinkt',
- 'title': 'German Television does first Edward Snowden Interview (ENGLISH)',
- 'thumbnail': r're:^https?://.*\.jpg$'
- }
- }, {
- # Prochan embed
- 'url': 'http://www.liveleak.com/view?i=4f7_1392687779',
- 'md5': '42c6d97d54f1db107958760788c5f48f',
- 'info_dict': {
- 'id': '4f7_1392687779',
- 'ext': 'mp4',
- 'description': "The guy with the cigarette seems amazingly nonchalant about the whole thing... I really hope my friends' reactions would be a bit stronger.\r\n\r\nAction-go to 0:55.",
- 'uploader': 'CapObveus',
- 'title': 'Man is Fatally Struck by Reckless Car While Packing up a Moving Truck',
- 'age_limit': 18,
- },
- 'skip': 'Video is dead',
- }, {
- # Covers https://github.com/ytdl-org/youtube-dl/pull/5983
- # Multiple resolutions
- 'url': 'http://www.liveleak.com/view?i=801_1409392012',
- 'md5': 'c3a449dbaca5c0d1825caecd52a57d7b',
- 'info_dict': {
- 'id': '801_1409392012',
- 'ext': 'mp4',
- 'description': 'Happened on 27.7.2014. \r\nAt 0:53 you can see people still swimming at near beach.',
- 'uploader': 'bony333',
- 'title': 'Crazy Hungarian tourist films close call waterspout in Croatia',
- 'thumbnail': r're:^https?://.*\.jpg$'
- }
- }, {
- # Covers https://github.com/ytdl-org/youtube-dl/pull/10664#issuecomment-247439521
- 'url': 'http://m.liveleak.com/view?i=763_1473349649',
- 'add_ie': ['Youtube'],
- 'info_dict': {
- 'id': '763_1473349649',
- 'ext': 'mp4',
- 'title': 'Reporters and public officials ignore epidemic of black on asian violence in Sacramento | Colin Flaherty',
- 'description': 'Colin being the warrior he is and showing the injustice Asians in Sacramento are being subjected to.',
- 'uploader': 'Ziz',
- 'upload_date': '20160908',
- 'uploader_id': 'UCEbta5E_jqlZmEJsriTEtnw'
- },
- 'params': {
- 'skip_download': True,
- },
- }, {
- 'url': 'https://www.liveleak.com/view?i=677_1439397581',
- 'info_dict': {
- 'id': '677_1439397581',
- 'title': 'Fuel Depot in China Explosion caught on video',
- },
- 'playlist_count': 3,
- }, {
- 'url': 'https://www.liveleak.com/view?t=HvHi_1523016227',
- 'only_matching': True,
- }, {
- # No original video
- 'url': 'https://www.liveleak.com/view?t=C26ZZ_1558612804',
- 'only_matching': True,
- }]
-
- @staticmethod
- def _extract_urls(webpage):
- return re.findall(
- r'<iframe[^>]+src="(https?://(?:\w+\.)?liveleak\.com/ll_embed\?[^"]*[ift]=[\w_]+[^"]+)"',
- webpage)
-
- def _real_extract(self, url):
- video_id = self._match_id(url)
- webpage = self._download_webpage(url, video_id)
-
- video_title = self._og_search_title(webpage).replace('LiveLeak.com -', '').strip()
- video_description = self._og_search_description(webpage)
- video_uploader = self._html_search_regex(
- r'By:.*?(\w+)</a>', webpage, 'uploader', fatal=False)
- age_limit = int_or_none(self._search_regex(
- r'you confirm that you are ([0-9]+) years and over.',
- webpage, 'age limit', default=None))
- video_thumbnail = self._og_search_thumbnail(webpage)
-
- entries = self._parse_html5_media_entries(url, webpage, video_id)
- if not entries:
- # Maybe an embed?
- embed_url = self._search_regex(
- r'<iframe[^>]+src="((?:https?:)?//(?:www\.)?(?:prochan|youtube)\.com/embed[^"]+)"',
- webpage, 'embed URL')
- return {
- '_type': 'url_transparent',
- 'url': embed_url,
- 'id': video_id,
- 'title': video_title,
- 'description': video_description,
- 'uploader': video_uploader,
- 'age_limit': age_limit,
- }
-
- for idx, info_dict in enumerate(entries):
- formats = []
- for a_format in info_dict['formats']:
- if not a_format.get('height'):
- a_format['height'] = int_or_none(self._search_regex(
- r'([0-9]+)p\.mp4', a_format['url'], 'height label',
- default=None))
- formats.append(a_format)
-
- # Removing '.*.mp4' gives the raw video, which is essentially
- # the same video without the LiveLeak logo at the top (see
- # https://github.com/ytdl-org/youtube-dl/pull/4768)
- orig_url = re.sub(r'\.mp4\.[^.]+', '', a_format['url'])
- if a_format['url'] != orig_url:
- format_id = a_format.get('format_id')
- format_id = 'original' + ('-' + format_id if format_id else '')
- if self._is_valid_url(orig_url, video_id, format_id):
- formats.append({
- 'format_id': format_id,
- 'url': orig_url,
- 'preference': 1,
- })
- self._sort_formats(formats)
- info_dict['formats'] = formats
-
- # Don't append entry ID for one-video pages to keep backward compatibility
- if len(entries) > 1:
- info_dict['id'] = '%s_%s' % (video_id, idx + 1)
- else:
- info_dict['id'] = video_id
-
- info_dict.update({
- 'title': video_title,
- 'description': video_description,
- 'uploader': video_uploader,
- 'age_limit': age_limit,
- 'thumbnail': video_thumbnail,
- })
-
- return self.playlist_result(entries, video_id, video_title)
-
-
-class LiveLeakEmbedIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?liveleak\.com/ll_embed\?.*?\b(?P<kind>[ift])=(?P<id>[\w_]+)'
-
- # See generic.py for actual test cases
- _TESTS = [{
- 'url': 'https://www.liveleak.com/ll_embed?i=874_1459135191',
- 'only_matching': True,
- }, {
- 'url': 'https://www.liveleak.com/ll_embed?f=ab065df993c1',
- 'only_matching': True,
- }]
-
- def _real_extract(self, url):
- kind, video_id = re.match(self._VALID_URL, url).groups()
-
- if kind == 'f':
- webpage = self._download_webpage(url, video_id)
- liveleak_url = self._search_regex(
- r'(?:logourl\s*:\s*|window\.open\()(?P<q1>[\'"])(?P<url>%s)(?P=q1)' % LiveLeakIE._VALID_URL,
- webpage, 'LiveLeak URL', group='url')
- else:
- liveleak_url = 'http://www.liveleak.com/view?%s=%s' % (kind, video_id)
-
- return self.url_result(liveleak_url, ie=LiveLeakIE.ie_key())
diff --git a/youtube_dl/extractor/maoritv.py b/youtube_dl/extractor/maoritv.py
new file mode 100644
index 000000000..0d23fec75
--- /dev/null
+++ b/youtube_dl/extractor/maoritv.py
@@ -0,0 +1,31 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class MaoriTVIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?maoritelevision\.com/shows/(?:[^/]+/)+(?P<id>[^/?&#]+)'
+ _TEST = {
+ 'url': 'https://www.maoritelevision.com/shows/korero-mai/S01E054/korero-mai-series-1-episode-54',
+ 'md5': '5ade8ef53851b6a132c051b1cd858899',
+ 'info_dict': {
+ 'id': '4774724855001',
+ 'ext': 'mp4',
+ 'title': 'Kōrero Mai, Series 1 Episode 54',
+ 'upload_date': '20160226',
+ 'timestamp': 1456455018,
+ 'description': 'md5:59bde32fd066d637a1a55794c56d8dcb',
+ 'uploader_id': '1614493167001',
+ },
+ }
+ BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1614493167001/HJlhIQhQf_default/index.html?videoId=%s'
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
+ brightcove_id = self._search_regex(
+ r'data-main-video-id=["\'](\d+)', webpage, 'brightcove id')
+ return self.url_result(
+ self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id,
+ 'BrightcoveNew', brightcove_id)
diff --git a/youtube_dl/extractor/medaltv.py b/youtube_dl/extractor/medaltv.py
index 1603b55f6..67bb4debb 100644
--- a/youtube_dl/extractor/medaltv.py
+++ b/youtube_dl/extractor/medaltv.py
@@ -15,33 +15,39 @@ from ..utils import (
class MedalTVIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?medal\.tv/clips/(?P<id>[0-9]+)'
+ _VALID_URL = r'https?://(?:www\.)?medal\.tv/clips/(?P<id>[^/?#&]+)'
_TESTS = [{
- 'url': 'https://medal.tv/clips/34934644/3Is9zyGMoBMr',
+ 'url': 'https://medal.tv/clips/2mA60jWAGQCBH',
'md5': '7b07b064331b1cf9e8e5c52a06ae68fa',
'info_dict': {
- 'id': '34934644',
+ 'id': '2mA60jWAGQCBH',
'ext': 'mp4',
'title': 'Quad Cold',
'description': 'Medal,https://medal.tv/desktop/',
'uploader': 'MowgliSB',
'timestamp': 1603165266,
'upload_date': '20201020',
- 'uploader_id': 10619174,
+ 'uploader_id': '10619174',
}
}, {
- 'url': 'https://medal.tv/clips/36787208',
+ 'url': 'https://medal.tv/clips/2um24TWdty0NA',
'md5': 'b6dc76b78195fff0b4f8bf4a33ec2148',
'info_dict': {
- 'id': '36787208',
+ 'id': '2um24TWdty0NA',
'ext': 'mp4',
'title': 'u tk me i tk u bigger',
'description': 'Medal,https://medal.tv/desktop/',
'uploader': 'Mimicc',
'timestamp': 1605580939,
'upload_date': '20201117',
- 'uploader_id': 5156321,
+ 'uploader_id': '5156321',
}
+ }, {
+ 'url': 'https://medal.tv/clips/37rMeFpryCC-9',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://medal.tv/clips/2WRj40tpY_EU9',
+ 'only_matching': True,
}]
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/medialaan.py b/youtube_dl/extractor/medialaan.py
index 50d5db802..788acf7fb 100644
--- a/youtube_dl/extractor/medialaan.py
+++ b/youtube_dl/extractor/medialaan.py
@@ -2,268 +2,113 @@ from __future__ import unicode_literals
import re
-from .gigya import GigyaBaseIE
-
-from ..compat import compat_str
+from .common import InfoExtractor
from ..utils import (
+ extract_attributes,
int_or_none,
- parse_duration,
- try_get,
- unified_timestamp,
+ mimetype2ext,
+ parse_iso8601,
)
-class MedialaanIE(GigyaBaseIE):
+class MedialaanIE(InfoExtractor):
_VALID_URL = r'''(?x)
https?://
- (?:www\.|nieuws\.)?
(?:
- (?P<site_id>vtm|q2|vtmkzoom)\.be/
- (?:
- video(?:/[^/]+/id/|/?\?.*?\baid=)|
- (?:[^/]+/)*
- )
+ (?:embed\.)?mychannels\.video/embed/|
+ embed\.mychannels\.video/(?:s(?:dk|cript)/)?production/|
+ (?:www\.)?(?:
+ (?:
+ 7sur7|
+ demorgen|
+ hln|
+ joe|
+ qmusic
+ )\.be|
+ (?:
+ [abe]d|
+ bndestem|
+ destentor|
+ gelderlander|
+ pzc|
+ tubantia|
+ volkskrant
+ )\.nl
+ )/video/(?:[^/]+/)*[^/?&#]+~p
)
- (?P<id>[^/?#&]+)
+ (?P<id>\d+)
'''
- _NETRC_MACHINE = 'medialaan'
- _APIKEY = '3_HZ0FtkMW_gOyKlqQzW5_0FHRC7Nd5XpXJZcDdXY4pk5eES2ZWmejRW5egwVm4ug-'
- _SITE_TO_APP_ID = {
- 'vtm': 'vtm_watch',
- 'q2': 'q2',
- 'vtmkzoom': 'vtmkzoom',
- }
_TESTS = [{
- # vod
- 'url': 'http://vtm.be/video/volledige-afleveringen/id/vtm_20170219_VM0678361_vtmwatch',
+ 'url': 'https://www.bndestem.nl/video/de-terugkeer-van-ally-de-aap-en-wie-vertrekt-er-nog-bij-nac~p193993',
'info_dict': {
- 'id': 'vtm_20170219_VM0678361_vtmwatch',
+ 'id': '193993',
'ext': 'mp4',
- 'title': 'Allemaal Chris afl. 6',
- 'description': 'md5:4be86427521e7b07e0adb0c9c554ddb2',
- 'timestamp': 1487533280,
- 'upload_date': '20170219',
- 'duration': 2562,
- 'series': 'Allemaal Chris',
- 'season': 'Allemaal Chris',
- 'season_number': 1,
- 'season_id': '256936078124527',
- 'episode': 'Allemaal Chris afl. 6',
- 'episode_number': 6,
- 'episode_id': '256936078591527',
+ 'title': 'De terugkeer van Ally de Aap en wie vertrekt er nog bij NAC?',
+ 'timestamp': 1611663540,
+ 'upload_date': '20210126',
+ 'duration': 238,
},
'params': {
'skip_download': True,
},
- 'skip': 'Requires account credentials',
- }, {
- # clip
- 'url': 'http://vtm.be/video?aid=168332',
- 'info_dict': {
- 'id': '168332',
- 'ext': 'mp4',
- 'title': '"Veronique liegt!"',
- 'description': 'md5:1385e2b743923afe54ba4adc38476155',
- 'timestamp': 1489002029,
- 'upload_date': '20170308',
- 'duration': 96,
- },
}, {
- # vod
- 'url': 'http://vtm.be/video/volledige-afleveringen/id/257107153551000',
+ 'url': 'https://www.gelderlander.nl/video/kanalen/degelderlander~c320/series/snel-nieuws~s984/noodbevel-in-doetinchem-politie-stuurt-mensen-centrum-uit~p194093',
'only_matching': True,
}, {
- # vod
- 'url': 'http://vtm.be/video?aid=163157',
+ 'url': 'https://embed.mychannels.video/sdk/production/193993?options=TFTFF_default',
'only_matching': True,
}, {
- # vod
- 'url': 'http://www.q2.be/video/volledige-afleveringen/id/2be_20170301_VM0684442_q2',
+ 'url': 'https://embed.mychannels.video/script/production/193993',
'only_matching': True,
}, {
- # clip
- 'url': 'http://vtmkzoom.be/k3-dansstudio/een-nieuw-seizoen-van-k3-dansstudio',
+ 'url': 'https://embed.mychannels.video/production/193993',
'only_matching': True,
}, {
- # http/s redirect
- 'url': 'https://vtmkzoom.be/video?aid=45724',
- 'info_dict': {
- 'id': '257136373657000',
- 'ext': 'mp4',
- 'title': 'K3 Dansstudio Ushuaia afl.6',
- },
- 'params': {
- 'skip_download': True,
- },
- 'skip': 'Requires account credentials',
+ 'url': 'https://mychannels.video/embed/193993',
+ 'only_matching': True,
}, {
- # nieuws.vtm.be
- 'url': 'https://nieuws.vtm.be/stadion/stadion/genk-nog-moeilijk-programma',
+ 'url': 'https://embed.mychannels.video/embed/193993',
'only_matching': True,
}]
- def _real_initialize(self):
- self._logged_in = False
-
- def _login(self):
- username, password = self._get_login_info()
- if username is None:
- self.raise_login_required()
-
- auth_data = {
- 'APIKey': self._APIKEY,
- 'sdk': 'js_6.1',
- 'format': 'json',
- 'loginID': username,
- 'password': password,
- }
-
- auth_info = self._gigya_login(auth_data)
-
- self._uid = auth_info['UID']
- self._uid_signature = auth_info['UIDSignature']
- self._signature_timestamp = auth_info['signatureTimestamp']
-
- self._logged_in = True
+ @staticmethod
+ def _extract_urls(webpage):
+ entries = []
+ for element in re.findall(r'(<div[^>]+data-mychannels-type="video"[^>]*>)', webpage):
+ mychannels_id = extract_attributes(element).get('data-mychannels-id')
+ if mychannels_id:
+ entries.append('https://mychannels.video/embed/' + mychannels_id)
+ return entries
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id, site_id = mobj.group('id', 'site_id')
-
- webpage = self._download_webpage(url, video_id)
-
- config = self._parse_json(
- self._search_regex(
- r'videoJSConfig\s*=\s*JSON\.parse\(\'({.+?})\'\);',
- webpage, 'config', default='{}'), video_id,
- transform_source=lambda s: s.replace(
- '\\\\', '\\').replace(r'\"', '"').replace(r"\'", "'"))
-
- vod_id = config.get('vodId') or self._search_regex(
- (r'\\"vodId\\"\s*:\s*\\"(.+?)\\"',
- r'"vodId"\s*:\s*"(.+?)"',
- r'<[^>]+id=["\']vod-(\d+)'),
- webpage, 'video_id', default=None)
-
- # clip, no authentication required
- if not vod_id:
- player = self._parse_json(
- self._search_regex(
- r'vmmaplayer\(({.+?})\);', webpage, 'vmma player',
- default=''),
- video_id, transform_source=lambda s: '[%s]' % s, fatal=False)
- if player:
- video = player[-1]
- if video['videoUrl'] in ('http', 'https'):
- return self.url_result(video['url'], MedialaanIE.ie_key())
- info = {
- 'id': video_id,
- 'url': video['videoUrl'],
- 'title': video['title'],
- 'thumbnail': video.get('imageUrl'),
- 'timestamp': int_or_none(video.get('createdDate')),
- 'duration': int_or_none(video.get('duration')),
- }
+ production_id = self._match_id(url)
+ production = self._download_json(
+ 'https://embed.mychannels.video/sdk/production/' + production_id,
+ production_id, query={'options': 'UUUU_default'})['productions'][0]
+ title = production['title']
+
+ formats = []
+ for source in (production.get('sources') or []):
+ src = source.get('src')
+ if not src:
+ continue
+ ext = mimetype2ext(source.get('type'))
+ if ext == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ src, production_id, 'mp4', 'm3u8_native',
+ m3u8_id='hls', fatal=False))
else:
- info = self._parse_html5_media_entries(
- url, webpage, video_id, m3u8_id='hls')[0]
- info.update({
- 'id': video_id,
- 'title': self._html_search_meta('description', webpage),
- 'duration': parse_duration(self._html_search_meta('duration', webpage)),
- })
- # vod, authentication required
- else:
- if not self._logged_in:
- self._login()
-
- settings = self._parse_json(
- self._search_regex(
- r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);',
- webpage, 'drupal settings', default='{}'),
- video_id)
-
- def get(container, item):
- return try_get(
- settings, lambda x: x[container][item],
- compat_str) or self._search_regex(
- r'"%s"\s*:\s*"([^"]+)' % item, webpage, item,
- default=None)
-
- app_id = get('vod', 'app_id') or self._SITE_TO_APP_ID.get(site_id, 'vtm_watch')
- sso = get('vod', 'gigyaDatabase') or 'vtm-sso'
-
- data = self._download_json(
- 'http://vod.medialaan.io/api/1.0/item/%s/video' % vod_id,
- video_id, query={
- 'app_id': app_id,
- 'user_network': sso,
- 'UID': self._uid,
- 'UIDSignature': self._uid_signature,
- 'signatureTimestamp': self._signature_timestamp,
+ formats.append({
+ 'ext': ext,
+ 'url': src,
})
-
- formats = self._extract_m3u8_formats(
- data['response']['uri'], video_id, entry_protocol='m3u8_native',
- ext='mp4', m3u8_id='hls')
-
- self._sort_formats(formats)
-
- info = {
- 'id': vod_id,
- 'formats': formats,
- }
-
- api_key = get('vod', 'apiKey')
- channel = get('medialaanGigya', 'channel')
-
- if api_key:
- videos = self._download_json(
- 'http://vod.medialaan.io/vod/v2/videos', video_id, fatal=False,
- query={
- 'channels': channel,
- 'ids': vod_id,
- 'limit': 1,
- 'apikey': api_key,
- })
- if videos:
- video = try_get(
- videos, lambda x: x['response']['videos'][0], dict)
- if video:
- def get(container, item, expected_type=None):
- return try_get(
- video, lambda x: x[container][item], expected_type)
-
- def get_string(container, item):
- return get(container, item, compat_str)
-
- info.update({
- 'series': get_string('program', 'title'),
- 'season': get_string('season', 'title'),
- 'season_number': int_or_none(get('season', 'number')),
- 'season_id': get_string('season', 'id'),
- 'episode': get_string('episode', 'title'),
- 'episode_number': int_or_none(get('episode', 'number')),
- 'episode_id': get_string('episode', 'id'),
- 'duration': int_or_none(
- video.get('duration')) or int_or_none(
- video.get('durationMillis'), scale=1000),
- 'title': get_string('episode', 'title'),
- 'description': get_string('episode', 'text'),
- 'timestamp': unified_timestamp(get_string(
- 'publication', 'begin')),
- })
-
- if not info.get('title'):
- info['title'] = try_get(
- config, lambda x: x['videoConfig']['title'],
- compat_str) or self._html_search_regex(
- r'\\"title\\"\s*:\s*\\"(.+?)\\"', webpage, 'title',
- default=None) or self._og_search_title(webpage)
-
- if not info.get('description'):
- info['description'] = self._html_search_regex(
- r'<div[^>]+class="field-item\s+even">\s*<p>(.+?)</p>',
- webpage, 'description', default=None)
-
- return info
+ self._sort_formats(formats)
+
+ return {
+ 'id': production_id,
+ 'title': title,
+ 'formats': formats,
+ 'thumbnail': production.get('posterUrl'),
+ 'timestamp': parse_iso8601(production.get('publicationDate'), ' '),
+ 'duration': int_or_none(production.get('duration')) or None,
+ }
diff --git a/youtube_dl/extractor/minds.py b/youtube_dl/extractor/minds.py
new file mode 100644
index 000000000..8e9f0f825
--- /dev/null
+++ b/youtube_dl/extractor/minds.py
@@ -0,0 +1,196 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+ clean_html,
+ int_or_none,
+ str_or_none,
+ strip_or_none,
+)
+
+
+class MindsBaseIE(InfoExtractor):
+ _VALID_URL_BASE = r'https?://(?:www\.)?minds\.com/'
+
+ def _call_api(self, path, video_id, resource, query=None):
+ api_url = 'https://www.minds.com/api/' + path
+ token = self._get_cookies(api_url).get('XSRF-TOKEN')
+ return self._download_json(
+ api_url, video_id, 'Downloading %s JSON metadata' % resource, headers={
+ 'Referer': 'https://www.minds.com/',
+ 'X-XSRF-TOKEN': token.value if token else '',
+ }, query=query)
+
+
+class MindsIE(MindsBaseIE):
+ IE_NAME = 'minds'
+ _VALID_URL = MindsBaseIE._VALID_URL_BASE + r'(?:media|newsfeed|archive/view)/(?P<id>[0-9]+)'
+ _TESTS = [{
+ 'url': 'https://www.minds.com/media/100000000000086822',
+ 'md5': '215a658184a419764852239d4970b045',
+ 'info_dict': {
+ 'id': '100000000000086822',
+ 'ext': 'mp4',
+ 'title': 'Minds intro sequence',
+ 'thumbnail': r're:https?://.+\.png',
+ 'uploader_id': 'ottman',
+ 'upload_date': '20130524',
+ 'timestamp': 1369404826,
+ 'uploader': 'Bill Ottman',
+ 'view_count': int,
+ 'like_count': int,
+ 'dislike_count': int,
+ 'tags': ['animation'],
+ 'comment_count': int,
+ 'license': 'attribution-cc',
+ },
+ }, {
+ # entity.type == 'activity' and empty title
+ 'url': 'https://www.minds.com/newsfeed/798025111988506624',
+ 'md5': 'b2733a74af78d7fd3f541c4cbbaa5950',
+ 'info_dict': {
+ 'id': '798022190320226304',
+ 'ext': 'mp4',
+ 'title': '798022190320226304',
+ 'uploader': 'ColinFlaherty',
+ 'upload_date': '20180111',
+ 'timestamp': 1515639316,
+ 'uploader_id': 'ColinFlaherty',
+ },
+ }, {
+ 'url': 'https://www.minds.com/archive/view/715172106794442752',
+ 'only_matching': True,
+ }, {
+ # youtube perma_url
+ 'url': 'https://www.minds.com/newsfeed/1197131838022602752',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ entity_id = self._match_id(url)
+ entity = self._call_api(
+ 'v1/entities/entity/' + entity_id, entity_id, 'entity')['entity']
+ if entity.get('type') == 'activity':
+ if entity.get('custom_type') == 'video':
+ video_id = entity['entity_guid']
+ else:
+ return self.url_result(entity['perma_url'])
+ else:
+ assert(entity['subtype'] == 'video')
+ video_id = entity_id
+ # 1080p and webm formats available only on the sources array
+ video = self._call_api(
+ 'v2/media/video/' + video_id, video_id, 'video')
+
+ formats = []
+ for source in (video.get('sources') or []):
+ src = source.get('src')
+ if not src:
+ continue
+ formats.append({
+ 'format_id': source.get('label'),
+ 'height': int_or_none(source.get('size')),
+ 'url': src,
+ })
+ self._sort_formats(formats)
+
+ entity = video.get('entity') or entity
+ owner = entity.get('ownerObj') or {}
+ uploader_id = owner.get('username')
+
+ tags = entity.get('tags')
+ if tags and isinstance(tags, compat_str):
+ tags = [tags]
+
+ thumbnail = None
+ poster = video.get('poster') or entity.get('thumbnail_src')
+ if poster:
+ urlh = self._request_webpage(poster, video_id, fatal=False)
+ if urlh:
+ thumbnail = urlh.geturl()
+
+ return {
+ 'id': video_id,
+ 'title': entity.get('title') or video_id,
+ 'formats': formats,
+ 'description': clean_html(entity.get('description')) or None,
+ 'license': str_or_none(entity.get('license')),
+ 'timestamp': int_or_none(entity.get('time_created')),
+ 'uploader': strip_or_none(owner.get('name')),
+ 'uploader_id': uploader_id,
+ 'uploader_url': 'https://www.minds.com/' + uploader_id if uploader_id else None,
+ 'view_count': int_or_none(entity.get('play:count')),
+ 'like_count': int_or_none(entity.get('thumbs:up:count')),
+ 'dislike_count': int_or_none(entity.get('thumbs:down:count')),
+ 'tags': tags,
+ 'comment_count': int_or_none(entity.get('comments:count')),
+ 'thumbnail': thumbnail,
+ }
+
+
+class MindsFeedBaseIE(MindsBaseIE):
+ _PAGE_SIZE = 150
+
+ def _entries(self, feed_id):
+ query = {'limit': self._PAGE_SIZE, 'sync': 1}
+ i = 1
+ while True:
+ data = self._call_api(
+ 'v2/feeds/container/%s/videos' % feed_id,
+ feed_id, 'page %s' % i, query)
+ entities = data.get('entities') or []
+ for entity in entities:
+ guid = entity.get('guid')
+ if not guid:
+ continue
+ yield self.url_result(
+ 'https://www.minds.com/newsfeed/' + guid,
+ MindsIE.ie_key(), guid)
+ query['from_timestamp'] = data['load-next']
+ if not (query['from_timestamp'] and len(entities) == self._PAGE_SIZE):
+ break
+ i += 1
+
+ def _real_extract(self, url):
+ feed_id = self._match_id(url)
+ feed = self._call_api(
+ 'v1/%s/%s' % (self._FEED_PATH, feed_id),
+ feed_id, self._FEED_TYPE)[self._FEED_TYPE]
+
+ return self.playlist_result(
+ self._entries(feed['guid']), feed_id,
+ strip_or_none(feed.get('name')),
+ feed.get('briefdescription'))
+
+
+class MindsChannelIE(MindsFeedBaseIE):
+ _FEED_TYPE = 'channel'
+ IE_NAME = 'minds:' + _FEED_TYPE
+ _VALID_URL = MindsBaseIE._VALID_URL_BASE + r'(?!(?:newsfeed|media|api|archive|groups)/)(?P<id>[^/?&#]+)'
+ _FEED_PATH = 'channel'
+ _TEST = {
+ 'url': 'https://www.minds.com/ottman',
+ 'info_dict': {
+ 'id': 'ottman',
+ 'title': 'Bill Ottman',
+ 'description': 'Co-creator & CEO @minds',
+ },
+ 'playlist_mincount': 54,
+ }
+
+
+class MindsGroupIE(MindsFeedBaseIE):
+ _FEED_TYPE = 'group'
+ IE_NAME = 'minds:' + _FEED_TYPE
+ _VALID_URL = MindsBaseIE._VALID_URL_BASE + r'groups/profile/(?P<id>[0-9]+)'
+ _FEED_PATH = 'groups/group'
+ _TEST = {
+ 'url': 'https://www.minds.com/groups/profile/785582576369672204/feed/videos',
+ 'info_dict': {
+ 'id': '785582576369672204',
+ 'title': 'Cooking Videos',
+ },
+ 'playlist_mincount': 1,
+ }
diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py
index 9759560f1..69319857d 100644
--- a/youtube_dl/extractor/mixcloud.py
+++ b/youtube_dl/extractor/mixcloud.py
@@ -251,8 +251,11 @@ class MixcloudPlaylistBaseIE(MixcloudBaseIE):
cloudcast_url = cloudcast.get('url')
if not cloudcast_url:
continue
+ slug = try_get(cloudcast, lambda x: x['slug'], compat_str)
+ owner_username = try_get(cloudcast, lambda x: x['owner']['username'], compat_str)
+ video_id = '%s_%s' % (owner_username, slug) if slug and owner_username else None
entries.append(self.url_result(
- cloudcast_url, MixcloudIE.ie_key(), cloudcast.get('slug')))
+ cloudcast_url, MixcloudIE.ie_key(), video_id))
page_info = items['pageInfo']
has_next_page = page_info['hasNextPage']
@@ -321,7 +324,8 @@ class MixcloudUserIE(MixcloudPlaylistBaseIE):
_DESCRIPTION_KEY = 'biog'
_ROOT_TYPE = 'user'
_NODE_TEMPLATE = '''slug
- url'''
+ url
+ owner { username }'''
def _get_playlist_title(self, title, slug):
return '%s (%s)' % (title, slug)
@@ -345,6 +349,7 @@ class MixcloudPlaylistIE(MixcloudPlaylistBaseIE):
_NODE_TEMPLATE = '''cloudcast {
slug
url
+ owner { username }
}'''
def _get_cloudcast(self, node):
diff --git a/youtube_dl/extractor/mlb.py b/youtube_dl/extractor/mlb.py
index b907f6b49..b69301d97 100644
--- a/youtube_dl/extractor/mlb.py
+++ b/youtube_dl/extractor/mlb.py
@@ -1,15 +1,91 @@
from __future__ import unicode_literals
-from .nhl import NHLBaseIE
+import re
+from .common import InfoExtractor
+from ..utils import (
+ determine_ext,
+ int_or_none,
+ parse_duration,
+ parse_iso8601,
+ try_get,
+)
-class MLBIE(NHLBaseIE):
+
+class MLBBaseIE(InfoExtractor):
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ video = self._download_video_data(display_id)
+ video_id = video['id']
+ title = video['title']
+ feed = self._get_feed(video)
+
+ formats = []
+ for playback in (feed.get('playbacks') or []):
+ playback_url = playback.get('url')
+ if not playback_url:
+ continue
+ name = playback.get('name')
+ ext = determine_ext(playback_url)
+ if ext == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ playback_url, video_id, 'mp4',
+ 'm3u8_native', m3u8_id=name, fatal=False))
+ else:
+ f = {
+ 'format_id': name,
+ 'url': playback_url,
+ }
+ mobj = re.search(r'_(\d+)K_(\d+)X(\d+)', name)
+ if mobj:
+ f.update({
+ 'height': int(mobj.group(3)),
+ 'tbr': int(mobj.group(1)),
+ 'width': int(mobj.group(2)),
+ })
+ mobj = re.search(r'_(\d+)x(\d+)_(\d+)_(\d+)K\.mp4', playback_url)
+ if mobj:
+ f.update({
+ 'fps': int(mobj.group(3)),
+ 'height': int(mobj.group(2)),
+ 'tbr': int(mobj.group(4)),
+ 'width': int(mobj.group(1)),
+ })
+ formats.append(f)
+ self._sort_formats(formats)
+
+ thumbnails = []
+ for cut in (try_get(feed, lambda x: x['image']['cuts'], list) or []):
+ src = cut.get('src')
+ if not src:
+ continue
+ thumbnails.append({
+ 'height': int_or_none(cut.get('height')),
+ 'url': src,
+ 'width': int_or_none(cut.get('width')),
+ })
+
+ language = (video.get('language') or 'EN').lower()
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'formats': formats,
+ 'description': video.get('description'),
+ 'duration': parse_duration(feed.get('duration')),
+ 'thumbnails': thumbnails,
+ 'timestamp': parse_iso8601(video.get(self._TIMESTAMP_KEY)),
+ 'subtitles': self._extract_mlb_subtitles(feed, language),
+ }
+
+
+class MLBIE(MLBBaseIE):
_VALID_URL = r'''(?x)
https?://
- (?:[\da-z_-]+\.)*(?P<site>mlb)\.com/
+ (?:[\da-z_-]+\.)*mlb\.com/
(?:
(?:
- (?:[^/]+/)*c-|
+ (?:[^/]+/)*video/[^/]+/c-|
(?:
shared/video/embed/(?:embed|m-internal-embed)\.html|
(?:[^/]+/)+(?:play|index)\.jsp|
@@ -18,7 +94,6 @@ class MLBIE(NHLBaseIE):
(?P<id>\d+)
)
'''
- _CONTENT_DOMAIN = 'content.mlb.com'
_TESTS = [
{
'url': 'https://www.mlb.com/mariners/video/ackleys-spectacular-catch/c-34698933',
@@ -77,18 +152,6 @@ class MLBIE(NHLBaseIE):
},
},
{
- 'url': 'https://www.mlb.com/news/blue-jays-kevin-pillar-goes-spidey-up-the-wall-to-rob-tim-beckham-of-a-homer/c-118550098',
- 'md5': 'e09e37b552351fddbf4d9e699c924d68',
- 'info_dict': {
- 'id': '75609783',
- 'ext': 'mp4',
- 'title': 'Must C: Pillar climbs for catch',
- 'description': '4/15/15: Blue Jays outfielder Kevin Pillar continues his defensive dominance by climbing the wall in left to rob Tim Beckham of a home run',
- 'timestamp': 1429139220,
- 'upload_date': '20150415',
- }
- },
- {
'url': 'https://www.mlb.com/video/hargrove-homers-off-caldwell/c-1352023483?tid=67793694',
'only_matching': True,
},
@@ -113,8 +176,92 @@ class MLBIE(NHLBaseIE):
'url': 'http://mlb.mlb.com/shared/video/embed/m-internal-embed.html?content_id=75609783&property=mlb&autoplay=true&hashmode=false&siteSection=mlb/multimedia/article_118550098/article_embed&club=mlb',
'only_matching': True,
},
- {
- 'url': 'https://www.mlb.com/cut4/carlos-gomez-borrowed-sunglasses-from-an-as-fan/c-278912842',
- 'only_matching': True,
- }
]
+ _TIMESTAMP_KEY = 'date'
+
+ @staticmethod
+ def _get_feed(video):
+ return video
+
+ @staticmethod
+ def _extract_mlb_subtitles(feed, language):
+ subtitles = {}
+ for keyword in (feed.get('keywordsAll') or []):
+ keyword_type = keyword.get('type')
+ if keyword_type and keyword_type.startswith('closed_captions_location_'):
+ cc_location = keyword.get('value')
+ if cc_location:
+ subtitles.setdefault(language, []).append({
+ 'url': cc_location,
+ })
+ return subtitles
+
+ def _download_video_data(self, display_id):
+ return self._download_json(
+ 'http://content.mlb.com/mlb/item/id/v1/%s/details/web-v1.json' % display_id,
+ display_id)
+
+
+class MLBVideoIE(MLBBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?mlb\.com/(?:[^/]+/)*video/(?P<id>[^/?&#]+)'
+ _TEST = {
+ 'url': 'https://www.mlb.com/mariners/video/ackley-s-spectacular-catch-c34698933',
+ 'md5': '632358dacfceec06bad823b83d21df2d',
+ 'info_dict': {
+ 'id': 'c04a8863-f569-42e6-9f87-992393657614',
+ 'ext': 'mp4',
+ 'title': "Ackley's spectacular catch",
+ 'description': 'md5:7f5a981eb4f3cbc8daf2aeffa2215bf0',
+ 'duration': 66,
+ 'timestamp': 1405995000,
+ 'upload_date': '20140722',
+ 'thumbnail': r're:^https?://.+',
+ },
+ }
+ _TIMESTAMP_KEY = 'timestamp'
+
+ @classmethod
+ def suitable(cls, url):
+ return False if MLBIE.suitable(url) else super(MLBVideoIE, cls).suitable(url)
+
+ @staticmethod
+ def _get_feed(video):
+ return video['feeds'][0]
+
+ @staticmethod
+ def _extract_mlb_subtitles(feed, language):
+ subtitles = {}
+ for cc_location in (feed.get('closedCaptions') or []):
+ subtitles.setdefault(language, []).append({
+ 'url': cc_location,
+ })
+ return subtitles
+
+ def _download_video_data(self, display_id):
+ # https://www.mlb.com/data-service/en/videos/[SLUG]
+ return self._download_json(
+ 'https://fastball-gateway.mlb.com/graphql',
+ display_id, query={
+ 'query': '''{
+ mediaPlayback(ids: "%s") {
+ description
+ feeds(types: CMS) {
+ closedCaptions
+ duration
+ image {
+ cuts {
+ width
+ height
+ src
+ }
+ }
+ playbacks {
+ name
+ url
+ }
+ }
+ id
+ timestamp
+ title
+ }
+}''' % display_id,
+ })['data']['mediaPlayback'][0]
diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py
index df1034fc5..5a5205c0e 100644
--- a/youtube_dl/extractor/mtv.py
+++ b/youtube_dl/extractor/mtv.py
@@ -253,6 +253,12 @@ class MTVServicesInfoExtractor(InfoExtractor):
return try_get(feed, lambda x: x['result']['data']['id'], compat_str)
+ @staticmethod
+ def _extract_child_with_type(parent, t):
+ for c in parent['children']:
+ if c.get('type') == t:
+ return c
+
def _extract_mgid(self, webpage):
try:
# the url can be http://media.mtvnservices.com/fb/{mgid}.swf
@@ -278,6 +284,14 @@ class MTVServicesInfoExtractor(InfoExtractor):
if not mgid:
mgid = self._extract_triforce_mgid(webpage)
+ if not mgid:
+ data = self._parse_json(self._search_regex(
+ r'__DATA__\s*=\s*({.+?});', webpage, 'data'), None)
+ main_container = self._extract_child_with_type(data, 'MainContainer')
+ ab_testing = self._extract_child_with_type(main_container, 'ABTesting')
+ video_player = self._extract_child_with_type(ab_testing or main_container, 'VideoPlayer')
+ mgid = video_player['props']['media']['video']['config']['uri']
+
return mgid
def _real_extract(self, url):
@@ -309,7 +323,7 @@ class MTVServicesEmbeddedIE(MTVServicesInfoExtractor):
@staticmethod
def _extract_url(webpage):
mobj = re.search(
- r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//media.mtvnservices.com/embed/.+?)\1', webpage)
+ r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//media\.mtvnservices\.com/embed/.+?)\1', webpage)
if mobj:
return mobj.group('url')
@@ -349,18 +363,6 @@ class MTVIE(MTVServicesInfoExtractor):
'only_matching': True,
}]
- @staticmethod
- def extract_child_with_type(parent, t):
- children = parent['children']
- return next(c for c in children if c.get('type') == t)
-
- def _extract_mgid(self, webpage):
- data = self._parse_json(self._search_regex(
- r'__DATA__\s*=\s*({.+?});', webpage, 'data'), None)
- main_container = self.extract_child_with_type(data, 'MainContainer')
- video_player = self.extract_child_with_type(main_container, 'VideoPlayer')
- return video_player['props']['media']['video']['config']['uri']
-
class MTVJapanIE(MTVServicesInfoExtractor):
IE_NAME = 'mtvjapan'
diff --git a/youtube_dl/extractor/ninecninemedia.py b/youtube_dl/extractor/ninecninemedia.py
index a569c889e..cfc220314 100644
--- a/youtube_dl/extractor/ninecninemedia.py
+++ b/youtube_dl/extractor/ninecninemedia.py
@@ -23,11 +23,9 @@ class NineCNineMediaIE(InfoExtractor):
destination_code, content_id = re.match(self._VALID_URL, url).groups()
api_base_url = self._API_BASE_TEMPLATE % (destination_code, content_id)
content = self._download_json(api_base_url, content_id, query={
- '$include': '[Media,Season,ContentPackages]',
+ '$include': '[Media.Name,Season,ContentPackages.Duration,ContentPackages.Id]',
})
title = content['Name']
- if len(content['ContentPackages']) > 1:
- raise ExtractorError('multiple content packages')
content_package = content['ContentPackages'][0]
package_id = content_package['Id']
content_package_url = api_base_url + 'contentpackages/%s/' % package_id
diff --git a/youtube_dl/extractor/ninegag.py b/youtube_dl/extractor/ninegag.py
index dc6a27d36..14390823b 100644
--- a/youtube_dl/extractor/ninegag.py
+++ b/youtube_dl/extractor/ninegag.py
@@ -1,104 +1,130 @@
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
-from ..utils import str_to_int
+from ..utils import (
+ ExtractorError,
+ determine_ext,
+ int_or_none,
+ try_get,
+ unescapeHTML,
+ url_or_none,
+)
class NineGagIE(InfoExtractor):
IE_NAME = '9gag'
- _VALID_URL = r'https?://(?:www\.)?9gag(?:\.com/tv|\.tv)/(?:p|embed)/(?P<id>[a-zA-Z0-9]+)(?:/(?P<display_id>[^?#/]+))?'
+ _VALID_URL = r'https?://(?:www\.)?9gag\.com/gag/(?P<id>[^/?&#]+)'
_TESTS = [{
- 'url': 'http://9gag.com/tv/p/Kk2X5/people-are-awesome-2013-is-absolutely-awesome',
- 'info_dict': {
- 'id': 'kXzwOKyGlSA',
- 'ext': 'mp4',
- 'description': 'This 3-minute video will make you smile and then make you feel untalented and insignificant. Anyway, you should share this awesomeness. (Thanks, Dino!)',
- 'title': '\"People Are Awesome 2013\" Is Absolutely Awesome',
- 'uploader_id': 'UCdEH6EjDKwtTe-sO2f0_1XA',
- 'uploader': 'CompilationChannel',
- 'upload_date': '20131110',
- 'view_count': int,
- },
- 'add_ie': ['Youtube'],
- }, {
- 'url': 'http://9gag.com/tv/p/aKolP3',
+ 'url': 'https://9gag.com/gag/ae5Ag7B',
'info_dict': {
- 'id': 'aKolP3',
+ 'id': 'ae5Ag7B',
'ext': 'mp4',
- 'title': 'This Guy Travelled 11 countries In 44 days Just To Make This Amazing Video',
- 'description': "I just saw more in 1 minute than I've seen in 1 year. This guy's video is epic!!",
- 'uploader_id': 'rickmereki',
- 'uploader': 'Rick Mereki',
- 'upload_date': '20110803',
- 'view_count': int,
- },
- 'add_ie': ['Vimeo'],
- }, {
- 'url': 'http://9gag.com/tv/p/KklwM',
- 'only_matching': True,
- }, {
- 'url': 'http://9gag.tv/p/Kk2X5',
- 'only_matching': True,
+ 'title': 'Capybara Agility Training',
+ 'upload_date': '20191108',
+ 'timestamp': 1573237208,
+ 'categories': ['Awesome'],
+ 'tags': ['Weimaraner', 'American Pit Bull Terrier'],
+ 'duration': 44,
+ 'like_count': int,
+ 'dislike_count': int,
+ 'comment_count': int,
+ }
}, {
- 'url': 'http://9gag.com/tv/embed/a5Dmvl',
+ # HTML escaped title
+ 'url': 'https://9gag.com/gag/av5nvyb',
'only_matching': True,
}]
- _EXTERNAL_VIDEO_PROVIDER = {
- '1': {
- 'url': '%s',
- 'ie_key': 'Youtube',
- },
- '2': {
- 'url': 'http://player.vimeo.com/video/%s',
- 'ie_key': 'Vimeo',
- },
- '3': {
- 'url': 'http://instagram.com/p/%s',
- 'ie_key': 'Instagram',
- },
- '4': {
- 'url': 'http://vine.co/v/%s',
- 'ie_key': 'Vine',
- },
- }
-
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
- display_id = mobj.group('display_id') or video_id
+ post_id = self._match_id(url)
+ post = self._download_json(
+ 'https://9gag.com/v1/post', post_id, query={
+ 'id': post_id
+ })['data']['post']
+
+ if post.get('type') != 'Animated':
+ raise ExtractorError(
+ 'The given url does not contain a video',
+ expected=True)
+
+ title = unescapeHTML(post['title'])
+
+ duration = None
+ formats = []
+ thumbnails = []
+ for key, image in (post.get('images') or {}).items():
+ image_url = url_or_none(image.get('url'))
+ if not image_url:
+ continue
+ ext = determine_ext(image_url)
+ image_id = key.strip('image')
+ common = {
+ 'url': image_url,
+ 'width': int_or_none(image.get('width')),
+ 'height': int_or_none(image.get('height')),
+ }
+ if ext in ('jpg', 'png'):
+ webp_url = image.get('webpUrl')
+ if webp_url:
+ t = common.copy()
+ t.update({
+ 'id': image_id + '-webp',
+ 'url': webp_url,
+ })
+ thumbnails.append(t)
+ common.update({
+ 'id': image_id,
+ 'ext': ext,
+ })
+ thumbnails.append(common)
+ elif ext in ('webm', 'mp4'):
+ if not duration:
+ duration = int_or_none(image.get('duration'))
+ common['acodec'] = 'none' if image.get('hasAudio') == 0 else None
+ for vcodec in ('vp8', 'vp9', 'h265'):
+ c_url = image.get(vcodec + 'Url')
+ if not c_url:
+ continue
+ c_f = common.copy()
+ c_f.update({
+ 'format_id': image_id + '-' + vcodec,
+ 'url': c_url,
+ 'vcodec': vcodec,
+ })
+ formats.append(c_f)
+ common.update({
+ 'ext': ext,
+ 'format_id': image_id,
+ })
+ formats.append(common)
+ self._sort_formats(formats)
- webpage = self._download_webpage(url, display_id)
+ section = try_get(post, lambda x: x['postSection']['name'])
- post_view = self._parse_json(
- self._search_regex(
- r'var\s+postView\s*=\s*new\s+app\.PostView\({\s*post:\s*({.+?})\s*,\s*posts:\s*prefetchedCurrentPost',
- webpage, 'post view'),
- display_id)
+ tags = None
+ post_tags = post.get('tags')
+ if post_tags:
+ tags = []
+ for tag in post_tags:
+ tag_key = tag.get('key')
+ if not tag_key:
+ continue
+ tags.append(tag_key)
- ie_key = None
- source_url = post_view.get('sourceUrl')
- if not source_url:
- external_video_id = post_view['videoExternalId']
- external_video_provider = post_view['videoExternalProvider']
- source_url = self._EXTERNAL_VIDEO_PROVIDER[external_video_provider]['url'] % external_video_id
- ie_key = self._EXTERNAL_VIDEO_PROVIDER[external_video_provider]['ie_key']
- title = post_view['title']
- description = post_view.get('description')
- view_count = str_to_int(post_view.get('externalView'))
- thumbnail = post_view.get('thumbnail_700w') or post_view.get('ogImageUrl') or post_view.get('thumbnail_300w')
+ get_count = lambda x: int_or_none(post.get(x + 'Count'))
return {
- '_type': 'url_transparent',
- 'url': source_url,
- 'ie_key': ie_key,
- 'id': video_id,
- 'display_id': display_id,
+ 'id': post_id,
'title': title,
- 'description': description,
- 'view_count': view_count,
- 'thumbnail': thumbnail,
+ 'timestamp': int_or_none(post.get('creationTs')),
+ 'duration': duration,
+ 'formats': formats,
+ 'thumbnails': thumbnails,
+ 'like_count': get_count('upVote'),
+ 'dislike_count': get_count('downVote'),
+ 'comment_count': get_count('comments'),
+ 'age_limit': 18 if post.get('nsfw') == 1 else None,
+ 'categories': [section] if section else None,
+ 'tags': tags,
}
diff --git a/youtube_dl/extractor/njpwworld.py b/youtube_dl/extractor/njpwworld.py
index 025c5d249..3639d142f 100644
--- a/youtube_dl/extractor/njpwworld.py
+++ b/youtube_dl/extractor/njpwworld.py
@@ -6,30 +6,40 @@ import re
from .common import InfoExtractor
from ..compat import compat_urlparse
from ..utils import (
- extract_attributes,
get_element_by_class,
urlencode_postdata,
)
class NJPWWorldIE(InfoExtractor):
- _VALID_URL = r'https?://njpwworld\.com/p/(?P<id>[a-z0-9_]+)'
+ _VALID_URL = r'https?://(front\.)?njpwworld\.com/p/(?P<id>[a-z0-9_]+)'
IE_DESC = '新日本プロレスワールド'
_NETRC_MACHINE = 'njpwworld'
- _TEST = {
+ _TESTS = [{
'url': 'http://njpwworld.com/p/s_series_00155_1_9/',
'info_dict': {
'id': 's_series_00155_1_9',
'ext': 'mp4',
- 'title': '第9試合 ランディ・サベージ vs リック・スタイナー',
+ 'title': '闘強導夢2000 2000年1月4日 東京ドーム 第9試合 ランディ・サベージ VS リック・スタイナー',
'tags': list,
},
'params': {
'skip_download': True, # AES-encrypted m3u8
},
'skip': 'Requires login',
- }
+ }, {
+ 'url': 'https://front.njpwworld.com/p/s_series_00563_16_bs',
+ 'info_dict': {
+ 'id': 's_series_00563_16_bs',
+ 'ext': 'mp4',
+ 'title': 'WORLD TAG LEAGUE 2020 & BEST OF THE SUPER Jr.27 2020年12月6日 福岡・福岡国際センター バックステージコメント(字幕あり)',
+ 'tags': ["福岡・福岡国際センター", "バックステージコメント", "2020", "20年代"],
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }]
_LOGIN_URL = 'https://front.njpwworld.com/auth/login'
@@ -64,35 +74,27 @@ class NJPWWorldIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
formats = []
- for mobj in re.finditer(r'<a[^>]+\bhref=(["\'])/player.+?[^>]*>', webpage):
- player = extract_attributes(mobj.group(0))
- player_path = player.get('href')
- if not player_path:
- continue
- kind = self._search_regex(
- r'(low|high)$', player.get('class') or '', 'kind',
- default='low')
+ for kind, vid in re.findall(r'if\s+\(\s*imageQualityType\s*==\s*\'([^\']+)\'\s*\)\s*{\s*video_id\s*=\s*"(\d+)"', webpage):
+ player_path = '/intent?id=%s&type=url' % vid
player_url = compat_urlparse.urljoin(url, player_path)
- player_page = self._download_webpage(
- player_url, video_id, note='Downloading player page')
- entries = self._parse_html5_media_entries(
- player_url, player_page, video_id, m3u8_id='hls-%s' % kind,
- m3u8_entry_protocol='m3u8_native')
- kind_formats = entries[0]['formats']
- for f in kind_formats:
- f['quality'] = 2 if kind == 'high' else 1
- formats.extend(kind_formats)
+ formats.append({
+ 'url': player_url,
+ 'format_id': kind,
+ 'ext': 'mp4',
+ 'protocol': 'm3u8',
+ 'quality': 2 if kind == 'high' else 1,
+ })
self._sort_formats(formats)
- post_content = get_element_by_class('post-content', webpage)
+ tag_block = get_element_by_class('tag-block', webpage)
tags = re.findall(
- r'<li[^>]+class="tag-[^"]+"><a[^>]*>([^<]+)</a></li>', post_content
- ) if post_content else None
+ r'<a[^>]+class="tag-[^"]+"[^>]*>([^<]+)</a>', tag_block
+ ) if tag_block else None
return {
'id': video_id,
- 'title': self._og_search_title(webpage),
+ 'title': get_element_by_class('article-title', webpage) or self._og_search_title(webpage),
'formats': formats,
'tags': tags,
}
diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py
index 40dee2162..6d01a25c3 100644
--- a/youtube_dl/extractor/nrk.py
+++ b/youtube_dl/extractor/nrk.py
@@ -58,7 +58,7 @@ class NRKBaseIE(InfoExtractor):
def _call_api(self, path, video_id, item=None, note=None, fatal=True, query=None):
return self._download_json(
- urljoin('http://psapi.nrk.no/', path),
+ urljoin('https://psapi.nrk.no/', path),
video_id, note or 'Downloading %s JSON' % item,
fatal=fatal, query=query,
headers={'Accept-Encoding': 'gzip, deflate, br'})
diff --git a/youtube_dl/extractor/orf.py b/youtube_dl/extractor/orf.py
index 700ce448c..8d537d7ae 100644
--- a/youtube_dl/extractor/orf.py
+++ b/youtube_dl/extractor/orf.py
@@ -98,6 +98,9 @@ class ORFTVthekIE(InfoExtractor):
elif ext == 'f4m':
formats.extend(self._extract_f4m_formats(
src, video_id, f4m_id=format_id, fatal=False))
+ elif ext == 'mpd':
+ formats.extend(self._extract_mpd_formats(
+ src, video_id, mpd_id=format_id, fatal=False))
else:
formats.append({
'format_id': format_id,
@@ -140,6 +143,25 @@ class ORFTVthekIE(InfoExtractor):
})
upload_date = unified_strdate(sd.get('created_date'))
+
+ thumbnails = []
+ preview = sd.get('preview_image_url')
+ if preview:
+ thumbnails.append({
+ 'id': 'preview',
+ 'url': preview,
+ 'preference': 0,
+ })
+ image = sd.get('image_full_url')
+ if not image and len(data_jsb) == 1:
+ image = self._og_search_thumbnail(webpage)
+ if image:
+ thumbnails.append({
+ 'id': 'full',
+ 'url': image,
+ 'preference': 1,
+ })
+
entries.append({
'_type': 'video',
'id': video_id,
@@ -149,7 +171,7 @@ class ORFTVthekIE(InfoExtractor):
'description': sd.get('description'),
'duration': int_or_none(sd.get('duration_in_seconds')),
'upload_date': upload_date,
- 'thumbnail': sd.get('image_full_url'),
+ 'thumbnails': thumbnails,
})
return {
@@ -182,7 +204,7 @@ class ORFRadioIE(InfoExtractor):
duration = end - start if end and start else None
entries.append({
'id': loop_stream_id.replace('.mp3', ''),
- 'url': 'http://loopstream01.apa.at/?channel=%s&id=%s' % (self._LOOP_STATION, loop_stream_id),
+ 'url': 'https://loopstream01.apa.at/?channel=%s&id=%s' % (self._LOOP_STATION, loop_stream_id),
'title': title,
'description': clean_html(data.get('subtitle')),
'duration': duration,
diff --git a/youtube_dl/extractor/palcomp3.py b/youtube_dl/extractor/palcomp3.py
new file mode 100644
index 000000000..fb29d83f9
--- /dev/null
+++ b/youtube_dl/extractor/palcomp3.py
@@ -0,0 +1,148 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+ int_or_none,
+ str_or_none,
+ try_get,
+)
+
+
+class PalcoMP3BaseIE(InfoExtractor):
+ _GQL_QUERY_TMPL = '''{
+ artist(slug: "%s") {
+ %s
+ }
+}'''
+ _ARTIST_FIELDS_TMPL = '''music(slug: "%%s") {
+ %s
+ }'''
+ _MUSIC_FIELDS = '''duration
+ hls
+ mp3File
+ musicID
+ plays
+ title'''
+
+ def _call_api(self, artist_slug, artist_fields):
+ return self._download_json(
+ 'https://www.palcomp3.com.br/graphql/', artist_slug, query={
+ 'query': self._GQL_QUERY_TMPL % (artist_slug, artist_fields),
+ })['data']
+
+ def _parse_music(self, music):
+ music_id = compat_str(music['musicID'])
+ title = music['title']
+
+ formats = []
+ hls_url = music.get('hls')
+ if hls_url:
+ formats.append({
+ 'url': hls_url,
+ 'protocol': 'm3u8_native',
+ 'ext': 'mp4',
+ })
+ mp3_file = music.get('mp3File')
+ if mp3_file:
+ formats.append({
+ 'url': mp3_file,
+ })
+
+ return {
+ 'id': music_id,
+ 'title': title,
+ 'formats': formats,
+ 'duration': int_or_none(music.get('duration')),
+ 'view_count': int_or_none(music.get('plays')),
+ }
+
+ def _real_initialize(self):
+ self._ARTIST_FIELDS_TMPL = self._ARTIST_FIELDS_TMPL % self._MUSIC_FIELDS
+
+ def _real_extract(self, url):
+ artist_slug, music_slug = re.match(self._VALID_URL, url).groups()
+ artist_fields = self._ARTIST_FIELDS_TMPL % music_slug
+ music = self._call_api(artist_slug, artist_fields)['artist']['music']
+ return self._parse_music(music)
+
+
+class PalcoMP3IE(PalcoMP3BaseIE):
+ IE_NAME = 'PalcoMP3:song'
+ _VALID_URL = r'https?://(?:www\.)?palcomp3\.com(?:\.br)?/(?P<artist>[^/]+)/(?P<id>[^/?&#]+)'
+ _TESTS = [{
+ 'url': 'https://www.palcomp3.com/maiaraemaraisaoficial/nossas-composicoes-cuida-bem-dela/',
+ 'md5': '99fd6405b2d8fd589670f6db1ba3b358',
+ 'info_dict': {
+ 'id': '3162927',
+ 'ext': 'mp3',
+ 'title': 'Nossas Composições - CUIDA BEM DELA',
+ 'duration': 210,
+ 'view_count': int,
+ }
+ }]
+
+ @classmethod
+ def suitable(cls, url):
+ return False if PalcoMP3VideoIE.suitable(url) else super(PalcoMP3IE, cls).suitable(url)
+
+
+class PalcoMP3ArtistIE(PalcoMP3BaseIE):
+ IE_NAME = 'PalcoMP3:artist'
+ _VALID_URL = r'https?://(?:www\.)?palcomp3\.com(?:\.br)?/(?P<id>[^/?&#]+)'
+ _TESTS = [{
+ 'url': 'https://www.palcomp3.com.br/condedoforro/',
+ 'info_dict': {
+ 'id': '358396',
+ 'title': 'Conde do Forró',
+ },
+ 'playlist_mincount': 188,
+ }]
+ _ARTIST_FIELDS_TMPL = '''artistID
+ musics {
+ nodes {
+ %s
+ }
+ }
+ name'''
+
+ @ classmethod
+ def suitable(cls, url):
+ return False if re.match(PalcoMP3IE._VALID_URL, url) else super(PalcoMP3ArtistIE, cls).suitable(url)
+
+ def _real_extract(self, url):
+ artist_slug = self._match_id(url)
+ artist = self._call_api(artist_slug, self._ARTIST_FIELDS_TMPL)['artist']
+
+ def entries():
+ for music in (try_get(artist, lambda x: x['musics']['nodes'], list) or []):
+ yield self._parse_music(music)
+
+ return self.playlist_result(
+ entries(), str_or_none(artist.get('artistID')), artist.get('name'))
+
+
+class PalcoMP3VideoIE(PalcoMP3BaseIE):
+ IE_NAME = 'PalcoMP3:video'
+ _VALID_URL = r'https?://(?:www\.)?palcomp3\.com(?:\.br)?/(?P<artist>[^/]+)/(?P<id>[^/?&#]+)/?#clipe'
+ _TESTS = [{
+ 'url': 'https://www.palcomp3.com/maiaraemaraisaoficial/maiara-e-maraisa-voce-faz-falta-aqui-ao-vivo-em-vicosa-mg/#clipe',
+ 'add_ie': ['Youtube'],
+ 'info_dict': {
+ 'id': '_pD1nR2qqPg',
+ 'ext': 'mp4',
+ 'title': 'Maiara e Maraisa - Você Faz Falta Aqui - DVD Ao Vivo Em Campo Grande',
+ 'description': 'md5:7043342c09a224598e93546e98e49282',
+ 'upload_date': '20161107',
+ 'uploader_id': 'maiaramaraisaoficial',
+ 'uploader': 'Maiara e Maraisa',
+ }
+ }]
+ _MUSIC_FIELDS = 'youtubeID'
+
+ def _parse_music(self, music):
+ youtube_id = music['youtubeID']
+ return self.url_result(youtube_id, 'Youtube', youtube_id)
diff --git a/youtube_dl/extractor/peertube.py b/youtube_dl/extractor/peertube.py
index c39d12728..3af533925 100644
--- a/youtube_dl/extractor/peertube.py
+++ b/youtube_dl/extractor/peertube.py
@@ -413,7 +413,8 @@ class PeerTubeIE(InfoExtractor):
peertube3\.cpy\.re|
peertube2\.cpy\.re|
videos\.tcit\.fr|
- peertube\.cpy\.re
+ peertube\.cpy\.re|
+ canard\.tube
)'''
_UUID_RE = r'[\da-fA-F]{8}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{12}'
_API_BASE = 'https://%s/api/v1/videos/%s/%s'
@@ -451,6 +452,18 @@ class PeerTubeIE(InfoExtractor):
'categories': ['Science & Technology'],
}
}, {
+ # Issue #26002
+ 'url': 'peertube:spacepub.space:d8943b2d-8280-497b-85ec-bc282ec2afdc',
+ 'info_dict': {
+ 'id': 'd8943b2d-8280-497b-85ec-bc282ec2afdc',
+ 'ext': 'mp4',
+ 'title': 'Dot matrix printer shell demo',
+ 'uploader_id': '3',
+ 'timestamp': 1587401293,
+ 'upload_date': '20200420',
+ 'uploader': 'Drew DeVault',
+ }
+ }, {
'url': 'https://peertube.tamanoir.foucry.net/videos/watch/0b04f13d-1e18-4f1d-814e-4979aa7c9c44',
'only_matching': True,
}, {
@@ -526,7 +539,15 @@ class PeerTubeIE(InfoExtractor):
title = video['name']
formats = []
- for file_ in video['files']:
+ files = video.get('files') or []
+ for playlist in (video.get('streamingPlaylists') or []):
+ if not isinstance(playlist, dict):
+ continue
+ playlist_files = playlist.get('files')
+ if not (playlist_files and isinstance(playlist_files, list)):
+ continue
+ files.extend(playlist_files)
+ for file_ in files:
if not isinstance(file_, dict):
continue
file_url = url_or_none(file_.get('fileUrl'))
@@ -548,15 +569,15 @@ class PeerTubeIE(InfoExtractor):
formats.append(f)
self._sort_formats(formats)
- full_description = self._call_api(
- host, video_id, 'description', note='Downloading description JSON',
- fatal=False)
+ description = video.get('description')
+ if len(description) >= 250:
+ # description is shortened
+ full_description = self._call_api(
+ host, video_id, 'description', note='Downloading description JSON',
+ fatal=False)
- description = None
- if isinstance(full_description, dict):
- description = str_or_none(full_description.get('description'))
- if not description:
- description = video.get('description')
+ if isinstance(full_description, dict):
+ description = str_or_none(full_description.get('description')) or description
subtitles = self.extract_subtitles(host, video_id)
@@ -578,11 +599,13 @@ class PeerTubeIE(InfoExtractor):
else:
age_limit = None
+ webpage_url = 'https://%s/videos/watch/%s' % (host, video_id)
+
return {
'id': video_id,
'title': title,
'description': description,
- 'thumbnail': urljoin(url, video.get('thumbnailPath')),
+ 'thumbnail': urljoin(webpage_url, video.get('thumbnailPath')),
'timestamp': unified_timestamp(video.get('publishedAt')),
'uploader': account_data('displayName', compat_str),
'uploader_id': str_or_none(account_data('id', int)),
@@ -600,5 +623,6 @@ class PeerTubeIE(InfoExtractor):
'tags': try_get(video, lambda x: x['tags'], list),
'categories': categories,
'formats': formats,
- 'subtitles': subtitles
+ 'subtitles': subtitles,
+ 'webpage_url': webpage_url,
}
diff --git a/youtube_dl/extractor/periscope.py b/youtube_dl/extractor/periscope.py
index b15906390..b93a02b7d 100644
--- a/youtube_dl/extractor/periscope.py
+++ b/youtube_dl/extractor/periscope.py
@@ -12,6 +12,10 @@ from ..utils import (
class PeriscopeBaseIE(InfoExtractor):
+ _M3U8_HEADERS = {
+ 'Referer': 'https://www.periscope.tv/'
+ }
+
def _call_api(self, method, query, item_id):
return self._download_json(
'https://api.periscope.tv/api/v2/%s' % method,
@@ -54,9 +58,11 @@ class PeriscopeBaseIE(InfoExtractor):
m3u8_url, video_id, 'mp4',
entry_protocol='m3u8_native'
if state in ('ended', 'timed_out') else 'm3u8',
- m3u8_id=format_id, fatal=fatal)
+ m3u8_id=format_id, fatal=fatal, headers=self._M3U8_HEADERS)
if len(m3u8_formats) == 1:
self._add_width_and_height(m3u8_formats[0], width, height)
+ for f in m3u8_formats:
+ f.setdefault('http_headers', {}).update(self._M3U8_HEADERS)
return m3u8_formats
diff --git a/youtube_dl/extractor/phoenix.py b/youtube_dl/extractor/phoenix.py
index e435c28e1..e3ea01443 100644
--- a/youtube_dl/extractor/phoenix.py
+++ b/youtube_dl/extractor/phoenix.py
@@ -1,45 +1,133 @@
+# coding: utf-8
from __future__ import unicode_literals
-from .dreisat import DreiSatIE
+import re
+from .youtube import YoutubeIE
+from .zdf import ZDFBaseIE
+from ..compat import compat_str
+from ..utils import (
+ int_or_none,
+ merge_dicts,
+ try_get,
+ unified_timestamp,
+ urljoin,
+)
-class PhoenixIE(DreiSatIE):
+
+class PhoenixIE(ZDFBaseIE):
IE_NAME = 'phoenix.de'
- _VALID_URL = r'''(?x)https?://(?:www\.)?phoenix\.de/content/
- (?:
- phoenix/die_sendungen/(?:[^/]+/)?
- )?
- (?P<id>[0-9]+)'''
- _TESTS = [
- {
- 'url': 'http://www.phoenix.de/content/884301',
- 'md5': 'ed249f045256150c92e72dbb70eadec6',
- 'info_dict': {
- 'id': '884301',
- 'ext': 'mp4',
- 'title': 'Michael Krons mit Hans-Werner Sinn',
- 'description': 'Im Dialog - Sa. 25.10.14, 00.00 - 00.35 Uhr',
- 'upload_date': '20141025',
- 'uploader': 'Im Dialog',
- }
+ _VALID_URL = r'https?://(?:www\.)?phoenix\.de/(?:[^/]+/)*[^/?#&]*-a-(?P<id>\d+)\.html'
+ _TESTS = [{
+ # Same as https://www.zdf.de/politik/phoenix-sendungen/wohin-fuehrt-der-protest-in-der-pandemie-100.html
+ 'url': 'https://www.phoenix.de/sendungen/ereignisse/corona-nachgehakt/wohin-fuehrt-der-protest-in-der-pandemie-a-2050630.html',
+ 'md5': '34ec321e7eb34231fd88616c65c92db0',
+ 'info_dict': {
+ 'id': '210222_phx_nachgehakt_corona_protest',
+ 'ext': 'mp4',
+ 'title': 'Wohin führt der Protest in der Pandemie?',
+ 'description': 'md5:7d643fe7f565e53a24aac036b2122fbd',
+ 'duration': 1691,
+ 'timestamp': 1613902500,
+ 'upload_date': '20210221',
+ 'uploader': 'Phoenix',
+ 'series': 'corona nachgehakt',
+ 'episode': 'Wohin führt der Protest in der Pandemie?',
},
- {
- 'url': 'http://www.phoenix.de/content/phoenix/die_sendungen/869815',
- 'only_matching': True,
+ }, {
+ # Youtube embed
+ 'url': 'https://www.phoenix.de/sendungen/gespraeche/phoenix-streitgut-brennglas-corona-a-1965505.html',
+ 'info_dict': {
+ 'id': 'hMQtqFYjomk',
+ 'ext': 'mp4',
+ 'title': 'phoenix streitgut: Brennglas Corona - Wie gerecht ist unsere Gesellschaft?',
+ 'description': 'md5:ac7a02e2eb3cb17600bc372e4ab28fdd',
+ 'duration': 3509,
+ 'upload_date': '20201219',
+ 'uploader': 'phoenix',
+ 'uploader_id': 'phoenix',
},
- {
- 'url': 'http://www.phoenix.de/content/phoenix/die_sendungen/diskussionen/928234',
- 'only_matching': True,
+ 'params': {
+ 'skip_download': True,
},
- ]
+ }, {
+ 'url': 'https://www.phoenix.de/entwicklungen-in-russland-a-2044720.html',
+ 'only_matching': True,
+ }, {
+ # no media
+ 'url': 'https://www.phoenix.de/sendungen/dokumentationen/mit-dem-jumbo-durch-die-nacht-a-89625.html',
+ 'only_matching': True,
+ }, {
+ # Same as https://www.zdf.de/politik/phoenix-sendungen/die-gesten-der-maechtigen-100.html
+ 'url': 'https://www.phoenix.de/sendungen/dokumentationen/gesten-der-maechtigen-i-a-89468.html?ref=suche',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
- video_id = self._match_id(url)
- webpage = self._download_webpage(url, video_id)
+ article_id = self._match_id(url)
+
+ article = self._download_json(
+ 'https://www.phoenix.de/response/id/%s' % article_id, article_id,
+ 'Downloading article JSON')
+
+ video = article['absaetze'][0]
+ title = video.get('titel') or article.get('subtitel')
+
+ if video.get('typ') == 'video-youtube':
+ video_id = video['id']
+ return self.url_result(
+ video_id, ie=YoutubeIE.ie_key(), video_id=video_id,
+ video_title=title)
+
+ video_id = compat_str(video.get('basename') or video.get('content'))
- internal_id = self._search_regex(
- r'<div class="phx_vod" id="phx_vod_([0-9]+)"',
- webpage, 'internal video ID')
+ details = self._download_json(
+ 'https://www.phoenix.de/php/mediaplayer/data/beitrags_details.php',
+ video_id, 'Downloading details JSON', query={
+ 'ak': 'web',
+ 'ptmd': 'true',
+ 'id': video_id,
+ 'profile': 'player2',
+ })
+
+ title = title or details['title']
+ content_id = details['tracking']['nielsen']['content']['assetid']
+
+ info = self._extract_ptmd(
+ 'https://tmd.phoenix.de/tmd/2/ngplayer_2_3/vod/ptmd/phoenix/%s' % content_id,
+ content_id, None, url)
+
+ duration = int_or_none(try_get(
+ details, lambda x: x['tracking']['nielsen']['content']['length']))
+ timestamp = unified_timestamp(details.get('editorialDate'))
+ series = try_get(
+ details, lambda x: x['tracking']['nielsen']['content']['program'],
+ compat_str)
+ episode = title if details.get('contentType') == 'episode' else None
+
+ thumbnails = []
+ teaser_images = try_get(details, lambda x: x['teaserImageRef']['layouts'], dict) or {}
+ for thumbnail_key, thumbnail_url in teaser_images.items():
+ thumbnail_url = urljoin(url, thumbnail_url)
+ if not thumbnail_url:
+ continue
+ thumbnail = {
+ 'url': thumbnail_url,
+ }
+ m = re.match('^([0-9]+)x([0-9]+)$', thumbnail_key)
+ if m:
+ thumbnail['width'] = int(m.group(1))
+ thumbnail['height'] = int(m.group(2))
+ thumbnails.append(thumbnail)
- api_url = 'http://www.phoenix.de/php/mediaplayer/data/beitrags_details.php?ak=web&id=%s' % internal_id
- return self.extract_from_xml_url(video_id, api_url)
+ return merge_dicts(info, {
+ 'id': content_id,
+ 'title': title,
+ 'description': details.get('leadParagraph'),
+ 'duration': duration,
+ 'thumbnails': thumbnails,
+ 'timestamp': timestamp,
+ 'uploader': details.get('tvService'),
+ 'series': series,
+ 'episode': episode,
+ })
diff --git a/youtube_dl/extractor/picarto.py b/youtube_dl/extractor/picarto.py
index 8099ef1d6..e6c51e16b 100644
--- a/youtube_dl/extractor/picarto.py
+++ b/youtube_dl/extractor/picarto.py
@@ -1,22 +1,15 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
-import time
-
from .common import InfoExtractor
-from ..compat import compat_str
from ..utils import (
ExtractorError,
js_to_json,
- try_get,
- update_url_query,
- urlencode_postdata,
)
class PicartoIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www.)?picarto\.tv/(?P<id>[a-zA-Z0-9]+)(?:/(?P<token>[a-zA-Z0-9]+))?'
+ _VALID_URL = r'https?://(?:www.)?picarto\.tv/(?P<id>[a-zA-Z0-9]+)'
_TEST = {
'url': 'https://picarto.tv/Setz',
'info_dict': {
@@ -34,65 +27,46 @@ class PicartoIE(InfoExtractor):
return False if PicartoVodIE.suitable(url) else super(PicartoIE, cls).suitable(url)
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- channel_id = mobj.group('id')
-
- metadata = self._download_json(
- 'https://api.picarto.tv/v1/channel/name/' + channel_id,
- channel_id)
-
- if metadata.get('online') is False:
+ channel_id = self._match_id(url)
+
+ data = self._download_json(
+ 'https://ptvintern.picarto.tv/ptvapi', channel_id, query={
+ 'query': '''{
+ channel(name: "%s") {
+ adult
+ id
+ online
+ stream_name
+ title
+ }
+ getLoadBalancerUrl(channel_name: "%s") {
+ url
+ }
+}''' % (channel_id, channel_id),
+ })['data']
+ metadata = data['channel']
+
+ if metadata.get('online') == 0:
raise ExtractorError('Stream is offline', expected=True)
+ title = metadata['title']
cdn_data = self._download_json(
- 'https://picarto.tv/process/channel', channel_id,
- data=urlencode_postdata({'loadbalancinginfo': channel_id}),
- note='Downloading load balancing info')
-
- token = mobj.group('token') or 'public'
- params = {
- 'con': int(time.time() * 1000),
- 'token': token,
- }
+ data['getLoadBalancerUrl']['url'] + '/stream/json_' + metadata['stream_name'] + '.js',
+ channel_id, 'Downloading load balancing info')
- prefered_edge = cdn_data.get('preferedEdge')
formats = []
-
- for edge in cdn_data['edges']:
- edge_ep = edge.get('ep')
- if not edge_ep or not isinstance(edge_ep, compat_str):
+ for source in (cdn_data.get('source') or []):
+ source_url = source.get('url')
+ if not source_url:
continue
- edge_id = edge.get('id')
- for tech in cdn_data['techs']:
- tech_label = tech.get('label')
- tech_type = tech.get('type')
- preference = 0
- if edge_id == prefered_edge:
- preference += 1
- format_id = []
- if edge_id:
- format_id.append(edge_id)
- if tech_type == 'application/x-mpegurl' or tech_label == 'HLS':
- format_id.append('hls')
- formats.extend(self._extract_m3u8_formats(
- update_url_query(
- 'https://%s/hls/%s/index.m3u8'
- % (edge_ep, channel_id), params),
- channel_id, 'mp4', preference=preference,
- m3u8_id='-'.join(format_id), fatal=False))
- continue
- elif tech_type == 'video/mp4' or tech_label == 'MP4':
- format_id.append('mp4')
- formats.append({
- 'url': update_url_query(
- 'https://%s/mp4/%s.mp4' % (edge_ep, channel_id),
- params),
- 'format_id': '-'.join(format_id),
- 'preference': preference,
- })
- else:
- # rtmp format does not seem to work
- continue
+ source_type = source.get('type')
+ if source_type == 'html5/application/vnd.apple.mpegurl':
+ formats.extend(self._extract_m3u8_formats(
+ source_url, channel_id, 'mp4', m3u8_id='hls', fatal=False))
+ elif source_type == 'html5/video/mp4':
+ formats.append({
+ 'url': source_url,
+ })
self._sort_formats(formats)
mature = metadata.get('adult')
@@ -103,10 +77,10 @@ class PicartoIE(InfoExtractor):
return {
'id': channel_id,
- 'title': self._live_title(metadata.get('title') or channel_id),
+ 'title': self._live_title(title.strip()),
'is_live': True,
- 'thumbnail': try_get(metadata, lambda x: x['thumbnails']['web']),
'channel': channel_id,
+ 'channel_id': metadata.get('id'),
'channel_url': 'https://picarto.tv/%s' % channel_id,
'age_limit': age_limit,
'formats': formats,
diff --git a/youtube_dl/extractor/pinterest.py b/youtube_dl/extractor/pinterest.py
index b249c9eda..42528d746 100644
--- a/youtube_dl/extractor/pinterest.py
+++ b/youtube_dl/extractor/pinterest.py
@@ -31,6 +31,7 @@ class PinterestBaseIE(InfoExtractor):
title = (data.get('title') or data.get('grid_title') or video_id).strip()
+ urls = []
formats = []
duration = None
if extract_formats:
@@ -38,8 +39,9 @@ class PinterestBaseIE(InfoExtractor):
if not isinstance(format_dict, dict):
continue
format_url = url_or_none(format_dict.get('url'))
- if not format_url:
+ if not format_url or format_url in urls:
continue
+ urls.append(format_url)
duration = float_or_none(format_dict.get('duration'), scale=1000)
ext = determine_ext(format_url)
if 'hls' in format_id.lower() or ext == 'm3u8':
diff --git a/youtube_dl/extractor/playstuff.py b/youtube_dl/extractor/playstuff.py
new file mode 100644
index 000000000..5a329957f
--- /dev/null
+++ b/youtube_dl/extractor/playstuff.py
@@ -0,0 +1,65 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+ smuggle_url,
+ try_get,
+)
+
+
+class PlayStuffIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?play\.stuff\.co\.nz/details/(?P<id>[^/?#&]+)'
+ _TESTS = [{
+ 'url': 'https://play.stuff.co.nz/details/608778ac1de1c4001a3fa09a',
+ 'md5': 'c82d3669e5247c64bc382577843e5bd0',
+ 'info_dict': {
+ 'id': '6250584958001',
+ 'ext': 'mp4',
+ 'title': 'Episode 1: Rotorua/Mt Maunganui/Tauranga',
+ 'description': 'md5:c154bafb9f0dd02d01fd4100fb1c1913',
+ 'uploader_id': '6005208634001',
+ 'timestamp': 1619491027,
+ 'upload_date': '20210427',
+ },
+ 'add_ie': ['BrightcoveNew'],
+ }, {
+ # geo restricted, bypassable
+ 'url': 'https://play.stuff.co.nz/details/_6155660351001',
+ 'only_matching': True,
+ }]
+ BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_default/index.html?videoId=%s'
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ state = self._parse_json(
+ self._search_regex(
+ r'__INITIAL_STATE__\s*=\s*({.+?})\s*;', webpage, 'state'),
+ video_id)
+
+ account_id = try_get(
+ state, lambda x: x['configurations']['accountId'],
+ compat_str) or '6005208634001'
+ player_id = try_get(
+ state, lambda x: x['configurations']['playerId'],
+ compat_str) or 'default'
+
+ entries = []
+ for item_id, video in state['items'].items():
+ if not isinstance(video, dict):
+ continue
+ asset_id = try_get(
+ video, lambda x: x['content']['attributes']['assetId'],
+ compat_str)
+ if not asset_id:
+ continue
+ entries.append(self.url_result(
+ smuggle_url(
+ self.BRIGHTCOVE_URL_TEMPLATE % (account_id, player_id, asset_id),
+ {'geo_countries': ['NZ']}),
+ 'BrightcoveNew', video_id))
+
+ return self.playlist_result(entries, video_id)
diff --git a/youtube_dl/extractor/pluralsight.py b/youtube_dl/extractor/pluralsight.py
index abd08bc28..2d63855df 100644
--- a/youtube_dl/extractor/pluralsight.py
+++ b/youtube_dl/extractor/pluralsight.py
@@ -393,7 +393,7 @@ query viewClip {
# To somewhat reduce the probability of these consequences
# we will sleep random amount of time before each call to ViewClip.
self._sleep(
- random.randint(2, 5), display_id,
+ random.randint(5, 10), display_id,
'%(video_id)s: Waiting for %(timeout)s seconds to avoid throttling')
if not viewclip:
diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py
index 2fcbd186f..e2e1500ff 100644
--- a/youtube_dl/extractor/pornhub.py
+++ b/youtube_dl/extractor/pornhub.py
@@ -22,11 +22,16 @@ from ..utils import (
orderedSet,
remove_quotes,
str_to_int,
+ update_url_query,
+ urlencode_postdata,
url_or_none,
)
class PornHubBaseIE(InfoExtractor):
+ _NETRC_MACHINE = 'pornhub'
+ _PORNHUB_HOST_RE = r'(?:(?P<host>pornhub(?:premium)?\.(?:com|net|org))|pornhubthbh7ap3u\.onion)'
+
def _download_webpage_handle(self, *args, **kwargs):
def dl(*args, **kwargs):
return super(PornHubBaseIE, self)._download_webpage_handle(*args, **kwargs)
@@ -52,17 +57,79 @@ class PornHubBaseIE(InfoExtractor):
return webpage, urlh
+ def _real_initialize(self):
+ self._logged_in = False
+
+ def _login(self, host):
+ if self._logged_in:
+ return
+
+ site = host.split('.')[0]
+
+ # Both sites pornhub and pornhubpremium have separate accounts
+ # so there should be an option to provide credentials for both.
+ # At the same time some videos are available under the same video id
+ # on both sites so that we have to identify them as the same video.
+ # For that purpose we have to keep both in the same extractor
+ # but under different netrc machines.
+ username, password = self._get_login_info(netrc_machine=site)
+ if username is None:
+ return
+
+ login_url = 'https://www.%s/%slogin' % (host, 'premium/' if 'premium' in host else '')
+ login_page = self._download_webpage(
+ login_url, None, 'Downloading %s login page' % site)
+
+ def is_logged(webpage):
+ return any(re.search(p, webpage) for p in (
+ r'class=["\']signOut',
+ r'>Sign\s+[Oo]ut\s*<'))
+
+ if is_logged(login_page):
+ self._logged_in = True
+ return
+
+ login_form = self._hidden_inputs(login_page)
+
+ login_form.update({
+ 'username': username,
+ 'password': password,
+ })
+
+ response = self._download_json(
+ 'https://www.%s/front/authenticate' % host, None,
+ 'Logging in to %s' % site,
+ data=urlencode_postdata(login_form),
+ headers={
+ 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
+ 'Referer': login_url,
+ 'X-Requested-With': 'XMLHttpRequest',
+ })
+
+ if response.get('success') == '1':
+ self._logged_in = True
+ return
+
+ message = response.get('message')
+ if message is not None:
+ raise ExtractorError(
+ 'Unable to login: %s' % message, expected=True)
+
+ raise ExtractorError('Unable to log in')
+
class PornHubIE(PornHubBaseIE):
IE_DESC = 'PornHub and Thumbzilla'
_VALID_URL = r'''(?x)
https?://
(?:
- (?:[^/]+\.)?(?P<host>pornhub(?:premium)?\.(?:com|net|org))/(?:(?:view_video\.php|video/show)\?viewkey=|embed/)|
+ (?:[^/]+\.)?
+ %s
+ /(?:(?:view_video\.php|video/show)\?viewkey=|embed/)|
(?:www\.)?thumbzilla\.com/video/
)
(?P<id>[\da-z]+)
- '''
+ ''' % PornHubBaseIE._PORNHUB_HOST_RE
_TESTS = [{
'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015',
'md5': 'a6391306d050e4547f62b3f485dd9ba9',
@@ -103,6 +170,7 @@ class PornHubIE(PornHubBaseIE):
'params': {
'skip_download': True,
},
+ 'skip': 'Video has been flagged for verification in accordance with our trust and safety policy',
}, {
# subtitles
'url': 'https://www.pornhub.com/view_video.php?viewkey=ph5af5fef7c2aa7',
@@ -163,12 +231,27 @@ class PornHubIE(PornHubBaseIE):
}, {
'url': 'https://www.pornhubpremium.com/view_video.php?viewkey=ph5e4acdae54a82',
'only_matching': True,
+ }, {
+ # Some videos are available with the same id on both premium
+ # and non-premium sites (e.g. this and the following test)
+ 'url': 'https://www.pornhub.com/view_video.php?viewkey=ph5f75b0f4b18e3',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.pornhubpremium.com/view_video.php?viewkey=ph5f75b0f4b18e3',
+ 'only_matching': True,
+ }, {
+ # geo restricted
+ 'url': 'https://www.pornhub.com/view_video.php?viewkey=ph5a9813bfa7156',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://pornhubthbh7ap3u.onion/view_video.php?viewkey=ph5a9813bfa7156',
+ 'only_matching': True,
}]
@staticmethod
def _extract_urls(webpage):
return re.findall(
- r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?pornhub\.(?:com|net|org)/embed/[\da-z]+)',
+ r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?pornhub(?:premium)?\.(?:com|net|org)/embed/[\da-z]+)',
webpage)
def _extract_count(self, pattern, webpage, name):
@@ -180,12 +263,7 @@ class PornHubIE(PornHubBaseIE):
host = mobj.group('host') or 'pornhub.com'
video_id = mobj.group('id')
- if 'premium' in host:
- if not self._downloader.params.get('cookiefile'):
- raise ExtractorError(
- 'PornHub Premium requires authentication.'
- ' You may want to use --cookies.',
- expected=True)
+ self._login(host)
self._set_cookie(host, 'age_verified', '1')
@@ -198,7 +276,8 @@ class PornHubIE(PornHubBaseIE):
webpage = dl_webpage('pc')
error_msg = self._html_search_regex(
- r'(?s)<div[^>]+class=(["\'])(?:(?!\1).)*\b(?:removed|userMessageSection)\b(?:(?!\1).)*\1[^>]*>(?P<error>.+?)</div>',
+ (r'(?s)<div[^>]+class=(["\'])(?:(?!\1).)*\b(?:removed|userMessageSection)\b(?:(?!\1).)*\1[^>]*>(?P<error>.+?)</div>',
+ r'(?s)<section[^>]+class=["\']noVideo["\'][^>]*>(?P<error>.+?)</section>'),
webpage, 'error message', default=None, group='error')
if error_msg:
error_msg = re.sub(r'\s+', ' ', error_msg)
@@ -206,6 +285,11 @@ class PornHubIE(PornHubBaseIE):
'PornHub said: %s' % error_msg,
expected=True, video_id=video_id)
+ if any(re.search(p, webpage) for p in (
+ r'class=["\']geoBlocked["\']',
+ r'>\s*This content is unavailable in your country')):
+ self.raise_geo_restricted()
+
# video_title from flashvars contains whitespace instead of non-ASCII (see
# http://www.pornhub.com/view_video.php?viewkey=1331683002), not relying
# on that anymore.
@@ -327,35 +411,49 @@ class PornHubIE(PornHubBaseIE):
upload_date = None
formats = []
- for video_url, height in video_urls:
- if not upload_date:
- upload_date = self._search_regex(
- r'/(\d{6}/\d{2})/', video_url, 'upload data', default=None)
- if upload_date:
- upload_date = upload_date.replace('/', '')
- ext = determine_ext(video_url)
+
+ def add_format(format_url, height=None):
+ ext = determine_ext(format_url)
if ext == 'mpd':
formats.extend(self._extract_mpd_formats(
- video_url, video_id, mpd_id='dash', fatal=False))
- continue
- elif ext == 'm3u8':
+ format_url, video_id, mpd_id='dash', fatal=False))
+ return
+ if ext == 'm3u8':
formats.extend(self._extract_m3u8_formats(
- video_url, video_id, 'mp4', entry_protocol='m3u8_native',
+ format_url, video_id, 'mp4', entry_protocol='m3u8_native',
m3u8_id='hls', fatal=False))
- continue
- tbr = None
- mobj = re.search(r'(?P<height>\d+)[pP]?_(?P<tbr>\d+)[kK]', video_url)
- if mobj:
- if not height:
- height = int(mobj.group('height'))
- tbr = int(mobj.group('tbr'))
+ return
+ if not height:
+ height = int_or_none(self._search_regex(
+ r'(?P<height>\d+)[pP]?_\d+[kK]', format_url, 'height',
+ default=None))
formats.append({
- 'url': video_url,
+ 'url': format_url,
'format_id': '%dp' % height if height else None,
'height': height,
- 'tbr': tbr,
})
- self._sort_formats(formats)
+
+ for video_url, height in video_urls:
+ if not upload_date:
+ upload_date = self._search_regex(
+ r'/(\d{6}/\d{2})/', video_url, 'upload data', default=None)
+ if upload_date:
+ upload_date = upload_date.replace('/', '')
+ if '/video/get_media' in video_url:
+ medias = self._download_json(video_url, video_id, fatal=False)
+ if isinstance(medias, list):
+ for media in medias:
+ if not isinstance(media, dict):
+ continue
+ video_url = url_or_none(media.get('videoUrl'))
+ if not video_url:
+ continue
+ height = int_or_none(media.get('quality'))
+ add_format(video_url, height)
+ continue
+ add_format(video_url)
+ self._sort_formats(
+ formats, field_preference=('height', 'width', 'fps', 'format_id'))
video_uploader = self._html_search_regex(
r'(?s)From:&nbsp;.+?<(?:a\b[^>]+\bhref=["\']/(?:(?:user|channel)s|model|pornstar)/|span\b[^>]+\bclass=["\']username)[^>]+>(.+?)<',
@@ -405,6 +503,10 @@ class PornHubIE(PornHubBaseIE):
class PornHubPlaylistBaseIE(PornHubBaseIE):
+ def _extract_page(self, url):
+ return int_or_none(self._search_regex(
+ r'\bpage=(\d+)', url, 'page', default=None))
+
def _extract_entries(self, webpage, host):
# Only process container div with main playlist content skipping
# drop-down menu that uses similar pattern for videos (see
@@ -422,29 +524,9 @@ class PornHubPlaylistBaseIE(PornHubBaseIE):
container))
]
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- host = mobj.group('host')
- playlist_id = mobj.group('id')
-
- webpage = self._download_webpage(url, playlist_id)
-
- entries = self._extract_entries(webpage, host)
-
- playlist = self._parse_json(
- self._search_regex(
- r'(?:playlistObject|PLAYLIST_VIEW)\s*=\s*({.+?});', webpage,
- 'playlist', default='{}'),
- playlist_id, fatal=False)
- title = playlist.get('title') or self._search_regex(
- r'>Videos\s+in\s+(.+?)\s+[Pp]laylist<', webpage, 'title', fatal=False)
-
- return self.playlist_result(
- entries, playlist_id, title, playlist.get('description'))
-
class PornHubUserIE(PornHubPlaylistBaseIE):
- _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?(?P<host>pornhub(?:premium)?\.(?:com|net|org))/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/?#&]+))(?:[?#&]|/(?!videos)|$)'
+ _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?%s/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/?#&]+))(?:[?#&]|/(?!videos)|$)' % PornHubBaseIE._PORNHUB_HOST_RE
_TESTS = [{
'url': 'https://www.pornhub.com/model/zoe_ph',
'playlist_mincount': 118,
@@ -463,14 +545,30 @@ class PornHubUserIE(PornHubPlaylistBaseIE):
}, {
'url': 'https://www.pornhub.com/model/zoe_ph?abc=1',
'only_matching': True,
+ }, {
+ # Unavailable via /videos page, but available with direct pagination
+ # on pornstar page (see [1]), requires premium
+ # 1. https://github.com/ytdl-org/youtube-dl/issues/27853
+ 'url': 'https://www.pornhubpremium.com/pornstar/sienna-west',
+ 'only_matching': True,
+ }, {
+ # Same as before, multi page
+ 'url': 'https://www.pornhubpremium.com/pornstar/lily-labeau',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://pornhubthbh7ap3u.onion/model/zoe_ph',
+ 'only_matching': True,
}]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
user_id = mobj.group('id')
+ videos_url = '%s/videos' % mobj.group('url')
+ page = self._extract_page(url)
+ if page:
+ videos_url = update_url_query(videos_url, {'page': page})
return self.url_result(
- '%s/videos' % mobj.group('url'), ie=PornHubPagedVideoListIE.ie_key(),
- video_id=user_id)
+ videos_url, ie=PornHubPagedVideoListIE.ie_key(), video_id=user_id)
class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE):
@@ -483,36 +581,59 @@ class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE):
<button[^>]+\bid=["\']moreDataBtn
''', webpage) is not None
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- host = mobj.group('host')
- item_id = mobj.group('id')
+ def _entries(self, url, host, item_id):
+ page = self._extract_page(url)
- page = int_or_none(self._search_regex(
- r'\bpage=(\d+)', url, 'page', default=None))
+ VIDEOS = '/videos'
- entries = []
- for page_num in (page, ) if page is not None else itertools.count(1):
+ def download_page(base_url, num, fallback=False):
+ note = 'Downloading page %d%s' % (num, ' (switch to fallback)' if fallback else '')
+ return self._download_webpage(
+ base_url, item_id, note, query={'page': num})
+
+ def is_404(e):
+ return isinstance(e.cause, compat_HTTPError) and e.cause.code == 404
+
+ base_url = url
+ has_page = page is not None
+ first_page = page if has_page else 1
+ for page_num in (first_page, ) if has_page else itertools.count(first_page):
try:
- webpage = self._download_webpage(
- url, item_id, 'Downloading page %d' % page_num,
- query={'page': page_num})
+ try:
+ webpage = download_page(base_url, page_num)
+ except ExtractorError as e:
+ # Some sources may not be available via /videos page,
+ # trying to fallback to main page pagination (see [1])
+ # 1. https://github.com/ytdl-org/youtube-dl/issues/27853
+ if is_404(e) and page_num == first_page and VIDEOS in base_url:
+ base_url = base_url.replace(VIDEOS, '')
+ webpage = download_page(base_url, page_num, fallback=True)
+ else:
+ raise
except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404:
+ if is_404(e) and page_num != first_page:
break
raise
page_entries = self._extract_entries(webpage, host)
if not page_entries:
break
- entries.extend(page_entries)
+ for e in page_entries:
+ yield e
if not self._has_more(webpage):
break
- return self.playlist_result(orderedSet(entries), item_id)
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ host = mobj.group('host')
+ item_id = mobj.group('id')
+
+ self._login(host)
+
+ return self.playlist_result(self._entries(url, host, item_id), item_id)
class PornHubPagedVideoListIE(PornHubPagedPlaylistBaseIE):
- _VALID_URL = r'https?://(?:[^/]+\.)?(?P<host>pornhub(?:premium)?\.(?:com|net|org))/(?P<id>(?:[^/]+/)*[^/?#&]+)'
+ _VALID_URL = r'https?://(?:[^/]+\.)?%s/(?P<id>(?:[^/]+/)*[^/?#&]+)' % PornHubBaseIE._PORNHUB_HOST_RE
_TESTS = [{
'url': 'https://www.pornhub.com/model/zoe_ph/videos',
'only_matching': True,
@@ -617,6 +738,9 @@ class PornHubPagedVideoListIE(PornHubPagedPlaylistBaseIE):
}, {
'url': 'https://de.pornhub.com/playlist/4667351',
'only_matching': True,
+ }, {
+ 'url': 'https://pornhubthbh7ap3u.onion/model/zoe_ph/videos',
+ 'only_matching': True,
}]
@classmethod
@@ -627,7 +751,7 @@ class PornHubPagedVideoListIE(PornHubPagedPlaylistBaseIE):
class PornHubUserVideosUploadIE(PornHubPagedPlaylistBaseIE):
- _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?(?P<host>pornhub(?:premium)?\.(?:com|net|org))/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/]+)/videos/upload)'
+ _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?%s/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/]+)/videos/upload)' % PornHubBaseIE._PORNHUB_HOST_RE
_TESTS = [{
'url': 'https://www.pornhub.com/pornstar/jenny-blighe/videos/upload',
'info_dict': {
@@ -637,4 +761,7 @@ class PornHubUserVideosUploadIE(PornHubPagedPlaylistBaseIE):
}, {
'url': 'https://www.pornhub.com/model/zoe_ph/videos/upload',
'only_matching': True,
+ }, {
+ 'url': 'http://pornhubthbh7ap3u.onion/pornstar/jenny-blighe/videos/upload',
+ 'only_matching': True,
}]
diff --git a/youtube_dl/extractor/rds.py b/youtube_dl/extractor/rds.py
index 8c016a77d..0c497856e 100644
--- a/youtube_dl/extractor/rds.py
+++ b/youtube_dl/extractor/rds.py
@@ -15,17 +15,17 @@ class RDSIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?rds\.ca/vid(?:[eé]|%C3%A9)os/(?:[^/]+/)*(?P<id>[^/]+)-\d+\.\d+'
_TESTS = [{
- 'url': 'http://www.rds.ca/videos/football/nfl/fowler-jr-prend-la-direction-de-jacksonville-3.1132799',
+ # has two 9c9media ContentPackages, the web player selects the first ContentPackage
+ 'url': 'https://www.rds.ca/videos/Hockey/NationalHockeyLeague/teams/9/forum-du-5-a-7-jesperi-kotkaniemi-de-retour-de-finlande-3.1377606',
'info_dict': {
- 'id': '604333',
- 'display_id': 'fowler-jr-prend-la-direction-de-jacksonville',
+ 'id': '2083309',
+ 'display_id': 'forum-du-5-a-7-jesperi-kotkaniemi-de-retour-de-finlande',
'ext': 'flv',
- 'title': 'Fowler Jr. prend la direction de Jacksonville',
- 'description': 'Dante Fowler Jr. est le troisième choix du repêchage 2015 de la NFL. ',
- 'timestamp': 1430397346,
- 'upload_date': '20150430',
- 'duration': 154.354,
- 'age_limit': 0,
+ 'title': 'Forum du 5 à 7 : Kotkaniemi de retour de Finlande',
+ 'description': 'md5:83fa38ecc4a79b19e433433254077f25',
+ 'timestamp': 1606129030,
+ 'upload_date': '20201123',
+ 'duration': 773.039,
}
}, {
'url': 'http://www.rds.ca/vid%C3%A9os/un-voyage-positif-3.877934',
diff --git a/youtube_dl/extractor/redbulltv.py b/youtube_dl/extractor/redbulltv.py
index 3aae79f5d..6d000b372 100644
--- a/youtube_dl/extractor/redbulltv.py
+++ b/youtube_dl/extractor/redbulltv.py
@@ -133,8 +133,10 @@ class RedBullEmbedIE(RedBullTVIE):
rrn_id = self._match_id(url)
asset_id = self._download_json(
'https://edge-graphql.crepo-production.redbullaws.com/v1/graphql',
- rrn_id, headers={'API-KEY': 'e90a1ff11335423998b100c929ecc866'},
- query={
+ rrn_id, headers={
+ 'Accept': 'application/json',
+ 'API-KEY': 'e90a1ff11335423998b100c929ecc866',
+ }, query={
'query': '''{
resource(id: "%s", enforceGeoBlocking: false) {
%s
diff --git a/youtube_dl/extractor/rts.py b/youtube_dl/extractor/rts.py
index 48f17b828..aed35f8a9 100644
--- a/youtube_dl/extractor/rts.py
+++ b/youtube_dl/extractor/rts.py
@@ -6,11 +6,12 @@ import re
from .srgssr import SRGSSRIE
from ..compat import compat_str
from ..utils import (
+ determine_ext,
int_or_none,
parse_duration,
parse_iso8601,
unescapeHTML,
- determine_ext,
+ urljoin,
)
@@ -21,7 +22,7 @@ class RTSIE(SRGSSRIE):
_TESTS = [
{
'url': 'http://www.rts.ch/archives/tv/divers/3449373-les-enfants-terribles.html',
- 'md5': 'ff7f8450a90cf58dacb64e29707b4a8e',
+ 'md5': '753b877968ad8afaeddccc374d4256a5',
'info_dict': {
'id': '3449373',
'display_id': 'les-enfants-terribles',
@@ -35,6 +36,7 @@ class RTSIE(SRGSSRIE):
'thumbnail': r're:^https?://.*\.image',
'view_count': int,
},
+ 'expected_warnings': ['Unable to download f4m manifest', 'Failed to download m3u8 information'],
},
{
'url': 'http://www.rts.ch/emissions/passe-moi-les-jumelles/5624067-entre-ciel-et-mer.html',
@@ -63,11 +65,12 @@ class RTSIE(SRGSSRIE):
# m3u8 download
'skip_download': True,
},
+ 'expected_warnings': ['Unable to download f4m manifest', 'Failed to download m3u8 information'],
'skip': 'Blocked outside Switzerland',
},
{
'url': 'http://www.rts.ch/video/info/journal-continu/5745356-londres-cachee-par-un-epais-smog.html',
- 'md5': '1bae984fe7b1f78e94abc74e802ed99f',
+ 'md5': '9bb06503773c07ce83d3cbd793cebb91',
'info_dict': {
'id': '5745356',
'display_id': 'londres-cachee-par-un-epais-smog',
@@ -81,6 +84,7 @@ class RTSIE(SRGSSRIE):
'thumbnail': r're:^https?://.*\.image',
'view_count': int,
},
+ 'expected_warnings': ['Unable to download f4m manifest', 'Failed to download m3u8 information'],
},
{
'url': 'http://www.rts.ch/audio/couleur3/programmes/la-belle-video-de-stephane-laurenceau/5706148-urban-hippie-de-damien-krisl-03-04-2014.html',
@@ -160,7 +164,7 @@ class RTSIE(SRGSSRIE):
media_type = 'video' if 'video' in all_info else 'audio'
# check for errors
- self.get_media_data('rts', media_type, media_id)
+ self._get_media_data('rts', media_type, media_id)
info = all_info['video']['JSONinfo'] if 'video' in all_info else all_info['audio']
@@ -194,6 +198,7 @@ class RTSIE(SRGSSRIE):
'tbr': extract_bitrate(format_url),
})
+ download_base = 'http://rtsww%s-d.rts.ch/' % ('-a' if media_type == 'audio' else '')
for media in info.get('media', []):
media_url = media.get('url')
if not media_url or re.match(r'https?://', media_url):
@@ -205,7 +210,7 @@ class RTSIE(SRGSSRIE):
format_id += '-%dk' % rate
formats.append({
'format_id': format_id,
- 'url': 'http://download-video.rts.ch/' + media_url,
+ 'url': urljoin(download_base, media_url),
'tbr': rate or extract_bitrate(media_url),
})
diff --git a/youtube_dl/extractor/rtve.py b/youtube_dl/extractor/rtve.py
index ce9db0629..d2fb754cf 100644
--- a/youtube_dl/extractor/rtve.py
+++ b/youtube_dl/extractor/rtve.py
@@ -2,8 +2,9 @@
from __future__ import unicode_literals
import base64
+import io
import re
-import time
+import sys
from .common import InfoExtractor
from ..compat import (
@@ -14,56 +15,13 @@ from ..utils import (
determine_ext,
ExtractorError,
float_or_none,
+ qualities,
remove_end,
remove_start,
- sanitized_Request,
std_headers,
)
-
-def _decrypt_url(png):
- encrypted_data = compat_b64decode(png)
- text_index = encrypted_data.find(b'tEXt')
- text_chunk = encrypted_data[text_index - 4:]
- length = compat_struct_unpack('!I', text_chunk[:4])[0]
- # Use bytearray to get integers when iterating in both python 2.x and 3.x
- data = bytearray(text_chunk[8:8 + length])
- data = [chr(b) for b in data if b != 0]
- hash_index = data.index('#')
- alphabet_data = data[:hash_index]
- url_data = data[hash_index + 1:]
- if url_data[0] == 'H' and url_data[3] == '%':
- # remove useless HQ%% at the start
- url_data = url_data[4:]
-
- alphabet = []
- e = 0
- d = 0
- for l in alphabet_data:
- if d == 0:
- alphabet.append(l)
- d = e = (e + 1) % 4
- else:
- d -= 1
- url = ''
- f = 0
- e = 3
- b = 1
- for letter in url_data:
- if f == 0:
- l = int(letter) * 10
- f = 1
- else:
- if e == 0:
- l += int(letter)
- url += alphabet[l]
- e = (b + 3) % 4
- f = 0
- b += 1
- else:
- e -= 1
-
- return url
+_bytes_to_chr = (lambda x: x) if sys.version_info[0] == 2 else (lambda x: map(chr, x))
class RTVEALaCartaIE(InfoExtractor):
@@ -79,28 +37,31 @@ class RTVEALaCartaIE(InfoExtractor):
'ext': 'mp4',
'title': 'Balonmano - Swiss Cup masculina. Final: España-Suecia',
'duration': 5024.566,
+ 'series': 'Balonmano',
},
+ 'expected_warnings': ['Failed to download MPD manifest', 'Failed to download m3u8 information'],
}, {
'note': 'Live stream',
'url': 'http://www.rtve.es/alacarta/videos/television/24h-live/1694255/',
'info_dict': {
'id': '1694255',
- 'ext': 'flv',
- 'title': 'TODO',
+ 'ext': 'mp4',
+ 'title': 're:^24H LIVE [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+ 'is_live': True,
+ },
+ 'params': {
+ 'skip_download': 'live stream',
},
- 'skip': 'The f4m manifest can\'t be used yet',
}, {
'url': 'http://www.rtve.es/alacarta/videos/servir-y-proteger/servir-proteger-capitulo-104/4236788/',
- 'md5': 'e55e162379ad587e9640eda4f7353c0f',
+ 'md5': 'd850f3c8731ea53952ebab489cf81cbf',
'info_dict': {
'id': '4236788',
'ext': 'mp4',
- 'title': 'Servir y proteger - Capítulo 104 ',
+ 'title': 'Servir y proteger - Capítulo 104',
'duration': 3222.0,
},
- 'params': {
- 'skip_download': True, # requires ffmpeg
- },
+ 'expected_warnings': ['Failed to download MPD manifest', 'Failed to download m3u8 information'],
}, {
'url': 'http://www.rtve.es/m/alacarta/videos/cuentame-como-paso/cuentame-como-paso-t16-ultimo-minuto-nuestra-vida-capitulo-276/2969138/?media=tve',
'only_matching': True,
@@ -111,58 +72,102 @@ class RTVEALaCartaIE(InfoExtractor):
def _real_initialize(self):
user_agent_b64 = base64.b64encode(std_headers['User-Agent'].encode('utf-8')).decode('utf-8')
- manager_info = self._download_json(
+ self._manager = self._download_json(
'http://www.rtve.es/odin/loki/' + user_agent_b64,
- None, 'Fetching manager info')
- self._manager = manager_info['manager']
+ None, 'Fetching manager info')['manager']
+
+ @staticmethod
+ def _decrypt_url(png):
+ encrypted_data = io.BytesIO(compat_b64decode(png)[8:])
+ while True:
+ length = compat_struct_unpack('!I', encrypted_data.read(4))[0]
+ chunk_type = encrypted_data.read(4)
+ if chunk_type == b'IEND':
+ break
+ data = encrypted_data.read(length)
+ if chunk_type == b'tEXt':
+ alphabet_data, text = data.split(b'\0')
+ quality, url_data = text.split(b'%%')
+ alphabet = []
+ e = 0
+ d = 0
+ for l in _bytes_to_chr(alphabet_data):
+ if d == 0:
+ alphabet.append(l)
+ d = e = (e + 1) % 4
+ else:
+ d -= 1
+ url = ''
+ f = 0
+ e = 3
+ b = 1
+ for letter in _bytes_to_chr(url_data):
+ if f == 0:
+ l = int(letter) * 10
+ f = 1
+ else:
+ if e == 0:
+ l += int(letter)
+ url += alphabet[l]
+ e = (b + 3) % 4
+ f = 0
+ b += 1
+ else:
+ e -= 1
+
+ yield quality.decode(), url
+ encrypted_data.read(4) # CRC
+
+ def _extract_png_formats(self, video_id):
+ png = self._download_webpage(
+ 'http://www.rtve.es/ztnr/movil/thumbnail/%s/videos/%s.png' % (self._manager, video_id),
+ video_id, 'Downloading url information', query={'q': 'v2'})
+ q = qualities(['Media', 'Alta', 'HQ', 'HD_READY', 'HD_FULL'])
+ formats = []
+ for quality, video_url in self._decrypt_url(png):
+ ext = determine_ext(video_url)
+ if ext == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ video_url, video_id, 'mp4', 'm3u8_native',
+ m3u8_id='hls', fatal=False))
+ elif ext == 'mpd':
+ formats.extend(self._extract_mpd_formats(
+ video_url, video_id, 'dash', fatal=False))
+ else:
+ formats.append({
+ 'format_id': quality,
+ 'quality': q(quality),
+ 'url': video_url,
+ })
+ self._sort_formats(formats)
+ return formats
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
info = self._download_json(
'http://www.rtve.es/api/videos/%s/config/alacarta_videos.json' % video_id,
video_id)['page']['items'][0]
if info['state'] == 'DESPU':
raise ExtractorError('The video is no longer available', expected=True)
- title = info['title']
- png_url = 'http://www.rtve.es/ztnr/movil/thumbnail/%s/videos/%s.png' % (self._manager, video_id)
- png_request = sanitized_Request(png_url)
- png_request.add_header('Referer', url)
- png = self._download_webpage(png_request, video_id, 'Downloading url information')
- video_url = _decrypt_url(png)
- ext = determine_ext(video_url)
-
- formats = []
- if not video_url.endswith('.f4m') and ext != 'm3u8':
- if '?' not in video_url:
- video_url = video_url.replace('resources/', 'auth/resources/')
- video_url = video_url.replace('.net.rtve', '.multimedia.cdn.rtve')
-
- if ext == 'm3u8':
- formats.extend(self._extract_m3u8_formats(
- video_url, video_id, ext='mp4', entry_protocol='m3u8_native',
- m3u8_id='hls', fatal=False))
- elif ext == 'f4m':
- formats.extend(self._extract_f4m_formats(
- video_url, video_id, f4m_id='hds', fatal=False))
- else:
- formats.append({
- 'url': video_url,
- })
- self._sort_formats(formats)
+ title = info['title'].strip()
+ formats = self._extract_png_formats(video_id)
subtitles = None
- if info.get('sbtFile') is not None:
- subtitles = self.extract_subtitles(video_id, info['sbtFile'])
+ sbt_file = info.get('sbtFile')
+ if sbt_file:
+ subtitles = self.extract_subtitles(video_id, sbt_file)
+
+ is_live = info.get('live') is True
return {
'id': video_id,
- 'title': title,
+ 'title': self._live_title(title) if is_live else title,
'formats': formats,
'thumbnail': info.get('image'),
- 'page_url': url,
'subtitles': subtitles,
- 'duration': float_or_none(info.get('duration'), scale=1000),
+ 'duration': float_or_none(info.get('duration'), 1000),
+ 'is_live': is_live,
+ 'series': info.get('programTitle'),
}
def _get_subtitles(self, video_id, sub_file):
@@ -174,48 +179,26 @@ class RTVEALaCartaIE(InfoExtractor):
for s in subs)
-class RTVEInfantilIE(InfoExtractor):
+class RTVEInfantilIE(RTVEALaCartaIE):
IE_NAME = 'rtve.es:infantil'
IE_DESC = 'RTVE infantil'
- _VALID_URL = r'https?://(?:www\.)?rtve\.es/infantil/serie/(?P<show>[^/]*)/video/(?P<short_title>[^/]*)/(?P<id>[0-9]+)/'
+ _VALID_URL = r'https?://(?:www\.)?rtve\.es/infantil/serie/[^/]+/video/[^/]+/(?P<id>[0-9]+)/'
_TESTS = [{
'url': 'http://www.rtve.es/infantil/serie/cleo/video/maneras-vivir/3040283/',
- 'md5': '915319587b33720b8e0357caaa6617e6',
+ 'md5': '5747454717aedf9f9fdf212d1bcfc48d',
'info_dict': {
'id': '3040283',
'ext': 'mp4',
'title': 'Maneras de vivir',
- 'thumbnail': 'http://www.rtve.es/resources/jpg/6/5/1426182947956.JPG',
+ 'thumbnail': r're:https?://.+/1426182947956\.JPG',
'duration': 357.958,
},
+ 'expected_warnings': ['Failed to download MPD manifest', 'Failed to download m3u8 information'],
}]
- def _real_extract(self, url):
- video_id = self._match_id(url)
- info = self._download_json(
- 'http://www.rtve.es/api/videos/%s/config/alacarta_videos.json' % video_id,
- video_id)['page']['items'][0]
-
- webpage = self._download_webpage(url, video_id)
- vidplayer_id = self._search_regex(
- r' id="vidplayer([0-9]+)"', webpage, 'internal video ID')
- png_url = 'http://www.rtve.es/ztnr/movil/thumbnail/default/videos/%s.png' % vidplayer_id
- png = self._download_webpage(png_url, video_id, 'Downloading url information')
- video_url = _decrypt_url(png)
-
- return {
- 'id': video_id,
- 'ext': 'mp4',
- 'title': info['title'],
- 'url': video_url,
- 'thumbnail': info.get('image'),
- 'duration': float_or_none(info.get('duration'), scale=1000),
- }
-
-
-class RTVELiveIE(InfoExtractor):
+class RTVELiveIE(RTVEALaCartaIE):
IE_NAME = 'rtve.es:live'
IE_DESC = 'RTVE.es live streams'
_VALID_URL = r'https?://(?:www\.)?rtve\.es/directo/(?P<id>[a-zA-Z0-9-]+)'
@@ -225,7 +208,7 @@ class RTVELiveIE(InfoExtractor):
'info_dict': {
'id': 'la-1',
'ext': 'mp4',
- 'title': 're:^La 1 [0-9]{4}-[0-9]{2}-[0-9]{2}Z[0-9]{6}$',
+ 'title': 're:^La 1 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
},
'params': {
'skip_download': 'live stream',
@@ -234,29 +217,22 @@ class RTVELiveIE(InfoExtractor):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- start_time = time.gmtime()
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
title = remove_end(self._og_search_title(webpage), ' en directo en RTVE.es')
title = remove_start(title, 'Estoy viendo ')
- title += ' ' + time.strftime('%Y-%m-%dZ%H%M%S', start_time)
vidplayer_id = self._search_regex(
(r'playerId=player([0-9]+)',
r'class=["\'].*?\blive_mod\b.*?["\'][^>]+data-assetid=["\'](\d+)',
r'data-id=["\'](\d+)'),
webpage, 'internal video ID')
- png_url = 'http://www.rtve.es/ztnr/movil/thumbnail/amonet/videos/%s.png' % vidplayer_id
- png = self._download_webpage(png_url, video_id, 'Downloading url information')
- m3u8_url = _decrypt_url(png)
- formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4')
- self._sort_formats(formats)
return {
'id': video_id,
- 'title': title,
- 'formats': formats,
+ 'title': self._live_title(title),
+ 'formats': self._extract_png_formats(vidplayer_id),
'is_live': True,
}
diff --git a/youtube_dl/extractor/samplefocus.py b/youtube_dl/extractor/samplefocus.py
new file mode 100644
index 000000000..806c3c354
--- /dev/null
+++ b/youtube_dl/extractor/samplefocus.py
@@ -0,0 +1,100 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ extract_attributes,
+ get_element_by_attribute,
+ int_or_none,
+)
+
+
+class SampleFocusIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?samplefocus\.com/samples/(?P<id>[^/?&#]+)'
+ _TESTS = [{
+ 'url': 'https://samplefocus.com/samples/lil-peep-sad-emo-guitar',
+ 'md5': '48c8d62d60be467293912e0e619a5120',
+ 'info_dict': {
+ 'id': '40316',
+ 'display_id': 'lil-peep-sad-emo-guitar',
+ 'ext': 'mp3',
+ 'title': 'Lil Peep Sad Emo Guitar',
+ 'thumbnail': r're:^https?://.+\.png',
+ 'license': 'Standard License',
+ 'uploader': 'CapsCtrl',
+ 'uploader_id': 'capsctrl',
+ 'like_count': int,
+ 'comment_count': int,
+ 'categories': ['Samples', 'Guitar', 'Electric guitar'],
+ },
+ }, {
+ 'url': 'https://samplefocus.com/samples/dababy-style-bass-808',
+ 'only_matching': True
+ }, {
+ 'url': 'https://samplefocus.com/samples/young-chop-kick',
+ 'only_matching': True
+ }]
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
+
+ sample_id = self._search_regex(
+ r'<input[^>]+id=(["\'])sample_id\1[^>]+value=(?:["\'])(?P<id>\d+)',
+ webpage, 'sample id', group='id')
+
+ title = self._og_search_title(webpage, fatal=False) or self._html_search_regex(
+ r'<h1>(.+?)</h1>', webpage, 'title')
+
+ mp3_url = self._search_regex(
+ r'<input[^>]+id=(["\'])sample_mp3\1[^>]+value=(["\'])(?P<url>(?:(?!\2).)+)',
+ webpage, 'mp3', fatal=False, group='url') or extract_attributes(self._search_regex(
+ r'<meta[^>]+itemprop=(["\'])contentUrl\1[^>]*>',
+ webpage, 'mp3 url', group=0))['content']
+
+ thumbnail = self._og_search_thumbnail(webpage) or self._html_search_regex(
+ r'<img[^>]+class=(?:["\'])waveform responsive-img[^>]+src=(["\'])(?P<url>(?:(?!\1).)+)',
+ webpage, 'mp3', fatal=False, group='url')
+
+ comments = []
+ for author_id, author, body in re.findall(r'(?s)<p[^>]+class="comment-author"><a[^>]+href="/users/([^"]+)">([^"]+)</a>.+?<p[^>]+class="comment-body">([^>]+)</p>', webpage):
+ comments.append({
+ 'author': author,
+ 'author_id': author_id,
+ 'text': body,
+ })
+
+ uploader_id = uploader = None
+ mobj = re.search(r'>By <a[^>]+href="/users/([^"]+)"[^>]*>([^<]+)', webpage)
+ if mobj:
+ uploader_id, uploader = mobj.groups()
+
+ breadcrumb = get_element_by_attribute('typeof', 'BreadcrumbList', webpage)
+ categories = []
+ if breadcrumb:
+ for _, name in re.findall(r'<span[^>]+property=(["\'])name\1[^>]*>([^<]+)', breadcrumb):
+ categories.append(name)
+
+ def extract_count(klass):
+ return int_or_none(self._html_search_regex(
+ r'<span[^>]+class=(?:["\'])?%s-count[^>]*>(\d+)' % klass,
+ webpage, klass, fatal=False))
+
+ return {
+ 'id': sample_id,
+ 'title': title,
+ 'url': mp3_url,
+ 'display_id': display_id,
+ 'thumbnail': thumbnail,
+ 'uploader': uploader,
+ 'license': self._html_search_regex(
+ r'<a[^>]+href=(["\'])/license\1[^>]*>(?P<license>[^<]+)<',
+ webpage, 'license', fatal=False, group='license'),
+ 'uploader_id': uploader_id,
+ 'like_count': extract_count('sample-%s-favorites' % sample_id),
+ 'comment_count': extract_count('comments'),
+ 'comments': comments,
+ 'categories': categories,
+ }
diff --git a/youtube_dl/extractor/sbs.py b/youtube_dl/extractor/sbs.py
index f722528cd..0a806ee4e 100644
--- a/youtube_dl/extractor/sbs.py
+++ b/youtube_dl/extractor/sbs.py
@@ -10,7 +10,7 @@ from ..utils import (
class SBSIE(InfoExtractor):
IE_DESC = 'sbs.com.au'
- _VALID_URL = r'https?://(?:www\.)?sbs\.com\.au/(?:ondemand(?:/video/(?:single/)?|.*?\bplay=)|news/(?:embeds/)?video/)(?P<id>[0-9]+)'
+ _VALID_URL = r'https?://(?:www\.)?sbs\.com\.au/(?:ondemand(?:/video/(?:single/)?|.*?\bplay=|/watch/)|news/(?:embeds/)?video/)(?P<id>[0-9]+)'
_TESTS = [{
# Original URL is handled by the generic IE which finds the iframe:
@@ -43,6 +43,9 @@ class SBSIE(InfoExtractor):
}, {
'url': 'https://www.sbs.com.au/news/embeds/video/1840778819866',
'only_matching': True,
+ }, {
+ 'url': 'https://www.sbs.com.au/ondemand/watch/1698704451971',
+ 'only_matching': True,
}]
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/screencastomatic.py b/youtube_dl/extractor/screencastomatic.py
index b5e76c9af..0afdc1715 100644
--- a/youtube_dl/extractor/screencastomatic.py
+++ b/youtube_dl/extractor/screencastomatic.py
@@ -2,12 +2,18 @@
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..utils import js_to_json
+from ..utils import (
+ get_element_by_class,
+ int_or_none,
+ remove_start,
+ strip_or_none,
+ unified_strdate,
+)
class ScreencastOMaticIE(InfoExtractor):
- _VALID_URL = r'https?://screencast-o-matic\.com/watch/(?P<id>[0-9a-zA-Z]+)'
- _TEST = {
+ _VALID_URL = r'https?://screencast-o-matic\.com/(?:(?:watch|player)/|embed\?.*?\bsc=)(?P<id>[0-9a-zA-Z]+)'
+ _TESTS = [{
'url': 'http://screencast-o-matic.com/watch/c2lD3BeOPl',
'md5': '483583cb80d92588f15ccbedd90f0c18',
'info_dict': {
@@ -16,22 +22,30 @@ class ScreencastOMaticIE(InfoExtractor):
'title': 'Welcome to 3-4 Philosophy @ DECV!',
'thumbnail': r're:^https?://.*\.jpg$',
'description': 'as the title says! also: some general info re 1) VCE philosophy and 2) distance learning.',
- 'duration': 369.163,
+ 'duration': 369,
+ 'upload_date': '20141216',
}
- }
+ }, {
+ 'url': 'http://screencast-o-matic.com/player/c2lD3BeOPl',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://screencast-o-matic.com/embed?ff=true&sc=cbV2r4Q5TL&fromPH=true&a=1',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
video_id = self._match_id(url)
- webpage = self._download_webpage(url, video_id)
-
- jwplayer_data = self._parse_json(
- self._search_regex(
- r"(?s)jwplayer\('mp4Player'\).setup\((\{.*?\})\);", webpage, 'setup code'),
- video_id, transform_source=js_to_json)
-
- info_dict = self._parse_jwplayer_data(jwplayer_data, video_id, require_title=False)
- info_dict.update({
- 'title': self._og_search_title(webpage),
- 'description': self._og_search_description(webpage),
+ webpage = self._download_webpage(
+ 'https://screencast-o-matic.com/player/' + video_id, video_id)
+ info = self._parse_html5_media_entries(url, webpage, video_id)[0]
+ info.update({
+ 'id': video_id,
+ 'title': get_element_by_class('overlayTitle', webpage),
+ 'description': strip_or_none(get_element_by_class('overlayDescription', webpage)) or None,
+ 'duration': int_or_none(self._search_regex(
+ r'player\.duration\s*=\s*function\(\)\s*{\s*return\s+(\d+);\s*};',
+ webpage, 'duration', default=None)),
+ 'upload_date': unified_strdate(remove_start(
+ get_element_by_class('overlayPublished', webpage), 'Published: ')),
})
- return info_dict
+ return info
diff --git a/youtube_dl/extractor/shahid.py b/youtube_dl/extractor/shahid.py
index 5c2a6206b..88b938e05 100644
--- a/youtube_dl/extractor/shahid.py
+++ b/youtube_dl/extractor/shahid.py
@@ -21,6 +21,7 @@ from ..utils import (
class ShahidBaseIE(AWSIE):
_AWS_PROXY_HOST = 'api2.shahid.net'
_AWS_API_KEY = '2RRtuMHx95aNI1Kvtn2rChEuwsCogUd4samGPjLh'
+ _VALID_URL_BASE = r'https?://shahid\.mbc\.net/[a-z]{2}/'
def _handle_error(self, e):
fail_data = self._parse_json(
@@ -49,15 +50,18 @@ class ShahidBaseIE(AWSIE):
class ShahidIE(ShahidBaseIE):
_NETRC_MACHINE = 'shahid'
- _VALID_URL = r'https?://shahid\.mbc\.net/ar/(?:serie|show|movie)s/[^/]+/(?P<type>episode|clip|movie)-(?P<id>\d+)'
+ _VALID_URL = ShahidBaseIE._VALID_URL_BASE + r'(?:serie|show|movie)s/[^/]+/(?P<type>episode|clip|movie)-(?P<id>\d+)'
_TESTS = [{
- 'url': 'https://shahid.mbc.net/ar/shows/%D9%85%D8%AC%D9%84%D8%B3-%D8%A7%D9%84%D8%B4%D8%A8%D8%A7%D8%A8-%D8%A7%D9%84%D9%85%D9%88%D8%B3%D9%85-1-%D9%83%D9%84%D9%8A%D8%A8-1/clip-275286',
+ 'url': 'https://shahid.mbc.net/ar/shows/%D9%85%D8%AA%D8%AD%D9%81-%D8%A7%D9%84%D8%AF%D8%AD%D9%8A%D8%AD-%D8%A7%D9%84%D9%85%D9%88%D8%B3%D9%85-1-%D9%83%D9%84%D9%8A%D8%A8-1/clip-816924',
'info_dict': {
- 'id': '275286',
+ 'id': '816924',
'ext': 'mp4',
- 'title': 'مجلس الشباب الموسم 1 كليب 1',
- 'timestamp': 1506988800,
- 'upload_date': '20171003',
+ 'title': 'متحف الدحيح الموسم 1 كليب 1',
+ 'timestamp': 1602806400,
+ 'upload_date': '20201016',
+ 'description': 'برومو',
+ 'duration': 22,
+ 'categories': ['كوميديا'],
},
'params': {
# m3u8 download
@@ -70,6 +74,9 @@ class ShahidIE(ShahidBaseIE):
# shahid plus subscriber only
'url': 'https://shahid.mbc.net/ar/series/%D9%85%D8%B1%D8%A7%D9%8A%D8%A7-2011-%D8%A7%D9%84%D9%85%D9%88%D8%B3%D9%85-1-%D8%A7%D9%84%D8%AD%D9%84%D9%82%D8%A9-1/episode-90511',
'only_matching': True
+ }, {
+ 'url': 'https://shahid.mbc.net/en/shows/Ramez-Fi-Al-Shallal-season-1-episode-1/episode-359319',
+ 'only_matching': True
}]
def _real_initialize(self):
@@ -109,12 +116,15 @@ class ShahidIE(ShahidBaseIE):
page_type = 'episode'
playout = self._call_api(
- 'playout/url/' + video_id, video_id)['playout']
+ 'playout/new/url/' + video_id, video_id)['playout']
if playout.get('drm'):
raise ExtractorError('This video is DRM protected.', expected=True)
- formats = self._extract_m3u8_formats(playout['url'], video_id, 'mp4')
+ formats = self._extract_m3u8_formats(re.sub(
+ # https://docs.aws.amazon.com/mediapackage/latest/ug/manifest-filtering.html
+ r'aws\.manifestfilter=[\w:;,-]+&?',
+ '', playout['url']), video_id, 'mp4')
self._sort_formats(formats)
# video = self._call_api(
@@ -162,7 +172,7 @@ class ShahidIE(ShahidBaseIE):
class ShahidShowIE(ShahidBaseIE):
- _VALID_URL = r'https?://shahid\.mbc\.net/ar/(?:show|serie)s/[^/]+/(?:show|series)-(?P<id>\d+)'
+ _VALID_URL = ShahidBaseIE._VALID_URL_BASE + r'(?:show|serie)s/[^/]+/(?:show|series)-(?P<id>\d+)'
_TESTS = [{
'url': 'https://shahid.mbc.net/ar/shows/%D8%B1%D8%A7%D9%85%D8%B2-%D9%82%D8%B1%D8%B4-%D8%A7%D9%84%D8%A8%D8%AD%D8%B1/show-79187',
'info_dict': {
diff --git a/youtube_dl/extractor/shared.py b/youtube_dl/extractor/shared.py
index 02295d1a4..93ab2a167 100644
--- a/youtube_dl/extractor/shared.py
+++ b/youtube_dl/extractor/shared.py
@@ -86,10 +86,10 @@ class SharedIE(SharedBaseIE):
class VivoIE(SharedBaseIE):
IE_DESC = 'vivo.sx'
- _VALID_URL = r'https?://vivo\.sx/(?P<id>[\da-z]{10})'
+ _VALID_URL = r'https?://vivo\.s[xt]/(?P<id>[\da-z]{10})'
_FILE_NOT_FOUND = '>The file you have requested does not exists or has been removed'
- _TEST = {
+ _TESTS = [{
'url': 'http://vivo.sx/d7ddda0e78',
'md5': '15b3af41be0b4fe01f4df075c2678b2c',
'info_dict': {
@@ -98,7 +98,10 @@ class VivoIE(SharedBaseIE):
'title': 'Chicken',
'filesize': 515659,
},
- }
+ }, {
+ 'url': 'http://vivo.st/d7ddda0e78',
+ 'only_matching': True,
+ }]
def _extract_title(self, webpage):
title = self._html_search_regex(
diff --git a/youtube_dl/extractor/simplecast.py b/youtube_dl/extractor/simplecast.py
new file mode 100644
index 000000000..2d0b3c06d
--- /dev/null
+++ b/youtube_dl/extractor/simplecast.py
@@ -0,0 +1,160 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ clean_podcast_url,
+ int_or_none,
+ parse_iso8601,
+ strip_or_none,
+ try_get,
+ urlencode_postdata,
+)
+
+
+class SimplecastBaseIE(InfoExtractor):
+ _UUID_REGEX = r'[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12}'
+ _API_BASE = 'https://api.simplecast.com/'
+
+ def _call_api(self, path_tmpl, video_id):
+ return self._download_json(
+ self._API_BASE + path_tmpl % video_id, video_id)
+
+ def _call_search_api(self, resource, resource_id, resource_url):
+ return self._download_json(
+ 'https://api.simplecast.com/%ss/search' % resource, resource_id,
+ data=urlencode_postdata({'url': resource_url}))
+
+ def _parse_episode(self, episode):
+ episode_id = episode['id']
+ title = episode['title'].strip()
+ audio_file = episode.get('audio_file') or {}
+ audio_file_url = audio_file.get('url') or episode.get('audio_file_url') or episode['enclosure_url']
+
+ season = episode.get('season') or {}
+ season_href = season.get('href')
+ season_id = None
+ if season_href:
+ season_id = self._search_regex(
+ r'https?://api.simplecast.com/seasons/(%s)' % self._UUID_REGEX,
+ season_href, 'season id', default=None)
+
+ webpage_url = episode.get('episode_url')
+ channel_url = None
+ if webpage_url:
+ channel_url = self._search_regex(
+ r'(https?://[^/]+\.simplecast\.com)',
+ webpage_url, 'channel url', default=None)
+
+ return {
+ 'id': episode_id,
+ 'display_id': episode.get('slug'),
+ 'title': title,
+ 'url': clean_podcast_url(audio_file_url),
+ 'webpage_url': webpage_url,
+ 'channel_url': channel_url,
+ 'series': try_get(episode, lambda x: x['podcast']['title']),
+ 'season_number': int_or_none(season.get('number')),
+ 'season_id': season_id,
+ 'thumbnail': episode.get('image_url'),
+ 'episode_id': episode_id,
+ 'episode_number': int_or_none(episode.get('number')),
+ 'description': strip_or_none(episode.get('description')),
+ 'timestamp': parse_iso8601(episode.get('published_at')),
+ 'duration': int_or_none(episode.get('duration')),
+ 'filesize': int_or_none(audio_file.get('size') or episode.get('audio_file_size')),
+ }
+
+
+class SimplecastIE(SimplecastBaseIE):
+ IE_NAME = 'simplecast'
+ _VALID_URL = r'https?://(?:api\.simplecast\.com/episodes|player\.simplecast\.com)/(?P<id>%s)' % SimplecastBaseIE._UUID_REGEX
+ _COMMON_TEST_INFO = {
+ 'display_id': 'errant-signal-chris-franklin-new-wave-video-essays',
+ 'id': 'b6dc49a2-9404-4853-9aa9-9cfc097be876',
+ 'ext': 'mp3',
+ 'title': 'Errant Signal - Chris Franklin & New Wave Video Essays',
+ 'episode_number': 1,
+ 'episode_id': 'b6dc49a2-9404-4853-9aa9-9cfc097be876',
+ 'description': 'md5:34752789d3d2702e2d2c975fbd14f357',
+ 'season_number': 1,
+ 'season_id': 'e23df0da-bae4-4531-8bbf-71364a88dc13',
+ 'series': 'The RE:BIND.io Podcast',
+ 'duration': 5343,
+ 'timestamp': 1580979475,
+ 'upload_date': '20200206',
+ 'webpage_url': r're:^https?://the-re-bind-io-podcast\.simplecast\.com/episodes/errant-signal-chris-franklin-new-wave-video-essays',
+ 'channel_url': r're:^https?://the-re-bind-io-podcast\.simplecast\.com$',
+ }
+ _TESTS = [{
+ 'url': 'https://api.simplecast.com/episodes/b6dc49a2-9404-4853-9aa9-9cfc097be876',
+ 'md5': '8c93be7be54251bf29ee97464eabe61c',
+ 'info_dict': _COMMON_TEST_INFO,
+ }, {
+ 'url': 'https://player.simplecast.com/b6dc49a2-9404-4853-9aa9-9cfc097be876',
+ 'only_matching': True,
+ }]
+
+ @staticmethod
+ def _extract_urls(webpage):
+ return re.findall(
+ r'''(?x)<iframe[^>]+src=["\']
+ (
+ https?://(?:embed\.simplecast\.com/[0-9a-f]{8}|
+ player\.simplecast\.com/%s
+ ))''' % SimplecastBaseIE._UUID_REGEX, webpage)
+
+ def _real_extract(self, url):
+ episode_id = self._match_id(url)
+ episode = self._call_api('episodes/%s', episode_id)
+ return self._parse_episode(episode)
+
+
+class SimplecastEpisodeIE(SimplecastBaseIE):
+ IE_NAME = 'simplecast:episode'
+ _VALID_URL = r'https?://(?!api\.)[^/]+\.simplecast\.com/episodes/(?P<id>[^/?&#]+)'
+ _TEST = {
+ 'url': 'https://the-re-bind-io-podcast.simplecast.com/episodes/errant-signal-chris-franklin-new-wave-video-essays',
+ 'md5': '8c93be7be54251bf29ee97464eabe61c',
+ 'info_dict': SimplecastIE._COMMON_TEST_INFO,
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ episode = self._call_search_api(
+ 'episode', mobj.group(1), mobj.group(0))
+ return self._parse_episode(episode)
+
+
+class SimplecastPodcastIE(SimplecastBaseIE):
+ IE_NAME = 'simplecast:podcast'
+ _VALID_URL = r'https?://(?!(?:api|cdn|embed|feeds|player)\.)(?P<id>[^/]+)\.simplecast\.com(?!/episodes/[^/?&#]+)'
+ _TESTS = [{
+ 'url': 'https://the-re-bind-io-podcast.simplecast.com',
+ 'playlist_mincount': 33,
+ 'info_dict': {
+ 'id': '07d28d26-7522-42eb-8c53-2bdcfc81c43c',
+ 'title': 'The RE:BIND.io Podcast',
+ },
+ }, {
+ 'url': 'https://the-re-bind-io-podcast.simplecast.com/episodes',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ subdomain = self._match_id(url)
+ site = self._call_search_api('site', subdomain, url)
+ podcast = site['podcast']
+ podcast_id = podcast['id']
+ podcast_title = podcast.get('title')
+
+ def entries():
+ episodes = self._call_api('podcasts/%s/episodes', podcast_id)
+ for episode in (episodes.get('collection') or []):
+ info = self._parse_episode(episode)
+ info['series'] = podcast_title
+ yield info
+
+ return self.playlist_result(entries(), podcast_id, podcast_title)
diff --git a/youtube_dl/extractor/southpark.py b/youtube_dl/extractor/southpark.py
index da75a43a7..0774da06e 100644
--- a/youtube_dl/extractor/southpark.py
+++ b/youtube_dl/extractor/southpark.py
@@ -6,9 +6,9 @@ from .mtv import MTVServicesInfoExtractor
class SouthParkIE(MTVServicesInfoExtractor):
IE_NAME = 'southpark.cc.com'
- _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.cc\.com/(?:clips|(?:full-)?episodes|collections)/(?P<id>.+?)(\?|#|$))'
+ _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark(?:\.cc|studios)\.com/(?:clips|(?:full-)?episodes|collections)/(?P<id>.+?)(\?|#|$))'
- _FEED_URL = 'http://www.southparkstudios.com/feeds/video-player/mrss'
+ _FEED_URL = 'http://feeds.mtvnservices.com/od/feed/intl-mrss-player-feed'
_TESTS = [{
'url': 'http://southpark.cc.com/clips/104437/bat-daded#tab=featured',
@@ -23,8 +23,20 @@ class SouthParkIE(MTVServicesInfoExtractor):
}, {
'url': 'http://southpark.cc.com/collections/7758/fan-favorites/1',
'only_matching': True,
+ }, {
+ 'url': 'https://www.southparkstudios.com/episodes/h4o269/south-park-stunning-and-brave-season-19-ep-1',
+ 'only_matching': True,
}]
+ def _get_feed_query(self, uri):
+ return {
+ 'accountOverride': 'intl.mtvi.com',
+ 'arcEp': 'shared.southpark.global',
+ 'ep': '90877963',
+ 'imageEp': 'shared.southpark.global',
+ 'mgid': uri,
+ }
+
class SouthParkEsIE(SouthParkIE):
IE_NAME = 'southpark.cc.com:español'
diff --git a/youtube_dl/extractor/spike.py b/youtube_dl/extractor/spike.py
index 4c5e3f7c2..5805f3d44 100644
--- a/youtube_dl/extractor/spike.py
+++ b/youtube_dl/extractor/spike.py
@@ -20,9 +20,6 @@ class BellatorIE(MTVServicesInfoExtractor):
_FEED_URL = 'http://www.bellator.com/feeds/mrss/'
_GEO_COUNTRIES = ['US']
- def _extract_mgid(self, webpage):
- return self._extract_triforce_mgid(webpage)
-
class ParamountNetworkIE(MTVServicesInfoExtractor):
_VALID_URL = r'https?://(?:www\.)?paramountnetwork\.com/[^/]+/[\da-z]{6}(?:[/?#&]|$)'
@@ -46,16 +43,6 @@ class ParamountNetworkIE(MTVServicesInfoExtractor):
def _get_feed_query(self, uri):
return {
'arcEp': 'paramountnetwork.com',
+ 'imageEp': 'paramountnetwork.com',
'mgid': uri,
}
-
- def _extract_mgid(self, webpage):
- root_data = self._parse_json(self._search_regex(
- r'window\.__DATA__\s*=\s*({.+})',
- webpage, 'data'), None)
-
- def find_sub_data(data, data_type):
- return next(c for c in data['children'] if c.get('type') == data_type)
-
- c = find_sub_data(find_sub_data(root_data, 'MainContainer'), 'VideoPlayer')
- return c['props']['media']['video']['config']['uri']
diff --git a/youtube_dl/extractor/sportdeutschland.py b/youtube_dl/extractor/sportdeutschland.py
index 378fc7568..3e497a939 100644
--- a/youtube_dl/extractor/sportdeutschland.py
+++ b/youtube_dl/extractor/sportdeutschland.py
@@ -1,82 +1,105 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
+from ..compat import (
+ compat_parse_qs,
+ compat_urllib_parse_urlparse,
+)
from ..utils import (
+ clean_html,
+ float_or_none,
+ int_or_none,
parse_iso8601,
- sanitized_Request,
+ strip_or_none,
+ try_get,
)
class SportDeutschlandIE(InfoExtractor):
- _VALID_URL = r'https?://sportdeutschland\.tv/(?P<sport>[^/?#]+)/(?P<id>[^?#/]+)(?:$|[?#])'
+ _VALID_URL = r'https?://sportdeutschland\.tv/(?P<id>(?:[^/]+/)?[^?#/&]+)'
_TESTS = [{
'url': 'https://sportdeutschland.tv/badminton/re-live-deutsche-meisterschaften-2020-halbfinals?playlistId=0',
'info_dict': {
- 'id': 're-live-deutsche-meisterschaften-2020-halbfinals',
+ 'id': '5318cac0275701382770543d7edaf0a0',
'ext': 'mp4',
- 'title': 're:Re-live: Deutsche Meisterschaften 2020.*Halbfinals',
- 'categories': ['Badminton-Deutschland'],
- 'view_count': int,
- 'thumbnail': r're:^https?://.*\.(?:jpg|png)$',
- 'timestamp': int,
- 'upload_date': '20200201',
- 'description': 're:.*', # meaningless description for THIS video
+ 'title': 'Re-live: Deutsche Meisterschaften 2020 - Halbfinals - Teil 1',
+ 'duration': 16106.36,
},
+ 'params': {
+ 'noplaylist': True,
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'https://sportdeutschland.tv/badminton/re-live-deutsche-meisterschaften-2020-halbfinals?playlistId=0',
+ 'info_dict': {
+ 'id': 'c6e2fdd01f63013854c47054d2ab776f',
+ 'title': 'Re-live: Deutsche Meisterschaften 2020 - Halbfinals',
+ 'description': 'md5:5263ff4c31c04bb780c9f91130b48530',
+ 'duration': 31397,
+ },
+ 'playlist_count': 2,
+ }, {
+ 'url': 'https://sportdeutschland.tv/freeride-world-tour-2021-fieberbrunn-oesterreich',
+ 'only_matching': True,
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
- sport_id = mobj.group('sport')
-
- api_url = 'https://proxy.vidibusdynamic.net/ssl/backend.sportdeutschland.tv/api/permalinks/%s/%s?access_token=true' % (
- sport_id, video_id)
- req = sanitized_Request(api_url, headers={
- 'Accept': 'application/vnd.vidibus.v2.html+json',
- 'Referer': url,
- })
- data = self._download_json(req, video_id)
-
+ display_id = self._match_id(url)
+ data = self._download_json(
+ 'https://backend.sportdeutschland.tv/api/permalinks/' + display_id,
+ display_id, query={'access_token': 'true'})
asset = data['asset']
- categories = [data['section']['title']]
-
- formats = []
- smil_url = asset['video']
- if '.smil' in smil_url:
- m3u8_url = smil_url.replace('.smil', '.m3u8')
- formats.extend(
- self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4'))
+ title = (asset.get('title') or asset['label']).strip()
+ asset_id = asset.get('id') or asset.get('uuid')
+ info = {
+ 'id': asset_id,
+ 'title': title,
+ 'description': clean_html(asset.get('body') or asset.get('description')) or asset.get('teaser'),
+ 'duration': int_or_none(asset.get('seconds')),
+ }
+ videos = asset.get('videos') or []
+ if len(videos) > 1:
+ playlist_id = compat_parse_qs(compat_urllib_parse_urlparse(url).query).get('playlistId', [None])[0]
+ if playlist_id:
+ if self._downloader.params.get('noplaylist'):
+ videos = [videos[int(playlist_id)]]
+ self.to_screen('Downloading just a single video because of --no-playlist')
+ else:
+ self.to_screen('Downloading playlist %s - add --no-playlist to just download video' % asset_id)
- smil_doc = self._download_xml(
- smil_url, video_id, note='Downloading SMIL metadata')
- base_url_el = smil_doc.find('./head/meta')
- if base_url_el:
- base_url = base_url_el.attrib['base']
- formats.extend([{
- 'format_id': 'rmtp',
- 'url': base_url if base_url_el else n.attrib['src'],
- 'play_path': n.attrib['src'],
- 'ext': 'flv',
- 'preference': -100,
- 'format_note': 'Seems to fail at example stream',
- } for n in smil_doc.findall('./body/video')])
+ def entries():
+ for i, video in enumerate(videos, 1):
+ video_id = video.get('uuid')
+ video_url = video.get('url')
+ if not (video_id and video_url):
+ continue
+ formats = self._extract_m3u8_formats(
+ video_url.replace('.smil', '.m3u8'), video_id, 'mp4', fatal=False)
+ if not formats:
+ continue
+ yield {
+ 'id': video_id,
+ 'formats': formats,
+ 'title': title + ' - ' + (video.get('label') or 'Teil %d' % i),
+ 'duration': float_or_none(video.get('duration')),
+ }
+ info.update({
+ '_type': 'multi_video',
+ 'entries': entries(),
+ })
else:
- formats.append({'url': smil_url})
-
- self._sort_formats(formats)
-
- return {
- 'id': video_id,
- 'formats': formats,
- 'title': asset['title'],
- 'thumbnail': asset.get('image'),
- 'description': asset.get('teaser'),
- 'duration': asset.get('duration'),
- 'categories': categories,
- 'view_count': asset.get('views'),
- 'rtmp_live': asset.get('live'),
- 'timestamp': parse_iso8601(asset.get('date')),
- }
+ formats = self._extract_m3u8_formats(
+ videos[0]['url'].replace('.smil', '.m3u8'), asset_id, 'mp4')
+ section_title = strip_or_none(try_get(data, lambda x: x['section']['title']))
+ info.update({
+ 'formats': formats,
+ 'display_id': asset.get('permalink'),
+ 'thumbnail': try_get(asset, lambda x: x['images'][0]),
+ 'categories': [section_title] if section_title else None,
+ 'view_count': int_or_none(asset.get('views')),
+ 'is_live': asset.get('is_live') is True,
+ 'timestamp': parse_iso8601(asset.get('date') or asset.get('published_at')),
+ })
+ return info
diff --git a/youtube_dl/extractor/spotify.py b/youtube_dl/extractor/spotify.py
new file mode 100644
index 000000000..826f98cff
--- /dev/null
+++ b/youtube_dl/extractor/spotify.py
@@ -0,0 +1,156 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ clean_podcast_url,
+ float_or_none,
+ int_or_none,
+ strip_or_none,
+ try_get,
+ unified_strdate,
+)
+
+
+class SpotifyBaseIE(InfoExtractor):
+ _ACCESS_TOKEN = None
+ _OPERATION_HASHES = {
+ 'Episode': '8276d4423d709ae9b68ec1b74cc047ba0f7479059a37820be730f125189ac2bf',
+ 'MinimalShow': '13ee079672fad3f858ea45a55eb109553b4fb0969ed793185b2e34cbb6ee7cc0',
+ 'ShowEpisodes': 'e0e5ce27bd7748d2c59b4d44ba245a8992a05be75d6fabc3b20753fc8857444d',
+ }
+ _VALID_URL_TEMPL = r'https?://open\.spotify\.com/%s/(?P<id>[^/?&#]+)'
+
+ def _real_initialize(self):
+ self._ACCESS_TOKEN = self._download_json(
+ 'https://open.spotify.com/get_access_token', None)['accessToken']
+
+ def _call_api(self, operation, video_id, variables):
+ return self._download_json(
+ 'https://api-partner.spotify.com/pathfinder/v1/query', video_id, query={
+ 'operationName': 'query' + operation,
+ 'variables': json.dumps(variables),
+ 'extensions': json.dumps({
+ 'persistedQuery': {
+ 'sha256Hash': self._OPERATION_HASHES[operation],
+ },
+ })
+ }, headers={'authorization': 'Bearer ' + self._ACCESS_TOKEN})['data']
+
+ def _extract_episode(self, episode, series):
+ episode_id = episode['id']
+ title = episode['name'].strip()
+
+ formats = []
+ audio_preview = episode.get('audioPreview') or {}
+ audio_preview_url = audio_preview.get('url')
+ if audio_preview_url:
+ f = {
+ 'url': audio_preview_url.replace('://p.scdn.co/mp3-preview/', '://anon-podcast.scdn.co/'),
+ 'vcodec': 'none',
+ }
+ audio_preview_format = audio_preview.get('format')
+ if audio_preview_format:
+ f['format_id'] = audio_preview_format
+ mobj = re.match(r'([0-9A-Z]{3})_(?:[A-Z]+_)?(\d+)', audio_preview_format)
+ if mobj:
+ f.update({
+ 'abr': int(mobj.group(2)),
+ 'ext': mobj.group(1).lower(),
+ })
+ formats.append(f)
+
+ for item in (try_get(episode, lambda x: x['audio']['items']) or []):
+ item_url = item.get('url')
+ if not (item_url and item.get('externallyHosted')):
+ continue
+ formats.append({
+ 'url': clean_podcast_url(item_url),
+ 'vcodec': 'none',
+ })
+
+ thumbnails = []
+ for source in (try_get(episode, lambda x: x['coverArt']['sources']) or []):
+ source_url = source.get('url')
+ if not source_url:
+ continue
+ thumbnails.append({
+ 'url': source_url,
+ 'width': int_or_none(source.get('width')),
+ 'height': int_or_none(source.get('height')),
+ })
+
+ return {
+ 'id': episode_id,
+ 'title': title,
+ 'formats': formats,
+ 'thumbnails': thumbnails,
+ 'description': strip_or_none(episode.get('description')),
+ 'duration': float_or_none(try_get(
+ episode, lambda x: x['duration']['totalMilliseconds']), 1000),
+ 'release_date': unified_strdate(try_get(
+ episode, lambda x: x['releaseDate']['isoString'])),
+ 'series': series,
+ }
+
+
+class SpotifyIE(SpotifyBaseIE):
+ IE_NAME = 'spotify'
+ _VALID_URL = SpotifyBaseIE._VALID_URL_TEMPL % 'episode'
+ _TEST = {
+ 'url': 'https://open.spotify.com/episode/4Z7GAJ50bgctf6uclHlWKo',
+ 'md5': '74010a1e3fa4d9e1ab3aa7ad14e42d3b',
+ 'info_dict': {
+ 'id': '4Z7GAJ50bgctf6uclHlWKo',
+ 'ext': 'mp3',
+ 'title': 'From the archive: Why time management is ruining our lives',
+ 'description': 'md5:b120d9c4ff4135b42aa9b6d9cde86935',
+ 'duration': 2083.605,
+ 'release_date': '20201217',
+ 'series': "The Guardian's Audio Long Reads",
+ }
+ }
+
+ def _real_extract(self, url):
+ episode_id = self._match_id(url)
+ episode = self._call_api('Episode', episode_id, {
+ 'uri': 'spotify:episode:' + episode_id
+ })['episode']
+ return self._extract_episode(
+ episode, try_get(episode, lambda x: x['podcast']['name']))
+
+
+class SpotifyShowIE(SpotifyBaseIE):
+ IE_NAME = 'spotify:show'
+ _VALID_URL = SpotifyBaseIE._VALID_URL_TEMPL % 'show'
+ _TEST = {
+ 'url': 'https://open.spotify.com/show/4PM9Ke6l66IRNpottHKV9M',
+ 'info_dict': {
+ 'id': '4PM9Ke6l66IRNpottHKV9M',
+ 'title': 'The Story from the Guardian',
+ 'description': 'The Story podcast is dedicated to our finest audio documentaries, investigations and long form stories',
+ },
+ 'playlist_mincount': 36,
+ }
+
+ def _real_extract(self, url):
+ show_id = self._match_id(url)
+ podcast = self._call_api('ShowEpisodes', show_id, {
+ 'limit': 1000000000,
+ 'offset': 0,
+ 'uri': 'spotify:show:' + show_id,
+ })['podcast']
+ podcast_name = podcast.get('name')
+
+ entries = []
+ for item in (try_get(podcast, lambda x: x['episodes']['items']) or []):
+ episode = item.get('episode')
+ if not episode:
+ continue
+ entries.append(self._extract_episode(episode, podcast_name))
+
+ return self.playlist_result(
+ entries, show_id, podcast_name, podcast.get('description'))
diff --git a/youtube_dl/extractor/srgssr.py b/youtube_dl/extractor/srgssr.py
index f63a1359a..ac018e740 100644
--- a/youtube_dl/extractor/srgssr.py
+++ b/youtube_dl/extractor/srgssr.py
@@ -4,16 +4,32 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..compat import compat_urllib_parse_urlparse
from ..utils import (
ExtractorError,
+ float_or_none,
+ int_or_none,
parse_iso8601,
qualities,
+ try_get,
)
class SRGSSRIE(InfoExtractor):
- _VALID_URL = r'(?:https?://tp\.srgssr\.ch/p(?:/[^/]+)+\?urn=urn|srgssr):(?P<bu>srf|rts|rsi|rtr|swi):(?:[^:]+:)?(?P<type>video|audio):(?P<id>[0-9a-f\-]{36}|\d+)'
+ _VALID_URL = r'''(?x)
+ (?:
+ https?://tp\.srgssr\.ch/p(?:/[^/]+)+\?urn=urn|
+ srgssr
+ ):
+ (?P<bu>
+ srf|rts|rsi|rtr|swi
+ ):(?:[^:]+:)?
+ (?P<type>
+ video|audio
+ ):
+ (?P<id>
+ [0-9a-f\-]{36}|\d+
+ )
+ '''
_GEO_BYPASS = False
_GEO_COUNTRIES = ['CH']
@@ -25,25 +41,39 @@ class SRGSSRIE(InfoExtractor):
'LEGAL': 'The video cannot be transmitted for legal reasons.',
'STARTDATE': 'This video is not yet available. Please try again later.',
}
+ _DEFAULT_LANGUAGE_CODES = {
+ 'srf': 'de',
+ 'rts': 'fr',
+ 'rsi': 'it',
+ 'rtr': 'rm',
+ 'swi': 'en',
+ }
def _get_tokenized_src(self, url, video_id, format_id):
- sp = compat_urllib_parse_urlparse(url).path.split('/')
token = self._download_json(
- 'http://tp.srgssr.ch/akahd/token?acl=/%s/%s/*' % (sp[1], sp[2]),
+ 'http://tp.srgssr.ch/akahd/token?acl=*',
video_id, 'Downloading %s token' % format_id, fatal=False) or {}
- auth_params = token.get('token', {}).get('authparams')
+ auth_params = try_get(token, lambda x: x['token']['authparams'])
if auth_params:
- url += '?' + auth_params
+ url += ('?' if '?' not in url else '&') + auth_params
return url
- def get_media_data(self, bu, media_type, media_id):
- media_data = self._download_json(
- 'http://il.srgssr.ch/integrationlayer/1.0/ue/%s/%s/play/%s.json' % (bu, media_type, media_id),
- media_id)[media_type.capitalize()]
-
- if media_data.get('block') and media_data['block'] in self._ERRORS:
- message = self._ERRORS[media_data['block']]
- if media_data['block'] == 'GEOBLOCK':
+ def _get_media_data(self, bu, media_type, media_id):
+ query = {'onlyChapters': True} if media_type == 'video' else {}
+ full_media_data = self._download_json(
+ 'https://il.srgssr.ch/integrationlayer/2.0/%s/mediaComposition/%s/%s.json'
+ % (bu, media_type, media_id),
+ media_id, query=query)['chapterList']
+ try:
+ media_data = next(
+ x for x in full_media_data if x.get('id') == media_id)
+ except StopIteration:
+ raise ExtractorError('No media information found')
+
+ block_reason = media_data.get('blockReason')
+ if block_reason and block_reason in self._ERRORS:
+ message = self._ERRORS[block_reason]
+ if block_reason == 'GEOBLOCK':
self.raise_geo_restricted(
msg=message, countries=self._GEO_COUNTRIES)
raise ExtractorError(
@@ -53,53 +83,75 @@ class SRGSSRIE(InfoExtractor):
def _real_extract(self, url):
bu, media_type, media_id = re.match(self._VALID_URL, url).groups()
+ media_data = self._get_media_data(bu, media_type, media_id)
+ title = media_data['title']
- media_data = self.get_media_data(bu, media_type, media_id)
-
- metadata = media_data['AssetMetadatas']['AssetMetadata'][0]
- title = metadata['title']
- description = metadata.get('description')
- created_date = media_data.get('createdDate') or metadata.get('createdDate')
- timestamp = parse_iso8601(created_date)
-
- thumbnails = [{
- 'id': image.get('id'),
- 'url': image['url'],
- } for image in media_data.get('Image', {}).get('ImageRepresentations', {}).get('ImageRepresentation', [])]
-
- preference = qualities(['LQ', 'MQ', 'SD', 'HQ', 'HD'])
formats = []
- for source in media_data.get('Playlists', {}).get('Playlist', []) + media_data.get('Downloads', {}).get('Download', []):
- protocol = source.get('@protocol')
- for asset in source['url']:
- asset_url = asset['text']
- quality = asset['@quality']
- format_id = '%s-%s' % (protocol, quality)
- if protocol.startswith('HTTP-HDS') or protocol.startswith('HTTP-HLS'):
- asset_url = self._get_tokenized_src(asset_url, media_id, format_id)
- if protocol.startswith('HTTP-HDS'):
- formats.extend(self._extract_f4m_formats(
- asset_url + ('?' if '?' not in asset_url else '&') + 'hdcore=3.4.0',
- media_id, f4m_id=format_id, fatal=False))
- elif protocol.startswith('HTTP-HLS'):
- formats.extend(self._extract_m3u8_formats(
- asset_url, media_id, 'mp4', 'm3u8_native',
- m3u8_id=format_id, fatal=False))
- else:
- formats.append({
- 'format_id': format_id,
- 'url': asset_url,
- 'preference': preference(quality),
- 'ext': 'flv' if protocol == 'RTMP' else None,
- })
+ q = qualities(['SD', 'HD'])
+ for source in (media_data.get('resourceList') or []):
+ format_url = source.get('url')
+ if not format_url:
+ continue
+ protocol = source.get('protocol')
+ quality = source.get('quality')
+ format_id = []
+ for e in (protocol, source.get('encoding'), quality):
+ if e:
+ format_id.append(e)
+ format_id = '-'.join(format_id)
+
+ if protocol in ('HDS', 'HLS'):
+ if source.get('tokenType') == 'AKAMAI':
+ format_url = self._get_tokenized_src(
+ format_url, media_id, format_id)
+ formats.extend(self._extract_akamai_formats(
+ format_url, media_id))
+ elif protocol == 'HLS':
+ formats.extend(self._extract_m3u8_formats(
+ format_url, media_id, 'mp4', 'm3u8_native',
+ m3u8_id=format_id, fatal=False))
+ elif protocol in ('HTTP', 'HTTPS'):
+ formats.append({
+ 'format_id': format_id,
+ 'url': format_url,
+ 'quality': q(quality),
+ })
+
+ # This is needed because for audio medias the podcast url is usually
+ # always included, even if is only an audio segment and not the
+ # whole episode.
+ if int_or_none(media_data.get('position')) == 0:
+ for p in ('S', 'H'):
+ podcast_url = media_data.get('podcast%sdUrl' % p)
+ if not podcast_url:
+ continue
+ quality = p + 'D'
+ formats.append({
+ 'format_id': 'PODCAST-' + quality,
+ 'url': podcast_url,
+ 'quality': q(quality),
+ })
self._sort_formats(formats)
+ subtitles = {}
+ if media_type == 'video':
+ for sub in (media_data.get('subtitleList') or []):
+ sub_url = sub.get('url')
+ if not sub_url:
+ continue
+ lang = sub.get('locale') or self._DEFAULT_LANGUAGE_CODES[bu]
+ subtitles.setdefault(lang, []).append({
+ 'url': sub_url,
+ })
+
return {
'id': media_id,
'title': title,
- 'description': description,
- 'timestamp': timestamp,
- 'thumbnails': thumbnails,
+ 'description': media_data.get('description'),
+ 'timestamp': parse_iso8601(media_data.get('date')),
+ 'thumbnail': media_data.get('imageUrl'),
+ 'duration': float_or_none(media_data.get('duration'), 1000),
+ 'subtitles': subtitles,
'formats': formats,
}
@@ -119,26 +171,17 @@ class SRGSSRPlayIE(InfoExtractor):
_TESTS = [{
'url': 'http://www.srf.ch/play/tv/10vor10/video/snowden-beantragt-asyl-in-russland?id=28e1a57d-5b76-4399-8ab3-9097f071e6c5',
- 'md5': 'da6b5b3ac9fa4761a942331cef20fcb3',
+ 'md5': '6db2226ba97f62ad42ce09783680046c',
'info_dict': {
'id': '28e1a57d-5b76-4399-8ab3-9097f071e6c5',
'ext': 'mp4',
'upload_date': '20130701',
'title': 'Snowden beantragt Asyl in Russland',
- 'timestamp': 1372713995,
- }
- }, {
- # No Speichern (Save) button
- 'url': 'http://www.srf.ch/play/tv/top-gear/video/jaguar-xk120-shadow-und-tornado-dampflokomotive?id=677f5829-e473-4823-ac83-a1087fe97faa',
- 'md5': '0a274ce38fda48c53c01890651985bc6',
- 'info_dict': {
- 'id': '677f5829-e473-4823-ac83-a1087fe97faa',
- 'ext': 'flv',
- 'upload_date': '20130710',
- 'title': 'Jaguar XK120, Shadow und Tornado-Dampflokomotive',
- 'description': 'md5:88604432b60d5a38787f152dec89cd56',
- 'timestamp': 1373493600,
+ 'timestamp': 1372708215,
+ 'duration': 113.827,
+ 'thumbnail': r're:^https?://.*1383719781\.png$',
},
+ 'expected_warnings': ['Unable to download f4m manifest'],
}, {
'url': 'http://www.rtr.ch/play/radio/actualitad/audio/saira-tujetsch-tuttina-cuntinuar-cun-sedrun-muster-turissem?id=63cb0778-27f8-49af-9284-8c7a8c6d15fc',
'info_dict': {
@@ -146,7 +189,8 @@ class SRGSSRPlayIE(InfoExtractor):
'ext': 'mp3',
'upload_date': '20151013',
'title': 'Saira: Tujetsch - tuttina cuntinuar cun Sedrun Mustér Turissem',
- 'timestamp': 1444750398,
+ 'timestamp': 1444709160,
+ 'duration': 336.816,
},
'params': {
# rtmp download
@@ -159,20 +203,33 @@ class SRGSSRPlayIE(InfoExtractor):
'id': '6348260',
'display_id': '6348260',
'ext': 'mp4',
- 'duration': 1796,
+ 'duration': 1796.76,
'title': 'Le 19h30',
- 'description': '',
- 'uploader': '19h30',
'upload_date': '20141201',
'timestamp': 1417458600,
'thumbnail': r're:^https?://.*\.image',
- 'view_count': int,
},
'params': {
# m3u8 download
'skip_download': True,
}
}, {
+ 'url': 'http://play.swissinfo.ch/play/tv/business/video/why-people-were-against-tax-reforms?id=42960270',
+ 'info_dict': {
+ 'id': '42960270',
+ 'ext': 'mp4',
+ 'title': 'Why people were against tax reforms',
+ 'description': 'md5:7ac442c558e9630e947427469c4b824d',
+ 'duration': 94.0,
+ 'upload_date': '20170215',
+ 'timestamp': 1487173560,
+ 'thumbnail': r're:https?://www\.swissinfo\.ch/srgscalableimage/42961964',
+ 'subtitles': 'count:9',
+ },
+ 'params': {
+ 'skip_download': True,
+ }
+ }, {
'url': 'https://www.srf.ch/play/tv/popupvideoplayer?id=c4dba0ca-e75b-43b2-a34f-f708a4932e01',
'only_matching': True,
}, {
@@ -181,6 +238,10 @@ class SRGSSRPlayIE(InfoExtractor):
}, {
'url': 'https://www.rts.ch/play/tv/19h30/video/le-19h30?urn=urn:rts:video:6348260',
'only_matching': True,
+ }, {
+ # audio segment, has podcastSdUrl of the full episode
+ 'url': 'https://www.srf.ch/play/radio/popupaudioplayer?id=50b20dc8-f05b-4972-bf03-e438ff2833eb',
+ 'only_matching': True,
}]
def _real_extract(self, url):
@@ -188,5 +249,4 @@ class SRGSSRPlayIE(InfoExtractor):
bu = mobj.group('bu')
media_type = mobj.group('type') or mobj.group('type_2')
media_id = mobj.group('id')
- # other info can be extracted from url + '&layout=json'
return self.url_result('srgssr:%s:%s:%s' % (bu[:3], media_type, media_id), 'SRGSSR')
diff --git a/youtube_dl/extractor/storyfire.py b/youtube_dl/extractor/storyfire.py
new file mode 100644
index 000000000..9c698626f
--- /dev/null
+++ b/youtube_dl/extractor/storyfire.py
@@ -0,0 +1,151 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import functools
+
+from .common import InfoExtractor
+from ..utils import (
+ # HEADRequest,
+ int_or_none,
+ OnDemandPagedList,
+ smuggle_url,
+)
+
+
class StoryFireBaseIE(InfoExtractor):
    """Shared helpers for the StoryFire (storyfire.com) extractors."""

    _VALID_URL_BASE = r'https?://(?:www\.)?storyfire\.com/'

    def _call_api(self, path, video_id, resource, query=None):
        # Every StoryFire endpoint lives under /app/<path>/<id>.
        endpoint = 'https://storyfire.com/app/%s/%s' % (path, video_id)
        note = 'Downloading %s JSON metadata' % resource
        return self._download_json(endpoint, video_id, note, query=query)

    def _parse_video(self, video):
        """Map a StoryFire video JSON object onto a url_transparent
        result that delegates actual playback to the embedded Vimeo
        player referenced by ``vimeoVideoURL``."""
        title = video['title']
        # The media itself is hosted on Vimeo; pull the numeric id out
        # of the external player URL.
        vimeo_id = self._search_regex(
            r'https?://player\.vimeo\.com/external/(\d+)',
            video['vimeoVideoURL'], 'vimeo id')

        # Vimeo validates the Referer, so smuggle it along with the URL.
        vimeo_url = smuggle_url(
            'https://player.vimeo.com/video/' + vimeo_id,
            {'http_headers': {'Referer': 'https://storyfire.com/'}})

        uploader_id = video.get('hostID')
        uploader_url = (
            'https://storyfire.com/user/%s/video' % uploader_id
            if uploader_id else None)

        return {
            '_type': 'url_transparent',
            'id': vimeo_id,
            'title': title,
            'description': video.get('description'),
            'url': vimeo_url,
            'thumbnail': video.get('storyImage'),
            'view_count': int_or_none(video.get('views')),
            'like_count': int_or_none(video.get('likesCount')),
            'comment_count': int_or_none(video.get('commentsCount')),
            'duration': int_or_none(video.get('videoDuration')),
            'timestamp': int_or_none(video.get('publishDate')),
            'uploader': video.get('username'),
            'uploader_id': uploader_id,
            'uploader_url': uploader_url,
            'episode_number': int_or_none(
                video.get('episodeNumber') or video.get('episode_number')),
        }
+
+
class StoryFireIE(StoryFireBaseIE):
    """Extract a single StoryFire video from its video-details page."""
    _VALID_URL = StoryFireBaseIE._VALID_URL_BASE + r'video-details/(?P<id>[0-9a-f]{24})'
    _TEST = {
        'url': 'https://storyfire.com/video-details/5df1d132b6378700117f9181',
        'md5': 'caec54b9e4621186d6079c7ec100c1eb',
        'info_dict': {
            'id': '378954662',
            'ext': 'mp4',
            'title': 'Buzzfeed Teaches You About Memes',
            'uploader_id': 'ntZAJFECERSgqHSxzonV5K2E89s1',
            'timestamp': 1576129028,
            'description': 'md5:0b4e28021548e144bed69bb7539e62ea',
            'uploader': 'whang!',
            'upload_date': '20191212',
            'duration': 418,
            'view_count': int,
            'like_count': int,
            'comment_count': int,
        },
        'params': {
            'skip_download': True,
        },
        'expected_warnings': ['Unable to download JSON metadata']
    }

    def _real_extract(self, url):
        video_id = self._match_id(url)
        detail = self._call_api('generic/video-detail', video_id, 'video')
        return self._parse_video(detail['video'])
+
+
class StoryFireUserIE(StoryFireBaseIE):
    """Playlist of a StoryFire user's public videos, fetched lazily."""
    _VALID_URL = StoryFireBaseIE._VALID_URL_BASE + r'user/(?P<id>[^/]+)/video'
    _TEST = {
        'url': 'https://storyfire.com/user/UQ986nFxmAWIgnkZQ0ftVhq4nOk2/video',
        'info_dict': {
            'id': 'UQ986nFxmAWIgnkZQ0ftVhq4nOk2',
        },
        'playlist_mincount': 151,
    }
    _PAGE_SIZE = 20

    def _fetch_page(self, user_id, page):
        # The API pages via a `skip` offset rather than a page number.
        page_data = self._call_api(
            'publicVideos', user_id, 'page %d' % (page + 1),
            {'skip': page * self._PAGE_SIZE})
        for video in page_data['videos']:
            yield self._parse_video(video)

    def _real_extract(self, url):
        user_id = self._match_id(url)
        fetch = functools.partial(self._fetch_page, user_id)
        return self.playlist_result(
            OnDemandPagedList(fetch, self._PAGE_SIZE), user_id)
+
+
class StoryFireSeriesIE(StoryFireBaseIE):
    """Playlist of the videos attached to a StoryFire written series."""
    _VALID_URL = StoryFireBaseIE._VALID_URL_BASE + r'write/series/stories/(?P<id>[^/?&#]+)'
    _TESTS = [{
        'url': 'https://storyfire.com/write/series/stories/-Lq6MsuIHLODO6d2dDkr/',
        'info_dict': {
            'id': '-Lq6MsuIHLODO6d2dDkr',
        },
        'playlist_mincount': 13,
    }, {
        'url': 'https://storyfire.com/write/series/stories/the_mortal_one/',
        'info_dict': {
            'id': 'the_mortal_one',
        },
        # a series may contain no videos at all
        'playlist_count': 0,
    }]

    def _extract_videos(self, stories):
        # Only stories flagged with hasVideo become playlist entries.
        return (
            self._parse_video(story)
            for story in stories.values()
            if story.get('hasVideo'))

    def _real_extract(self, url):
        series_id = self._match_id(url)
        stories = self._call_api(
            'seriesStories', series_id, 'series stories')
        return self.playlist_result(self._extract_videos(stories), series_id)
diff --git a/youtube_dl/extractor/stretchinternet.py b/youtube_dl/extractor/stretchinternet.py
index 4dbead2ba..ec08eae55 100644
--- a/youtube_dl/extractor/stretchinternet.py
+++ b/youtube_dl/extractor/stretchinternet.py
@@ -1,7 +1,6 @@
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..utils import int_or_none
class StretchInternetIE(InfoExtractor):
@@ -11,22 +10,28 @@ class StretchInternetIE(InfoExtractor):
'info_dict': {
'id': '573272',
'ext': 'mp4',
- 'title': 'University of Mary Wrestling vs. Upper Iowa',
- 'timestamp': 1575668361,
- 'upload_date': '20191206',
+ 'title': 'UNIVERSITY OF MARY WRESTLING VS UPPER IOWA',
+ # 'timestamp': 1575668361,
+ # 'upload_date': '20191206',
+ 'uploader_id': '99997',
}
}
def _real_extract(self, url):
video_id = self._match_id(url)
+ media_url = self._download_json(
+ 'https://core.stretchlive.com/trinity/event/tcg/' + video_id,
+ video_id)[0]['media'][0]['url']
event = self._download_json(
- 'https://api.stretchinternet.com/trinity/event/tcg/' + video_id,
- video_id)[0]
+ 'https://neo-client.stretchinternet.com/portal-ws/getEvent.json',
+ video_id, query={'eventID': video_id, 'token': 'asdf'})['event']
return {
'id': video_id,
'title': event['title'],
- 'timestamp': int_or_none(event.get('dateCreated'), 1000),
- 'url': 'https://' + event['media'][0]['url'],
+ # TODO: parse US timezone abbreviations
+ # 'timestamp': event.get('dateTimeString'),
+ 'url': 'https://' + media_url,
+ 'uploader_id': event.get('ownerID'),
}
diff --git a/youtube_dl/extractor/svt.py b/youtube_dl/extractor/svt.py
index a0b6ef4db..a5bb6daa7 100644
--- a/youtube_dl/extractor/svt.py
+++ b/youtube_dl/extractor/svt.py
@@ -146,18 +146,19 @@ class SVTPlayIE(SVTPlayBaseIE):
)
(?P<svt_id>[^/?#&]+)|
https?://(?:www\.)?(?:svtplay|oppetarkiv)\.se/(?:video|klipp|kanaler)/(?P<id>[^/?#&]+)
+ (?:.*?(?:modalId|id)=(?P<modal_id>[\da-zA-Z-]+))?
)
'''
_TESTS = [{
- 'url': 'https://www.svtplay.se/video/26194546/det-har-ar-himlen',
+ 'url': 'https://www.svtplay.se/video/30479064',
'md5': '2382036fd6f8c994856c323fe51c426e',
'info_dict': {
- 'id': 'jNwpV9P',
+ 'id': '8zVbDPA',
'ext': 'mp4',
- 'title': 'Det här är himlen',
- 'timestamp': 1586044800,
- 'upload_date': '20200405',
- 'duration': 3515,
+ 'title': 'Designdrömmar i Stenungsund',
+ 'timestamp': 1615770000,
+ 'upload_date': '20210315',
+ 'duration': 3519,
'thumbnail': r're:^https?://(?:.*[\.-]jpg|www.svtstatic.se/image/.*)$',
'age_limit': 0,
'subtitles': {
@@ -174,6 +175,12 @@ class SVTPlayIE(SVTPlayBaseIE):
'skip_download': True,
},
}, {
+ 'url': 'https://www.svtplay.se/video/30479064/husdrommar/husdrommar-sasong-8-designdrommar-i-stenungsund?modalId=8zVbDPA',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.svtplay.se/video/30684086/rapport/rapport-24-apr-18-00-7?id=e72gVpa',
+ 'only_matching': True,
+ }, {
# geo restricted to Sweden
'url': 'http://www.oppetarkiv.se/video/5219710/trollflojten',
'only_matching': True,
@@ -219,7 +226,8 @@ class SVTPlayIE(SVTPlayBaseIE):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- video_id, svt_id = mobj.group('id', 'svt_id')
+ video_id = mobj.group('id')
+ svt_id = mobj.group('svt_id') or mobj.group('modal_id')
if svt_id:
return self._extract_by_video_id(svt_id)
@@ -254,9 +262,12 @@ class SVTPlayIE(SVTPlayBaseIE):
if not svt_id:
svt_id = self._search_regex(
(r'<video[^>]+data-video-id=["\']([\da-zA-Z-]+)',
+ r'<[^>]+\bdata-rt=["\']top-area-play-button["\'][^>]+\bhref=["\'][^"\']*video/%s/[^"\']*\b(?:modalId|id)=([\da-zA-Z-]+)' % re.escape(video_id),
r'["\']videoSvtId["\']\s*:\s*["\']([\da-zA-Z-]+)',
+ r'["\']videoSvtId\\?["\']\s*:\s*\\?["\']([\da-zA-Z-]+)',
r'"content"\s*:\s*{.*?"id"\s*:\s*"([\da-zA-Z-]+)"',
- r'["\']svtId["\']\s*:\s*["\']([\da-zA-Z-]+)'),
+ r'["\']svtId["\']\s*:\s*["\']([\da-zA-Z-]+)',
+ r'["\']svtId\\?["\']\s*:\s*\\?["\']([\da-zA-Z-]+)'),
webpage, 'video id')
info_dict = self._extract_by_video_id(svt_id, webpage)
diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py
index 63e2455b2..f09f1a3f9 100644
--- a/youtube_dl/extractor/ted.py
+++ b/youtube_dl/extractor/ted.py
@@ -123,6 +123,10 @@ class TEDIE(InfoExtractor):
'params': {
'skip_download': True,
},
+ }, {
+ # with own formats and private Youtube external
+ 'url': 'https://www.ted.com/talks/spencer_wells_a_family_tree_for_humanity',
+ 'only_matching': True,
}]
_NATIVE_FORMATS = {
@@ -210,16 +214,6 @@ class TEDIE(InfoExtractor):
player_talk = talk_info['player_talks'][0]
- external = player_talk.get('external')
- if isinstance(external, dict):
- service = external.get('service')
- if isinstance(service, compat_str):
- ext_url = None
- if service.lower() == 'youtube':
- ext_url = external.get('code')
-
- return self.url_result(ext_url or external['uri'])
-
resources_ = player_talk.get('resources') or talk_info.get('resources')
http_url = None
@@ -294,6 +288,16 @@ class TEDIE(InfoExtractor):
'vcodec': 'none',
})
+ if not formats:
+ external = player_talk.get('external')
+ if isinstance(external, dict):
+ service = external.get('service')
+ if isinstance(service, compat_str):
+ ext_url = None
+ if service.lower() == 'youtube':
+ ext_url = external.get('code')
+ return self.url_result(ext_url or external['uri'])
+
self._sort_formats(formats)
video_id = compat_str(talk_info['id'])
diff --git a/youtube_dl/extractor/tf1.py b/youtube_dl/extractor/tf1.py
index 55e2a0721..23c2808a1 100644
--- a/youtube_dl/extractor/tf1.py
+++ b/youtube_dl/extractor/tf1.py
@@ -1,92 +1,87 @@
# coding: utf-8
from __future__ import unicode_literals
+import json
+import re
+
from .common import InfoExtractor
-from ..compat import compat_str
+from ..utils import (
+ int_or_none,
+ parse_iso8601,
+ try_get,
+)
class TF1IE(InfoExtractor):
- """TF1 uses the wat.tv player."""
- _VALID_URL = r'https?://(?:(?:videos|www|lci)\.tf1|(?:www\.)?(?:tfou|ushuaiatv|histoire|tvbreizh))\.fr/(?:[^/]+/)*(?P<id>[^/?#.]+)'
+ _VALID_URL = r'https?://(?:www\.)?tf1\.fr/[^/]+/(?P<program_slug>[^/]+)/videos/(?P<id>[^/?&#]+)\.html'
_TESTS = [{
- 'url': 'http://videos.tf1.fr/auto-moto/citroen-grand-c4-picasso-2013-presentation-officielle-8062060.html',
- 'info_dict': {
- 'id': '10635995',
- 'ext': 'mp4',
- 'title': 'Citroën Grand C4 Picasso 2013 : présentation officielle',
- 'description': 'Vidéo officielle du nouveau Citroën Grand C4 Picasso, lancé à l\'automne 2013.',
- },
- 'params': {
- # Sometimes wat serves the whole file with the --test option
- 'skip_download': True,
- },
- 'expected_warnings': ['HTTP Error 404'],
- }, {
- 'url': 'http://www.tfou.fr/chuggington/videos/le-grand-mysterioso-chuggington-7085291-739.html',
+ 'url': 'https://www.tf1.fr/tmc/quotidien-avec-yann-barthes/videos/quotidien-premiere-partie-11-juin-2019.html',
'info_dict': {
- 'id': 'le-grand-mysterioso-chuggington-7085291-739',
+ 'id': '13641379',
'ext': 'mp4',
- 'title': 'Le grand Mystérioso - Chuggington',
- 'description': 'Le grand Mystérioso - Emery rêve qu\'un article lui soit consacré dans le journal.',
- 'upload_date': '20150103',
+ 'title': 'md5:f392bc52245dc5ad43771650c96fb620',
+ 'description': 'md5:a02cdb217141fb2d469d6216339b052f',
+ 'upload_date': '20190611',
+ 'timestamp': 1560273989,
+ 'duration': 1738,
+ 'series': 'Quotidien avec Yann Barthès',
+ 'tags': ['intégrale', 'quotidien', 'Replay'],
},
'params': {
# Sometimes wat serves the whole file with the --test option
'skip_download': True,
+ 'format': 'bestvideo',
},
- 'skip': 'HTTP Error 410: Gone',
}, {
'url': 'http://www.tf1.fr/tf1/koh-lanta/videos/replay-koh-lanta-22-mai-2015.html',
'only_matching': True,
}, {
- 'url': 'http://lci.tf1.fr/sept-a-huit/videos/sept-a-huit-du-24-mai-2015-8611550.html',
- 'only_matching': True,
- }, {
'url': 'http://www.tf1.fr/hd1/documentaire/videos/mylene-farmer-d-une-icone.html',
'only_matching': True,
- }, {
- 'url': 'https://www.tf1.fr/tmc/quotidien-avec-yann-barthes/videos/quotidien-premiere-partie-11-juin-2019.html',
- 'info_dict': {
- 'id': '13641379',
- 'ext': 'mp4',
- 'title': 'md5:f392bc52245dc5ad43771650c96fb620',
- 'description': 'md5:44bc54f0a21322f5b91d68e76a544eae',
- 'upload_date': '20190611',
- },
- 'params': {
- # Sometimes wat serves the whole file with the --test option
- 'skip_download': True,
- },
}]
def _real_extract(self, url):
- video_id = self._match_id(url)
-
- webpage = self._download_webpage(url, video_id)
-
- wat_id = None
+ program_slug, slug = re.match(self._VALID_URL, url).groups()
+ video = self._download_json(
+ 'https://www.tf1.fr/graphql/web', slug, query={
+ 'id': '9b80783950b85247541dd1d851f9cc7fa36574af015621f853ab111a679ce26f',
+ 'variables': json.dumps({
+ 'programSlug': program_slug,
+ 'slug': slug,
+ })
+ })['data']['videoBySlug']
+ wat_id = video['streamId']
- data = self._parse_json(
- self._search_regex(
- r'__APOLLO_STATE__\s*=\s*({.+?})\s*(?:;|</script>)', webpage,
- 'data', default='{}'), video_id, fatal=False)
+ tags = []
+ for tag in (video.get('tags') or []):
+ label = tag.get('label')
+ if not label:
+ continue
+ tags.append(label)
- if data:
- try:
- wat_id = next(
- video.get('streamId')
- for key, video in data.items()
- if isinstance(video, dict)
- and video.get('slug') == video_id)
- if not isinstance(wat_id, compat_str) or not wat_id.isdigit():
- wat_id = None
- except StopIteration:
- pass
+ decoration = video.get('decoration') or {}
- if not wat_id:
- wat_id = self._html_search_regex(
- (r'(["\'])(?:https?:)?//www\.wat\.tv/embedframe/.*?(?P<id>\d{8})\1',
- r'(["\']?)streamId\1\s*:\s*(["\']?)(?P<id>\d+)\2'),
- webpage, 'wat id', group='id')
+ thumbnails = []
+ for source in (try_get(decoration, lambda x: x['image']['sources'], list) or []):
+ source_url = source.get('url')
+ if not source_url:
+ continue
+ thumbnails.append({
+ 'url': source_url,
+ 'width': int_or_none(source.get('width')),
+ })
- return self.url_result('wat:%s' % wat_id, 'Wat')
+ return {
+ '_type': 'url_transparent',
+ 'id': wat_id,
+ 'url': 'wat:' + wat_id,
+ 'title': video.get('title'),
+ 'thumbnails': thumbnails,
+ 'description': decoration.get('description'),
+ 'timestamp': parse_iso8601(video.get('date')),
+ 'duration': int_or_none(try_get(video, lambda x: x['publicPlayingInfos']['duration'])),
+ 'tags': tags,
+ 'series': decoration.get('programLabel'),
+ 'season_number': int_or_none(video.get('season')),
+ 'episode_number': int_or_none(video.get('episode')),
+ }
diff --git a/youtube_dl/extractor/threeqsdn.py b/youtube_dl/extractor/threeqsdn.py
index f26937da1..f6d37bb9e 100644
--- a/youtube_dl/extractor/threeqsdn.py
+++ b/youtube_dl/extractor/threeqsdn.py
@@ -3,10 +3,13 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..compat import compat_HTTPError
from ..utils import (
determine_ext,
- js_to_json,
- mimetype2ext,
+ ExtractorError,
+ float_or_none,
+ int_or_none,
+ parse_iso8601,
)
@@ -15,29 +18,35 @@ class ThreeQSDNIE(InfoExtractor):
IE_DESC = '3Q SDN'
_VALID_URL = r'https?://playout\.3qsdn\.com/(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'
_TESTS = [{
- # ondemand from http://www.philharmonie.tv/veranstaltung/26/
- 'url': 'http://playout.3qsdn.com/0280d6b9-1215-11e6-b427-0cc47a188158?protocol=http',
- 'md5': 'ab040e37bcfa2e0c079f92cb1dd7f6cd',
+ # https://player.3qsdn.com/demo.html
+ 'url': 'https://playout.3qsdn.com/7201c779-6b3c-11e7-a40e-002590c750be',
+ 'md5': '64a57396b16fa011b15e0ea60edce918',
'info_dict': {
- 'id': '0280d6b9-1215-11e6-b427-0cc47a188158',
+ 'id': '7201c779-6b3c-11e7-a40e-002590c750be',
'ext': 'mp4',
- 'title': '0280d6b9-1215-11e6-b427-0cc47a188158',
+ 'title': 'Video Ads',
'is_live': False,
+ 'description': 'Video Ads Demo',
+ 'timestamp': 1500334803,
+ 'upload_date': '20170717',
+ 'duration': 888.032,
+ 'subtitles': {
+ 'eng': 'count:1',
+ },
},
- 'expected_warnings': ['Failed to download MPD manifest', 'Failed to parse JSON'],
+ 'expected_warnings': ['Unknown MIME type application/mp4 in DASH manifest'],
}, {
# live video stream
- 'url': 'https://playout.3qsdn.com/d755d94b-4ab9-11e3-9162-0025907ad44f?js=true',
+ 'url': 'https://playout.3qsdn.com/66e68995-11ca-11e8-9273-002590c750be',
'info_dict': {
- 'id': 'd755d94b-4ab9-11e3-9162-0025907ad44f',
+ 'id': '66e68995-11ca-11e8-9273-002590c750be',
'ext': 'mp4',
- 'title': 're:^d755d94b-4ab9-11e3-9162-0025907ad44f [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+ 'title': 're:^66e68995-11ca-11e8-9273-002590c750be [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
'is_live': True,
},
'params': {
'skip_download': True, # m3u8 downloads
},
- 'expected_warnings': ['Failed to download MPD manifest'],
}, {
# live audio stream
'url': 'http://playout.3qsdn.com/9edf36e0-6bf2-11e2-a16a-9acf09e2db48',
@@ -58,6 +67,14 @@ class ThreeQSDNIE(InfoExtractor):
# live video with rtmp link
'url': 'https://playout.3qsdn.com/6092bb9e-8f72-11e4-a173-002590c750be',
'only_matching': True,
+ }, {
+ # ondemand from http://www.philharmonie.tv/veranstaltung/26/
+ 'url': 'http://playout.3qsdn.com/0280d6b9-1215-11e6-b427-0cc47a188158?protocol=http',
+ 'only_matching': True,
+ }, {
+ # live video stream
+ 'url': 'https://playout.3qsdn.com/d755d94b-4ab9-11e3-9162-0025907ad44f?js=true',
+ 'only_matching': True,
}]
@staticmethod
@@ -70,73 +87,78 @@ class ThreeQSDNIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
- js = self._download_webpage(
- 'http://playout.3qsdn.com/%s' % video_id, video_id,
- query={'js': 'true'})
-
- if any(p in js for p in (
- '>This content is not available in your country',
- 'playout.3qsdn.com/forbidden')):
- self.raise_geo_restricted()
-
- stream_content = self._search_regex(
- r'streamContent\s*:\s*(["\'])(?P<content>.+?)\1', js,
- 'stream content', default='demand', group='content')
+ try:
+ config = self._download_json(
+ url.replace('://playout.3qsdn.com/', '://playout.3qsdn.com/config/'), video_id)
+ except ExtractorError as e:
+ if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
+ self.raise_geo_restricted()
+ raise
- live = stream_content == 'live'
-
- stream_type = self._search_regex(
- r'streamType\s*:\s*(["\'])(?P<type>audio|video)\1', js,
- 'stream type', default='video', group='type')
+ live = config.get('streamContent') == 'live'
+ aspect = float_or_none(config.get('aspect'))
formats = []
- urls = set()
-
- def extract_formats(item_url, item={}):
- if not item_url or item_url in urls:
- return
- urls.add(item_url)
- ext = mimetype2ext(item.get('type')) or determine_ext(item_url, default_ext=None)
- if ext == 'mpd':
+ for source_type, source in (config.get('sources') or {}).items():
+ if not source:
+ continue
+ if source_type == 'dash':
formats.extend(self._extract_mpd_formats(
- item_url, video_id, mpd_id='mpd', fatal=False))
- elif ext == 'm3u8':
+ source, video_id, mpd_id='mpd', fatal=False))
+ elif source_type == 'hls':
formats.extend(self._extract_m3u8_formats(
- item_url, video_id, 'mp4',
- entry_protocol='m3u8' if live else 'm3u8_native',
+ source, video_id, 'mp4', 'm3u8' if live else 'm3u8_native',
m3u8_id='hls', fatal=False))
- elif ext == 'f4m':
- formats.extend(self._extract_f4m_formats(
- item_url, video_id, f4m_id='hds', fatal=False))
- else:
- if not self._is_valid_url(item_url, video_id):
- return
- formats.append({
- 'url': item_url,
- 'format_id': item.get('quality'),
- 'ext': 'mp4' if item_url.startswith('rtsp') else ext,
- 'vcodec': 'none' if stream_type == 'audio' else None,
- })
-
- for item_js in re.findall(r'({[^{]*?\b(?:src|source)\s*:\s*["\'].+?})', js):
- f = self._parse_json(
- item_js, video_id, transform_source=js_to_json, fatal=False)
- if not f:
+ elif source_type == 'progressive':
+ for s in source:
+ src = s.get('src')
+ if not (src and self._is_valid_url(src, video_id)):
+ continue
+ width = None
+ format_id = ['http']
+ ext = determine_ext(src)
+ if ext:
+ format_id.append(ext)
+ height = int_or_none(s.get('height'))
+ if height:
+ format_id.append('%dp' % height)
+ if aspect:
+ width = int(height * aspect)
+ formats.append({
+ 'ext': ext,
+ 'format_id': '-'.join(format_id),
+ 'height': height,
+ 'source_preference': 0,
+ 'url': src,
+ 'vcodec': 'none' if height == 0 else None,
+ 'width': width,
+ })
+ for f in formats:
+ if f.get('acodec') == 'none':
+ f['preference'] = -40
+ elif f.get('vcodec') == 'none':
+ f['preference'] = -50
+ self._sort_formats(formats, ('preference', 'width', 'height', 'source_preference', 'tbr', 'vbr', 'abr', 'ext', 'format_id'))
+
+ subtitles = {}
+ for subtitle in (config.get('subtitles') or []):
+ src = subtitle.get('src')
+ if not src:
continue
- extract_formats(f.get('src'), f)
-
- # More relaxed version to collect additional URLs and acting
- # as a future-proof fallback
- for _, src in re.findall(r'\b(?:src|source)\s*:\s*(["\'])((?:https?|rtsp)://.+?)\1', js):
- extract_formats(src)
-
- self._sort_formats(formats)
+ subtitles.setdefault(subtitle.get('label') or 'eng', []).append({
+ 'url': src,
+ })
- title = self._live_title(video_id) if live else video_id
+ title = config.get('title') or video_id
return {
'id': video_id,
- 'title': title,
+ 'title': self._live_title(title) if live else title,
+ 'thumbnail': config.get('poster') or None,
+ 'description': config.get('description') or None,
+ 'timestamp': parse_iso8601(config.get('upload_date')),
+ 'duration': float_or_none(config.get('vlength')) or None,
'is_live': live,
'formats': formats,
+ 'subtitles': subtitles,
}
diff --git a/youtube_dl/extractor/tiktok.py b/youtube_dl/extractor/tiktok.py
index ea1beb8af..4faa6de54 100644
--- a/youtube_dl/extractor/tiktok.py
+++ b/youtube_dl/extractor/tiktok.py
@@ -107,9 +107,12 @@ class TikTokIE(TikTokBaseIE):
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- data = self._parse_json(self._search_regex(
+ page_props = self._parse_json(self._search_regex(
r'<script[^>]+\bid=["\']__NEXT_DATA__[^>]+>\s*({.+?})\s*</script',
- webpage, 'data'), video_id)['props']['pageProps']['itemInfo']['itemStruct']
+ webpage, 'data'), video_id)['props']['pageProps']
+ data = try_get(page_props, lambda x: x['itemInfo']['itemStruct'], dict)
+ if not data and page_props.get('statusCode') == 10216:
+ raise ExtractorError('This video is private', expected=True)
return self._extract_video(data, video_id)
diff --git a/youtube_dl/extractor/tmz.py b/youtube_dl/extractor/tmz.py
index 419f9d92e..3d1bf75ff 100644
--- a/youtube_dl/extractor/tmz.py
+++ b/youtube_dl/extractor/tmz.py
@@ -2,55 +2,110 @@
from __future__ import unicode_literals
from .common import InfoExtractor
+from .jwplatform import JWPlatformIE
+from .kaltura import KalturaIE
+from ..utils import (
+ int_or_none,
+ unified_timestamp,
+)
class TMZIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?tmz\.com/videos/(?P<id>[^/?#]+)'
+ _VALID_URL = r'https?://(?:www\.)?tmz\.com/videos/(?P<id>[^/?#&]+)'
_TESTS = [{
- 'url': 'http://www.tmz.com/videos/0_okj015ty/',
- 'md5': '4d22a51ef205b6c06395d8394f72d560',
+ 'url': 'http://www.tmz.com/videos/0-cegprt2p/',
+ 'md5': '31f9223e20eef55954973359afa61a20',
'info_dict': {
- 'id': '0_okj015ty',
+ 'id': 'P6YjLBLk',
'ext': 'mp4',
- 'title': 'Kim Kardashian\'s Boobs Unlock a Mystery!',
- 'description': 'Did Kim Kardasain try to one-up Khloe by one-upping Kylie??? Or is she just showing off her amazing boobs?',
- 'timestamp': 1394747163,
- 'uploader_id': 'batchUser',
- 'upload_date': '20140313',
- }
+ 'title': "No Charges Against Hillary Clinton? Harvey Says It Ain't Over Yet",
+ 'description': 'md5:b714359fc18607715ebccbd2da8ff488',
+ 'timestamp': 1467831837,
+ 'upload_date': '20160706',
+ },
+ 'add_ie': [JWPlatformIE.ie_key()],
}, {
- 'url': 'http://www.tmz.com/videos/0-cegprt2p/',
+ 'url': 'http://www.tmz.com/videos/0_okj015ty/',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.tmz.com/videos/071119-chris-morgan-women-4590005-0-zcsejvcr/',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.tmz.com/videos/2021-02-19-021921-floyd-mayweather-1043872/',
'only_matching': True,
}]
def _real_extract(self, url):
video_id = self._match_id(url).replace('-', '_')
- return self.url_result('kaltura:591531:%s' % video_id, 'Kaltura', video_id)
+
+ webpage = self._download_webpage(url, video_id, fatal=False)
+ if webpage:
+ tmz_video_id = self._search_regex(
+ r'nodeRef\s*:\s*["\']tmz:video:([\da-fA-F]{8}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{12})',
+ webpage, 'video id', default=None)
+ video = self._download_json(
+ 'https://www.tmz.com/_/video/%s' % tmz_video_id, video_id,
+ fatal=False)
+ if video:
+ message = video['message']
+ info = {
+ '_type': 'url_transparent',
+ 'title': message.get('title'),
+ 'description': message.get('description'),
+ 'timestamp': unified_timestamp(message.get('published_at')),
+ 'duration': int_or_none(message.get('duration')),
+ }
+ jwplatform_id = message.get('jwplayer_media_id')
+ if jwplatform_id:
+ info.update({
+ 'url': 'jwplatform:%s' % jwplatform_id,
+ 'ie_key': JWPlatformIE.ie_key(),
+ })
+ else:
+ kaltura_entry_id = message.get('kaltura_entry_id') or video_id
+ kaltura_partner_id = message.get('kaltura_partner_id') or '591531'
+ info.update({
+ 'url': 'kaltura:%s:%s' % (kaltura_partner_id, kaltura_entry_id),
+ 'ie_key': KalturaIE.ie_key(),
+ })
+ return info
+
+ return self.url_result(
+ 'kaltura:591531:%s' % video_id, KalturaIE.ie_key(), video_id)
class TMZArticleIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?tmz\.com/\d{4}/\d{2}/\d{2}/(?P<id>[^/]+)/?'
+ _VALID_URL = r'https?://(?:www\.)?tmz\.com/\d{4}/\d{2}/\d{2}/(?P<id>[^/?#&]+)'
_TEST = {
'url': 'http://www.tmz.com/2015/04/19/bobby-brown-bobbi-kristina-awake-video-concert',
- 'md5': '3316ff838ae5bb7f642537825e1e90d2',
'info_dict': {
- 'id': '0_6snoelag',
- 'ext': 'mov',
+ 'id': 'PAKZa97W',
+ 'ext': 'mp4',
'title': 'Bobby Brown Tells Crowd ... Bobbi Kristina is Awake',
'description': 'Bobby Brown stunned his audience during a concert Saturday night, when he told the crowd, "Bobbi is awake. She\'s watching me."',
- 'timestamp': 1429467813,
+ 'timestamp': 1429466400,
'upload_date': '20150419',
- 'uploader_id': 'batchUser',
- }
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ 'add_ie': [JWPlatformIE.ie_key()],
}
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
+
+ tmz_url = self._search_regex(
+ r'clickLink\s*\(\s*["\'](?P<url>%s)' % TMZIE._VALID_URL, webpage,
+ 'video id', default=None, group='url')
+ if tmz_url:
+ return self.url_result(tmz_url, ie=TMZIE.ie_key())
+
embedded_video_info = self._parse_json(self._html_search_regex(
r'tmzVideoEmbed\(({.+?})\);', webpage, 'embedded video info'),
video_id)
-
return self.url_result(
- 'http://www.tmz.com/videos/%s/' % embedded_video_info['id'])
+ 'http://www.tmz.com/videos/%s/' % embedded_video_info['id'],
+ ie=TMZIE.ie_key())
diff --git a/youtube_dl/extractor/trovo.py b/youtube_dl/extractor/trovo.py
new file mode 100644
index 000000000..de0107aa9
--- /dev/null
+++ b/youtube_dl/extractor/trovo.py
@@ -0,0 +1,194 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ int_or_none,
+ str_or_none,
+ try_get,
+)
+
+
+class TrovoBaseIE(InfoExtractor):
+ _VALID_URL_BASE = r'https?://(?:www\.)?trovo\.live/'
+
+ def _extract_streamer_info(self, data):
+ streamer_info = data.get('streamerInfo') or {}
+ username = streamer_info.get('userName')
+ return {
+ 'uploader': streamer_info.get('nickName'),
+ 'uploader_id': str_or_none(streamer_info.get('uid')),
+ 'uploader_url': 'https://trovo.live/' + username if username else None,
+ }
+
+
+class TrovoIE(TrovoBaseIE):
+ _VALID_URL = TrovoBaseIE._VALID_URL_BASE + r'(?!(?:clip|video)/)(?P<id>[^/?&#]+)'
+
+ def _real_extract(self, url):
+ username = self._match_id(url)
+ live_info = self._download_json(
+ 'https://gql.trovo.live/', username, query={
+ 'query': '''{
+ getLiveInfo(params: {userName: "%s"}) {
+ isLive
+ programInfo {
+ coverUrl
+ id
+ streamInfo {
+ desc
+ playUrl
+ }
+ title
+ }
+ streamerInfo {
+ nickName
+ uid
+ userName
+ }
+ }
+}''' % username,
+ })['data']['getLiveInfo']
+ if live_info.get('isLive') == 0:
+ raise ExtractorError('%s is offline' % username, expected=True)
+ program_info = live_info['programInfo']
+ program_id = program_info['id']
+ title = self._live_title(program_info['title'])
+
+ formats = []
+ for stream_info in (program_info.get('streamInfo') or []):
+ play_url = stream_info.get('playUrl')
+ if not play_url:
+ continue
+ format_id = stream_info.get('desc')
+ formats.append({
+ 'format_id': format_id,
+ 'height': int_or_none(format_id[:-1]) if format_id else None,
+ 'url': play_url,
+ })
+ self._sort_formats(formats)
+
+ info = {
+ 'id': program_id,
+ 'title': title,
+ 'formats': formats,
+ 'thumbnail': program_info.get('coverUrl'),
+ 'is_live': True,
+ }
+ info.update(self._extract_streamer_info(live_info))
+ return info
+
+
+class TrovoVodIE(TrovoBaseIE):
+ _VALID_URL = TrovoBaseIE._VALID_URL_BASE + r'(?:clip|video)/(?P<id>[^/?&#]+)'
+ _TESTS = [{
+ 'url': 'https://trovo.live/video/ltv-100095501_100095501_1609596043',
+ 'info_dict': {
+ 'id': 'ltv-100095501_100095501_1609596043',
+ 'ext': 'mp4',
+ 'title': 'Spontaner 12 Stunden Stream! - Ok Boomer!',
+ 'uploader': 'Exsl',
+ 'timestamp': 1609640305,
+ 'upload_date': '20210103',
+ 'uploader_id': '100095501',
+ 'duration': 43977,
+ 'view_count': int,
+ 'like_count': int,
+ 'comment_count': int,
+ 'comments': 'mincount:8',
+ 'categories': ['Grand Theft Auto V'],
+ },
+ }, {
+ 'url': 'https://trovo.live/clip/lc-5285890810184026005',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ vid = self._match_id(url)
+ resp = self._download_json(
+ 'https://gql.trovo.live/', vid, data=json.dumps([{
+ 'query': '''{
+ batchGetVodDetailInfo(params: {vids: ["%s"]}) {
+ VodDetailInfos
+ }
+}''' % vid,
+ }, {
+ 'query': '''{
+ getCommentList(params: {appInfo: {postID: "%s"}, pageSize: 1000000000, preview: {}}) {
+ commentList {
+ author {
+ nickName
+ uid
+ }
+ commentID
+ content
+ createdAt
+ parentID
+ }
+ }
+}''' % vid,
+ }]).encode(), headers={
+ 'Content-Type': 'application/json',
+ })
+ vod_detail_info = resp[0]['data']['batchGetVodDetailInfo']['VodDetailInfos'][vid]
+ vod_info = vod_detail_info['vodInfo']
+ title = vod_info['title']
+
+ language = vod_info.get('languageName')
+ formats = []
+ for play_info in (vod_info.get('playInfos') or []):
+ play_url = play_info.get('playUrl')
+ if not play_url:
+ continue
+ format_id = play_info.get('desc')
+ formats.append({
+ 'ext': 'mp4',
+ 'filesize': int_or_none(play_info.get('fileSize')),
+ 'format_id': format_id,
+ 'height': int_or_none(format_id[:-1]) if format_id else None,
+ 'language': language,
+ 'protocol': 'm3u8_native',
+ 'tbr': int_or_none(play_info.get('bitrate')),
+ 'url': play_url,
+ 'http_headers': {'Origin': 'https://trovo.live'},
+ })
+ self._sort_formats(formats)
+
+ category = vod_info.get('categoryName')
+ get_count = lambda x: int_or_none(vod_info.get(x + 'Num'))
+
+ comment_list = try_get(resp, lambda x: x[1]['data']['getCommentList']['commentList'], list) or []
+ comments = []
+ for comment in comment_list:
+ content = comment.get('content')
+ if not content:
+ continue
+ author = comment.get('author') or {}
+ parent = comment.get('parentID')
+ comments.append({
+ 'author': author.get('nickName'),
+ 'author_id': str_or_none(author.get('uid')),
+ 'id': str_or_none(comment.get('commentID')),
+ 'text': content,
+ 'timestamp': int_or_none(comment.get('createdAt')),
+ 'parent': 'root' if parent == 0 else str_or_none(parent),
+ })
+
+ info = {
+ 'id': vid,
+ 'title': title,
+ 'formats': formats,
+ 'thumbnail': vod_info.get('coverUrl'),
+ 'timestamp': int_or_none(vod_info.get('publishTs')),
+ 'duration': int_or_none(vod_info.get('duration')),
+ 'view_count': get_count('watch'),
+ 'like_count': get_count('like'),
+ 'comment_count': get_count('comment'),
+ 'comments': comments,
+ 'categories': [category] if category else None,
+ }
+ info.update(self._extract_streamer_info(vod_detail_info))
+ return info
diff --git a/youtube_dl/extractor/tv2.py b/youtube_dl/extractor/tv2.py
index 4a19b9be6..334b7d540 100644
--- a/youtube_dl/extractor/tv2.py
+++ b/youtube_dl/extractor/tv2.py
@@ -20,7 +20,7 @@ from ..utils import (
class TV2IE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?tv2\.no/v/(?P<id>\d+)'
- _TEST = {
+ _TESTS = [{
'url': 'http://www.tv2.no/v/916509/',
'info_dict': {
'id': '916509',
@@ -33,7 +33,7 @@ class TV2IE(InfoExtractor):
'view_count': int,
'categories': list,
},
- }
+ }]
_API_DOMAIN = 'sumo.tv2.no'
_PROTOCOLS = ('HDS', 'HLS', 'DASH')
_GEO_COUNTRIES = ['NO']
@@ -42,6 +42,12 @@ class TV2IE(InfoExtractor):
video_id = self._match_id(url)
api_base = 'http://%s/api/web/asset/%s' % (self._API_DOMAIN, video_id)
+ asset = self._download_json(
+ api_base + '.json', video_id,
+ 'Downloading metadata JSON')['asset']
+ title = asset.get('subtitle') or asset['title']
+ is_live = asset.get('live') is True
+
formats = []
format_urls = []
for protocol in self._PROTOCOLS:
@@ -81,7 +87,8 @@ class TV2IE(InfoExtractor):
elif ext == 'm3u8':
if not data.get('drmProtected'):
formats.extend(self._extract_m3u8_formats(
- video_url, video_id, 'mp4', entry_protocol='m3u8_native',
+ video_url, video_id, 'mp4',
+ 'm3u8' if is_live else 'm3u8_native',
m3u8_id=format_id, fatal=False))
elif ext == 'mpd':
formats.extend(self._extract_mpd_formats(
@@ -99,11 +106,6 @@ class TV2IE(InfoExtractor):
raise ExtractorError('This video is DRM protected.', expected=True)
self._sort_formats(formats)
- asset = self._download_json(
- api_base + '.json', video_id,
- 'Downloading metadata JSON')['asset']
- title = asset['title']
-
thumbnails = [{
'id': thumbnail.get('@type'),
'url': thumbnail.get('url'),
@@ -112,7 +114,7 @@ class TV2IE(InfoExtractor):
return {
'id': video_id,
'url': video_url,
- 'title': title,
+ 'title': self._live_title(title) if is_live else title,
'description': strip_or_none(asset.get('description')),
'thumbnails': thumbnails,
'timestamp': parse_iso8601(asset.get('createTime')),
@@ -120,6 +122,7 @@ class TV2IE(InfoExtractor):
'view_count': int_or_none(asset.get('views')),
'categories': asset.get('keywords', '').split(','),
'formats': formats,
+ 'is_live': is_live,
}
@@ -168,13 +171,13 @@ class TV2ArticleIE(InfoExtractor):
class KatsomoIE(TV2IE):
- _VALID_URL = r'https?://(?:www\.)?(?:katsomo|mtv)\.fi/(?:#!/)?(?:[^/]+/[0-9a-z-]+-\d+/[0-9a-z-]+-|[^/]+/\d+/[^/]+/)(?P<id>\d+)'
- _TEST = {
+ _VALID_URL = r'https?://(?:www\.)?(?:katsomo|mtv(uutiset)?)\.fi/(?:sarja/[0-9a-z-]+-\d+/[0-9a-z-]+-|(?:#!/)?jakso/(?:\d+/[^/]+/)?|video/prog)(?P<id>\d+)'
+ _TESTS = [{
'url': 'https://www.mtv.fi/sarja/mtv-uutiset-live-33001002003/lahden-pelicans-teki-kovan-ratkaisun-ville-nieminen-pihalle-1181321',
'info_dict': {
'id': '1181321',
'ext': 'mp4',
- 'title': 'MTV Uutiset Live',
+ 'title': 'Lahden Pelicans teki kovan ratkaisun – Ville Nieminen pihalle',
'description': 'Päätöksen teki Pelicansin hallitus.',
'timestamp': 1575116484,
'upload_date': '20191130',
@@ -186,7 +189,60 @@ class KatsomoIE(TV2IE):
# m3u8 download
'skip_download': True,
},
- }
+ }, {
+ 'url': 'http://www.katsomo.fi/#!/jakso/33001005/studio55-fi/658521/jukka-kuoppamaki-tekee-yha-lauluja-vaikka-lentokoneessa',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.mtvuutiset.fi/video/prog1311159',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.katsomo.fi/#!/jakso/1311159',
+ 'only_matching': True,
+ }]
_API_DOMAIN = 'api.katsomo.fi'
_PROTOCOLS = ('HLS', 'MPD')
_GEO_COUNTRIES = ['FI']
+
+
+class MTVUutisetArticleIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)mtvuutiset\.fi/artikkeli/[^/]+/(?P<id>\d+)'
+ _TESTS = [{
+ 'url': 'https://www.mtvuutiset.fi/artikkeli/tallaisia-vaurioita-viking-amorellassa-on-useamman-osaston-alla-vetta/7931384',
+ 'info_dict': {
+ 'id': '1311159',
+ 'ext': 'mp4',
+ 'title': 'Viking Amorellan matkustajien evakuointi on alkanut – tältä operaatio näyttää laivalla',
+ 'description': 'Viking Amorellan matkustajien evakuointi on alkanut – tältä operaatio näyttää laivalla',
+ 'timestamp': 1600608966,
+ 'upload_date': '20200920',
+ 'duration': 153.7886666,
+ 'view_count': int,
+ 'categories': list,
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }, {
+ # multiple Youtube embeds
+ 'url': 'https://www.mtvuutiset.fi/artikkeli/50-vuotta-subarun-vastaiskua/6070962',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ article_id = self._match_id(url)
+ article = self._download_json(
+ 'http://api.mtvuutiset.fi/mtvuutiset/api/json/' + article_id,
+ article_id)
+
+ def entries():
+ for video in (article.get('videos') or []):
+ video_type = video.get('videotype')
+ video_url = video.get('url')
+ if not (video_url and video_type in ('katsomo', 'youtube')):
+ continue
+ yield self.url_result(
+ video_url, video_type.capitalize(), video.get('video_id'))
+
+ return self.playlist_result(
+ entries(), article_id, article.get('title'), article.get('description'))
diff --git a/youtube_dl/extractor/tv2dk.py b/youtube_dl/extractor/tv2dk.py
index 8bda9348d..8bd5fd640 100644
--- a/youtube_dl/extractor/tv2dk.py
+++ b/youtube_dl/extractor/tv2dk.py
@@ -74,6 +74,12 @@ class TV2DKIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
entries = []
+
+ def add_entry(partner_id, kaltura_id):
+ entries.append(self.url_result(
+ 'kaltura:%s:%s' % (partner_id, kaltura_id), 'Kaltura',
+ video_id=kaltura_id))
+
for video_el in re.findall(r'(?s)<[^>]+\bdata-entryid\s*=[^>]*>', webpage):
video = extract_attributes(video_el)
kaltura_id = video.get('data-entryid')
@@ -82,9 +88,14 @@ class TV2DKIE(InfoExtractor):
partner_id = video.get('data-partnerid')
if not partner_id:
continue
- entries.append(self.url_result(
- 'kaltura:%s:%s' % (partner_id, kaltura_id), 'Kaltura',
- video_id=kaltura_id))
+ add_entry(partner_id, kaltura_id)
+ if not entries:
+ kaltura_id = self._search_regex(
+ r'entry_id\s*:\s*["\']([0-9a-z_]+)', webpage, 'kaltura id')
+ partner_id = self._search_regex(
+ (r'\\u002Fp\\u002F(\d+)\\u002F', r'/p/(\d+)/'), webpage,
+ 'partner id')
+ add_entry(partner_id, kaltura_id)
return self.playlist_result(entries)
diff --git a/youtube_dl/extractor/tv4.py b/youtube_dl/extractor/tv4.py
index c498b0191..b73bab9a8 100644
--- a/youtube_dl/extractor/tv4.py
+++ b/youtube_dl/extractor/tv4.py
@@ -17,7 +17,7 @@ class TV4IE(InfoExtractor):
tv4\.se/(?:[^/]+)/klipp/(?:.*)-|
tv4play\.se/
(?:
- (?:program|barn)/(?:[^/]+/|(?:[^\?]+)\?video_id=)|
+ (?:program|barn)/(?:(?:[^/]+/){1,2}|(?:[^\?]+)\?video_id=)|
iframe/video/|
film/|
sport/|
@@ -65,6 +65,10 @@ class TV4IE(InfoExtractor):
{
'url': 'http://www.tv4play.se/program/farang/3922081',
'only_matching': True,
+ },
+ {
+ 'url': 'https://www.tv4play.se/program/nyheterna/avsnitt/13315940',
+ 'only_matching': True,
}
]
diff --git a/youtube_dl/extractor/tver.py b/youtube_dl/extractor/tver.py
index 931d4d650..a4a30b1e6 100644
--- a/youtube_dl/extractor/tver.py
+++ b/youtube_dl/extractor/tver.py
@@ -25,6 +25,10 @@ class TVerIE(InfoExtractor):
}, {
'url': 'https://tver.jp/episode/79622438',
'only_matching': True,
+ }, {
+ # subtitle = ' '
+ 'url': 'https://tver.jp/corner/f0068870',
+ 'only_matching': True,
}]
_TOKEN = None
BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s'
@@ -40,28 +44,18 @@ class TVerIE(InfoExtractor):
query={'token': self._TOKEN})['main']
p_id = main['publisher_id']
service = remove_start(main['service'], 'ts_')
- info = {
+
+ r_id = main['reference_id']
+ if service not in ('tx', 'russia2018', 'sebare2018live', 'gorin'):
+ r_id = 'ref:' + r_id
+ bc_url = smuggle_url(
+ self.BRIGHTCOVE_URL_TEMPLATE % (p_id, r_id),
+ {'geo_countries': ['JP']})
+
+ return {
'_type': 'url_transparent',
'description': try_get(main, lambda x: x['note'][0]['text'], compat_str),
'episode_number': int_or_none(try_get(main, lambda x: x['ext']['episode_number'])),
+ 'url': bc_url,
+ 'ie_key': 'BrightcoveNew',
}
-
- if service == 'cx':
- info.update({
- 'title': main.get('subtitle') or main['title'],
- 'url': 'https://i.fod.fujitv.co.jp/plus7/web/%s/%s.html' % (p_id[:4], p_id),
- 'ie_key': 'FujiTVFODPlus7',
- })
- else:
- r_id = main['reference_id']
- if service not in ('tx', 'russia2018', 'sebare2018live', 'gorin'):
- r_id = 'ref:' + r_id
- bc_url = smuggle_url(
- self.BRIGHTCOVE_URL_TEMPLATE % (p_id, r_id),
- {'geo_countries': ['JP']})
- info.update({
- 'url': bc_url,
- 'ie_key': 'BrightcoveNew',
- })
-
- return info
diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py
index db264e8a1..a378bd6dc 100644
--- a/youtube_dl/extractor/twitch.py
+++ b/youtube_dl/extractor/twitch.py
@@ -17,6 +17,7 @@ from ..compat import (
)
from ..utils import (
clean_html,
+ dict_get,
ExtractorError,
float_or_none,
int_or_none,
@@ -48,6 +49,7 @@ class TwitchBaseIE(InfoExtractor):
'ChannelCollectionsContent': '07e3691a1bad77a36aba590c351180439a40baefc1c275356f40fc7082419a84',
'StreamMetadata': '1c719a40e481453e5c48d9bb585d971b8b372f8ebb105b17076722264dfa5b3e',
'ComscoreStreamingQuery': 'e1edae8122517d013405f237ffcc124515dc6ded82480a88daef69c83b53ac01',
+ 'VideoAccessToken_Clip': '36b89d2507fce29e5ca551df756d27c1cfe079e2609642b4390aa4c35796eb11',
'VideoPreviewOverlay': '3006e77e51b128d838fa4e835723ca4dc9a05c5efd4466c1085215c6e437e65c',
'VideoMetadata': '226edb3e692509f727fd56821f5653c05740242c82b0388883e0c0e75dcbf687',
}
@@ -76,14 +78,14 @@ class TwitchBaseIE(InfoExtractor):
headers = {
'Referer': page_url,
- 'Origin': page_url,
+ 'Origin': 'https://www.twitch.tv',
'Content-Type': 'text/plain;charset=UTF-8',
}
response = self._download_json(
post_url, None, note, data=json.dumps(form).encode(),
headers=headers, expected_status=400)
- error = response.get('error_description') or response.get('error_code')
+ error = dict_get(response, ('error', 'error_description', 'error_code'))
if error:
fail(error)
@@ -137,13 +139,17 @@ class TwitchBaseIE(InfoExtractor):
self._sort_formats(formats)
def _download_base_gql(self, video_id, ops, note, fatal=True):
+ headers = {
+ 'Content-Type': 'text/plain;charset=UTF-8',
+ 'Client-ID': self._CLIENT_ID,
+ }
+ gql_auth = self._get_cookies('https://gql.twitch.tv').get('auth-token')
+ if gql_auth:
+ headers['Authorization'] = 'OAuth ' + gql_auth.value
return self._download_json(
'https://gql.twitch.tv/gql', video_id, note,
data=json.dumps(ops).encode(),
- headers={
- 'Content-Type': 'text/plain;charset=UTF-8',
- 'Client-ID': self._CLIENT_ID,
- }, fatal=fatal)
+ headers=headers, fatal=fatal)
def _download_gql(self, video_id, ops, note, fatal=True):
for op in ops:
@@ -888,7 +894,25 @@ class TwitchClipsIE(TwitchBaseIE):
def _real_extract(self, url):
video_id = self._match_id(url)
- clip = self._download_base_gql(
+ clip = self._download_gql(
+ video_id, [{
+ 'operationName': 'VideoAccessToken_Clip',
+ 'variables': {
+ 'slug': video_id,
+ },
+ }],
+ 'Downloading clip access token GraphQL')[0]['data']['clip']
+
+ if not clip:
+ raise ExtractorError(
+ 'This clip is no longer available', expected=True)
+
+ access_query = {
+ 'sig': clip['playbackAccessToken']['signature'],
+ 'token': clip['playbackAccessToken']['value'],
+ }
+
+ data = self._download_base_gql(
video_id, {
'query': '''{
clip(slug: "%s") {
@@ -913,11 +937,10 @@ class TwitchClipsIE(TwitchBaseIE):
}
viewCount
}
-}''' % video_id}, 'Downloading clip GraphQL')['data']['clip']
+}''' % video_id}, 'Downloading clip GraphQL', fatal=False)
- if not clip:
- raise ExtractorError(
- 'This clip is no longer available', expected=True)
+ if data:
+ clip = try_get(data, lambda x: x['data']['clip'], dict) or clip
formats = []
for option in clip.get('videoQualities', []):
@@ -927,7 +950,7 @@ class TwitchClipsIE(TwitchBaseIE):
if not source:
continue
formats.append({
- 'url': source,
+ 'url': update_url_query(source, access_query),
'format_id': option.get('quality'),
'height': int_or_none(option.get('quality')),
'fps': int_or_none(option.get('frameRate')),
diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py
index 1190d721e..cfa7a7326 100644
--- a/youtube_dl/extractor/twitter.py
+++ b/youtube_dl/extractor/twitter.py
@@ -19,6 +19,7 @@ from ..utils import (
strip_or_none,
unified_timestamp,
update_url_query,
+ url_or_none,
xpath_text,
)
@@ -52,6 +53,9 @@ class TwitterBaseIE(InfoExtractor):
return [f]
def _extract_formats_from_vmap_url(self, vmap_url, video_id):
+ vmap_url = url_or_none(vmap_url)
+ if not vmap_url:
+ return []
vmap_data = self._download_xml(vmap_url, video_id)
formats = []
urls = []
@@ -374,6 +378,24 @@ class TwitterIE(TwitterBaseIE):
},
'add_ie': ['TwitterBroadcast'],
}, {
+ # unified card
+ 'url': 'https://twitter.com/BrooklynNets/status/1349794411333394432?s=20',
+ 'info_dict': {
+ 'id': '1349794411333394432',
+ 'ext': 'mp4',
+ 'title': 'md5:d1c4941658e4caaa6cb579260d85dcba',
+ 'thumbnail': r're:^https?://.*\.jpg',
+ 'description': 'md5:71ead15ec44cee55071547d6447c6a3e',
+ 'uploader': 'Brooklyn Nets',
+ 'uploader_id': 'BrooklynNets',
+ 'duration': 324.484,
+ 'timestamp': 1610651040,
+ 'upload_date': '20210114',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
# Twitch Clip Embed
'url': 'https://twitter.com/GunB1g/status/1163218564784017422',
'only_matching': True,
@@ -389,6 +411,22 @@ class TwitterIE(TwitterBaseIE):
# appplayer card
'url': 'https://twitter.com/poco_dandy/status/1150646424461176832',
'only_matching': True,
+ }, {
+ # video_direct_message card
+ 'url': 'https://twitter.com/qarev001/status/1348948114569269251',
+ 'only_matching': True,
+ }, {
+ # poll2choice_video card
+ 'url': 'https://twitter.com/CAF_Online/status/1349365911120195585',
+ 'only_matching': True,
+ }, {
+ # poll3choice_video card
+ 'url': 'https://twitter.com/SamsungMobileSA/status/1348609186725289984',
+ 'only_matching': True,
+ }, {
+ # poll4choice_video card
+ 'url': 'https://twitter.com/SouthamptonFC/status/1347577658079641604',
+ 'only_matching': True,
}]
def _real_extract(self, url):
@@ -433,8 +471,7 @@ class TwitterIE(TwitterBaseIE):
'tags': tags,
}
- media = try_get(status, lambda x: x['extended_entities']['media'][0])
- if media and media.get('type') != 'photo':
+ def extract_from_video_info(media):
video_info = media.get('video_info') or {}
formats = []
@@ -461,6 +498,10 @@ class TwitterIE(TwitterBaseIE):
'thumbnails': thumbnails,
'duration': float_or_none(video_info.get('duration_millis'), 1000),
})
+
+ media = try_get(status, lambda x: x['extended_entities']['media'][0])
+ if media and media.get('type') != 'photo':
+ extract_from_video_info(media)
else:
card = status.get('card')
if card:
@@ -493,7 +534,12 @@ class TwitterIE(TwitterBaseIE):
'_type': 'url',
'url': get_binding_value('card_url'),
})
- # amplify, promo_video_website, promo_video_convo, appplayer, ...
+ elif card_name == 'unified_card':
+ media_entities = self._parse_json(get_binding_value('unified_card'), twid)['media_entities']
+ extract_from_video_info(next(iter(media_entities.values())))
+ # amplify, promo_video_website, promo_video_convo, appplayer,
+ # video_direct_message, poll2choice_video, poll3choice_video,
+ # poll4choice_video, ...
else:
is_amplify = card_name == 'amplify'
vmap_url = get_binding_value('amplify_url_vmap') if is_amplify else get_binding_value('player_stream_url')
diff --git a/youtube_dl/extractor/umg.py b/youtube_dl/extractor/umg.py
index d815cd9a6..47948b6ce 100644
--- a/youtube_dl/extractor/umg.py
+++ b/youtube_dl/extractor/umg.py
@@ -28,7 +28,7 @@ class UMGDeIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
video_data = self._download_json(
- 'https://api.universal-music.de/graphql',
+ 'https://graphql.universal-music.de/',
video_id, query={
'query': '''{
universalMusic(channel:16) {
@@ -56,11 +56,9 @@ class UMGDeIE(InfoExtractor):
formats = []
def add_m3u8_format(format_id):
- m3u8_formats = self._extract_m3u8_formats(
+ formats.extend(self._extract_m3u8_formats(
hls_url_template % format_id, video_id, 'mp4',
- 'm3u8_native', m3u8_id='hls', fatal='False')
- if m3u8_formats and m3u8_formats[0].get('height'):
- formats.extend(m3u8_formats)
+ 'm3u8_native', m3u8_id='hls', fatal=False))
for f in video_data.get('formats', []):
f_url = f.get('url')
diff --git a/youtube_dl/extractor/urplay.py b/youtube_dl/extractor/urplay.py
index 10b817760..d6c79147e 100644
--- a/youtube_dl/extractor/urplay.py
+++ b/youtube_dl/extractor/urplay.py
@@ -21,6 +21,11 @@ class URPlayIE(InfoExtractor):
'description': 'md5:5344508a52aa78c1ced6c1b8b9e44e9a',
'timestamp': 1513292400,
'upload_date': '20171214',
+ 'series': 'UR Samtiden - Livet, universum och rymdens märkliga musik',
+ 'duration': 2269,
+ 'categories': ['Kultur & historia'],
+ 'tags': ['Kritiskt tänkande', 'Vetenskap', 'Vetenskaplig verksamhet'],
+ 'episode': 'Om vetenskap, kritiskt tänkande och motstånd',
},
}, {
'url': 'https://urskola.se/Produkter/190031-Tripp-Trapp-Trad-Sovkudde',
@@ -31,6 +36,10 @@ class URPlayIE(InfoExtractor):
'description': 'md5:b86bffdae04a7e9379d1d7e5947df1d1',
'timestamp': 1440086400,
'upload_date': '20150820',
+ 'series': 'Tripp, Trapp, Träd',
+ 'duration': 865,
+ 'tags': ['Sova'],
+ 'episode': 'Sovkudde',
},
}, {
'url': 'http://urskola.se/Produkter/155794-Smasagor-meankieli-Grodan-i-vida-varlden',
@@ -41,9 +50,11 @@ class URPlayIE(InfoExtractor):
video_id = self._match_id(url)
url = url.replace('skola.se/Produkter', 'play.se/program')
webpage = self._download_webpage(url, video_id)
- urplayer_data = self._parse_json(self._html_search_regex(
- r'data-react-class="components/Player/Player"[^>]+data-react-props="({.+?})"',
- webpage, 'urplayer data'), video_id)['currentProduct']
+ vid = int(video_id)
+ accessible_episodes = self._parse_json(self._html_search_regex(
+ r'data-react-class="routes/Product/components/ProgramContainer/ProgramContainer"[^>]+data-react-props="({.+?})"',
+ webpage, 'urplayer data'), video_id)['accessibleEpisodes']
+ urplayer_data = next(e for e in accessible_episodes if e.get('id') == vid)
episode = urplayer_data['title']
raw_streaming_info = urplayer_data['streamingInfo']['raw']
host = self._download_json(
diff --git a/youtube_dl/extractor/ustream.py b/youtube_dl/extractor/ustream.py
index 9e860aeb7..1e29cbe22 100644
--- a/youtube_dl/extractor/ustream.py
+++ b/youtube_dl/extractor/ustream.py
@@ -75,7 +75,7 @@ class UstreamIE(InfoExtractor):
@staticmethod
def _extract_url(webpage):
mobj = re.search(
- r'<iframe[^>]+?src=(["\'])(?P<url>http://(?:www\.)?(?:ustream\.tv|video\.ibm\.com)/embed/.+?)\1', webpage)
+ r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:www\.)?(?:ustream\.tv|video\.ibm\.com)/embed/.+?)\1', webpage)
if mobj is not None:
return mobj.group('url')
diff --git a/youtube_dl/extractor/vgtv.py b/youtube_dl/extractor/vgtv.py
index fe7a26b62..22e99e8f0 100644
--- a/youtube_dl/extractor/vgtv.py
+++ b/youtube_dl/extractor/vgtv.py
@@ -23,6 +23,8 @@ class VGTVIE(XstreamIE):
'fvn.no/fvntv': 'fvntv',
'aftenposten.no/webtv': 'aptv',
'ap.vgtv.no/webtv': 'aptv',
+ 'tv.aftonbladet.se': 'abtv',
+ # obsolete URL schemas, kept in order to save one HTTP redirect
'tv.aftonbladet.se/abtv': 'abtv',
'www.aftonbladet.se/tv': 'abtv',
}
@@ -141,6 +143,10 @@ class VGTVIE(XstreamIE):
'only_matching': True,
},
{
+ 'url': 'https://tv.aftonbladet.se/video/36015/vulkanutbrott-i-rymden-nu-slapper-nasa-bilderna',
+ 'only_matching': True,
+ },
+ {
'url': 'http://tv.aftonbladet.se/abtv/articles/36015',
'only_matching': True,
},
diff --git a/youtube_dl/extractor/videopress.py b/youtube_dl/extractor/videopress.py
index e5f964d39..6376ff096 100644
--- a/youtube_dl/extractor/videopress.py
+++ b/youtube_dl/extractor/videopress.py
@@ -4,21 +4,22 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..compat import compat_str
from ..utils import (
determine_ext,
float_or_none,
+ int_or_none,
parse_age_limit,
qualities,
random_birthday,
- try_get,
unified_timestamp,
urljoin,
)
class VideoPressIE(InfoExtractor):
- _VALID_URL = r'https?://videopress\.com/embed/(?P<id>[\da-zA-Z]+)'
+ _ID_REGEX = r'[\da-zA-Z]{8}'
+ _PATH_REGEX = r'video(?:\.word)?press\.com/embed/'
+ _VALID_URL = r'https?://%s(?P<id>%s)' % (_PATH_REGEX, _ID_REGEX)
_TESTS = [{
'url': 'https://videopress.com/embed/kUJmAcSf',
'md5': '706956a6c875873d51010921310e4bc6',
@@ -36,35 +37,36 @@ class VideoPressIE(InfoExtractor):
# 17+, requires birth_* params
'url': 'https://videopress.com/embed/iH3gstfZ',
'only_matching': True,
+ }, {
+ 'url': 'https://video.wordpress.com/embed/kUJmAcSf',
+ 'only_matching': True,
}]
@staticmethod
def _extract_urls(webpage):
return re.findall(
- r'<iframe[^>]+src=["\']((?:https?://)?videopress\.com/embed/[\da-zA-Z]+)',
+ r'<iframe[^>]+src=["\']((?:https?://)?%s%s)' % (VideoPressIE._PATH_REGEX, VideoPressIE._ID_REGEX),
webpage)
def _real_extract(self, url):
video_id = self._match_id(url)
query = random_birthday('birth_year', 'birth_month', 'birth_day')
+ query['fields'] = 'description,duration,file_url_base,files,height,original,poster,rating,title,upload_date,width'
video = self._download_json(
'https://public-api.wordpress.com/rest/v1.1/videos/%s' % video_id,
video_id, query=query)
title = video['title']
- def base_url(scheme):
- return try_get(
- video, lambda x: x['file_url_base'][scheme], compat_str)
-
- base_url = base_url('https') or base_url('http')
+ file_url_base = video.get('file_url_base') or {}
+ base_url = file_url_base.get('https') or file_url_base.get('http')
QUALITIES = ('std', 'dvd', 'hd')
quality = qualities(QUALITIES)
formats = []
- for format_id, f in video['files'].items():
+ for format_id, f in (video.get('files') or {}).items():
if not isinstance(f, dict):
continue
for ext, path in f.items():
@@ -75,12 +77,14 @@ class VideoPressIE(InfoExtractor):
'ext': determine_ext(path, ext),
'quality': quality(format_id),
})
- original_url = try_get(video, lambda x: x['original'], compat_str)
+ original_url = video.get('original')
if original_url:
formats.append({
'url': original_url,
'format_id': 'original',
'quality': len(QUALITIES),
+ 'width': int_or_none(video.get('width')),
+ 'height': int_or_none(video.get('height')),
})
self._sort_formats(formats)
diff --git a/youtube_dl/extractor/vidio.py b/youtube_dl/extractor/vidio.py
index b48baf00b..b1243e847 100644
--- a/youtube_dl/extractor/vidio.py
+++ b/youtube_dl/extractor/vidio.py
@@ -4,7 +4,13 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import int_or_none
+from ..utils import (
+ int_or_none,
+ parse_iso8601,
+ str_or_none,
+ strip_or_none,
+ try_get,
+)
class VidioIE(InfoExtractor):
@@ -21,57 +27,63 @@ class VidioIE(InfoExtractor):
'thumbnail': r're:^https?://.*\.jpg$',
'duration': 149,
'like_count': int,
+ 'uploader': 'TWELVE Pic',
+ 'timestamp': 1444902800,
+ 'upload_date': '20151015',
+ 'uploader_id': 'twelvepictures',
+ 'channel': 'Cover Music Video',
+ 'channel_id': '280236',
+ 'view_count': int,
+ 'dislike_count': int,
+ 'comment_count': int,
+ 'tags': 'count:4',
},
}, {
'url': 'https://www.vidio.com/watch/77949-south-korea-test-fires-missile-that-can-strike-all-of-the-north',
'only_matching': True,
}]
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id, display_id = mobj.group('id', 'display_id')
-
- webpage = self._download_webpage(url, display_id)
-
- title = self._og_search_title(webpage)
+ def _real_initialize(self):
+ self._api_key = self._download_json(
+ 'https://www.vidio.com/auth', None, data=b'')['api_key']
- m3u8_url, duration, thumbnail = [None] * 3
-
- clips = self._parse_json(
- self._html_search_regex(
- r'data-json-clips\s*=\s*(["\'])(?P<data>\[.+?\])\1',
- webpage, 'video data', default='[]', group='data'),
- display_id, fatal=False)
- if clips:
- clip = clips[0]
- m3u8_url = clip.get('sources', [{}])[0].get('file')
- duration = clip.get('clip_duration')
- thumbnail = clip.get('image')
+ def _real_extract(self, url):
+ video_id, display_id = re.match(self._VALID_URL, url).groups()
+ data = self._download_json(
+ 'https://api.vidio.com/videos/' + video_id, display_id, headers={
+ 'Content-Type': 'application/vnd.api+json',
+ 'X-API-KEY': self._api_key,
+ })
+ video = data['videos'][0]
+ title = video['title'].strip()
- m3u8_url = m3u8_url or self._search_regex(
- r'data(?:-vjs)?-clip-hls-url=(["\'])(?P<url>(?:(?!\1).)+)\1',
- webpage, 'hls url', group='url')
formats = self._extract_m3u8_formats(
- m3u8_url, display_id, 'mp4', entry_protocol='m3u8_native')
+ data['clips'][0]['hls_url'], display_id, 'mp4', 'm3u8_native')
self._sort_formats(formats)
- duration = int_or_none(duration or self._search_regex(
- r'data-video-duration=(["\'])(?P<duration>\d+)\1', webpage,
- 'duration', fatal=False, group='duration'))
- thumbnail = thumbnail or self._og_search_thumbnail(webpage)
-
- like_count = int_or_none(self._search_regex(
- (r'<span[^>]+data-comment-vote-count=["\'](\d+)',
- r'<span[^>]+class=["\'].*?\blike(?:__|-)count\b.*?["\'][^>]*>\s*(\d+)'),
- webpage, 'like count', fatal=False))
+ get_first = lambda x: try_get(data, lambda y: y[x + 's'][0], dict) or {}
+ channel = get_first('channel')
+ user = get_first('user')
+ username = user.get('username')
+ get_count = lambda x: int_or_none(video.get('total_' + x))
return {
'id': video_id,
'display_id': display_id,
'title': title,
- 'description': self._og_search_description(webpage),
- 'thumbnail': thumbnail,
- 'duration': duration,
- 'like_count': like_count,
+ 'description': strip_or_none(video.get('description')),
+ 'thumbnail': video.get('image_url_medium'),
+ 'duration': int_or_none(video.get('duration')),
+ 'like_count': get_count('likes'),
'formats': formats,
+ 'uploader': user.get('name'),
+ 'timestamp': parse_iso8601(video.get('created_at')),
+ 'uploader_id': username,
+ 'uploader_url': 'https://www.vidio.com/@' + username if username else None,
+ 'channel': channel.get('name'),
+ 'channel_id': str_or_none(channel.get('id')),
+ 'view_count': get_count('view_count'),
+ 'dislike_count': get_count('dislikes'),
+ 'comment_count': get_count('comments'),
+ 'tags': video.get('tag_list'),
}
diff --git a/youtube_dl/extractor/vidzi.py b/youtube_dl/extractor/vidzi.py
deleted file mode 100644
index 42ea4952c..000000000
--- a/youtube_dl/extractor/vidzi.py
+++ /dev/null
@@ -1,68 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import re
-
-from .common import InfoExtractor
-from ..utils import (
- decode_packed_codes,
- js_to_json,
- NO_DEFAULT,
- PACKED_CODES_RE,
-)
-
-
-class VidziIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?vidzi\.(?:tv|cc|si|nu)/(?:embed-)?(?P<id>[0-9a-zA-Z]+)'
- _TESTS = [{
- 'url': 'http://vidzi.tv/cghql9yq6emu.html',
- 'md5': '4f16c71ca0c8c8635ab6932b5f3f1660',
- 'info_dict': {
- 'id': 'cghql9yq6emu',
- 'ext': 'mp4',
- 'title': 'youtube-dl test video 1\\\\2\'3/4<5\\\\6ä7↭',
- },
- 'params': {
- # m3u8 download
- 'skip_download': True,
- },
- }, {
- 'url': 'http://vidzi.tv/embed-4z2yb0rzphe9-600x338.html',
- 'only_matching': True,
- }, {
- 'url': 'http://vidzi.cc/cghql9yq6emu.html',
- 'only_matching': True,
- }, {
- 'url': 'https://vidzi.si/rph9gztxj1et.html',
- 'only_matching': True,
- }, {
- 'url': 'http://vidzi.nu/cghql9yq6emu.html',
- 'only_matching': True,
- }]
-
- def _real_extract(self, url):
- video_id = self._match_id(url)
-
- webpage = self._download_webpage(
- 'http://vidzi.tv/%s' % video_id, video_id)
- title = self._html_search_regex(
- r'(?s)<h2 class="video-title">(.*?)</h2>', webpage, 'title')
-
- codes = [webpage]
- codes.extend([
- decode_packed_codes(mobj.group(0)).replace('\\\'', '\'')
- for mobj in re.finditer(PACKED_CODES_RE, webpage)])
- for num, code in enumerate(codes, 1):
- jwplayer_data = self._parse_json(
- self._search_regex(
- r'setup\(([^)]+)\)', code, 'jwplayer data',
- default=NO_DEFAULT if num == len(codes) else '{}'),
- video_id, transform_source=lambda s: js_to_json(
- re.sub(r'\s*\+\s*window\[.+?\]', '', s)))
- if jwplayer_data:
- break
-
- info_dict = self._parse_jwplayer_data(jwplayer_data, video_id, require_title=False)
- info_dict['title'] = title
-
- return info_dict
diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py
index a311f21ef..2e9cbf148 100644
--- a/youtube_dl/extractor/viki.py
+++ b/youtube_dl/extractor/viki.py
@@ -21,6 +21,7 @@ from ..utils import (
parse_iso8601,
sanitized_Request,
std_headers,
+ try_get,
)
@@ -30,7 +31,7 @@ class VikiBaseIE(InfoExtractor):
_API_URL_TEMPLATE = 'https://api.viki.io%s&sig=%s'
_APP = '100005a'
- _APP_VERSION = '2.2.5.1428709186'
+ _APP_VERSION = '6.0.0'
_APP_SECRET = 'MM_d*yP@`&1@]@!AVrXf_o-HVEnoTnm$O-ti4[G~$JDI/Dc-&piU&z&5.;:}95=Iad'
_GEO_BYPASS = False
@@ -41,7 +42,7 @@ class VikiBaseIE(InfoExtractor):
_ERRORS = {
'geo': 'Sorry, this content is not available in your region.',
'upcoming': 'Sorry, this content is not yet available.',
- # 'paywall': 'paywall',
+ 'paywall': 'Sorry, this content is only available to Viki Pass Plus subscribers',
}
def _prepare_call(self, path, timestamp=None, post_data=None):
@@ -62,7 +63,8 @@ class VikiBaseIE(InfoExtractor):
def _call_api(self, path, video_id, note, timestamp=None, post_data=None):
resp = self._download_json(
- self._prepare_call(path, timestamp, post_data), video_id, note)
+ self._prepare_call(path, timestamp, post_data), video_id, note,
+ headers={'x-viki-app-ver': self._APP_VERSION})
error = resp.get('error')
if error:
@@ -82,11 +84,13 @@ class VikiBaseIE(InfoExtractor):
expected=True)
def _check_errors(self, data):
- for reason, status in data.get('blocking', {}).items():
+ for reason, status in (data.get('blocking') or {}).items():
if status and reason in self._ERRORS:
message = self._ERRORS[reason]
if reason == 'geo':
self.raise_geo_restricted(msg=message)
+ elif reason == 'paywall':
+ self.raise_login_required(message)
raise ExtractorError('%s said: %s' % (
self.IE_NAME, message), expected=True)
@@ -131,13 +135,19 @@ class VikiIE(VikiBaseIE):
'info_dict': {
'id': '1023585v',
'ext': 'mp4',
- 'title': 'Heirs Episode 14',
- 'uploader': 'SBS',
- 'description': 'md5:c4b17b9626dd4b143dcc4d855ba3474e',
+ 'title': 'Heirs - Episode 14',
+ 'uploader': 'SBS Contents Hub',
+ 'timestamp': 1385047627,
'upload_date': '20131121',
'age_limit': 13,
+ 'duration': 3570,
+ 'episode_number': 14,
+ },
+ 'params': {
+ 'format': 'bestvideo',
},
'skip': 'Blocked in the US',
+ 'expected_warnings': ['Unknown MIME type image/jpeg in DASH manifest'],
}, {
# clip
'url': 'http://www.viki.com/videos/1067139v-the-avengers-age-of-ultron-press-conference',
@@ -153,7 +163,8 @@ class VikiIE(VikiBaseIE):
'uploader': 'Arirang TV',
'like_count': int,
'age_limit': 0,
- }
+ },
+ 'skip': 'Sorry. There was an error loading this video',
}, {
'url': 'http://www.viki.com/videos/1048879v-ankhon-dekhi',
'info_dict': {
@@ -171,7 +182,7 @@ class VikiIE(VikiBaseIE):
}, {
# episode
'url': 'http://www.viki.com/videos/44699v-boys-over-flowers-episode-1',
- 'md5': '94e0e34fd58f169f40c184f232356cfe',
+ 'md5': '0a53dc252e6e690feccd756861495a8c',
'info_dict': {
'id': '44699v',
'ext': 'mp4',
@@ -183,6 +194,10 @@ class VikiIE(VikiBaseIE):
'uploader': 'group8',
'like_count': int,
'age_limit': 13,
+ 'episode_number': 1,
+ },
+ 'params': {
+ 'format': 'bestvideo',
},
'expected_warnings': ['Unknown MIME type image/jpeg in DASH manifest'],
}, {
@@ -209,7 +224,7 @@ class VikiIE(VikiBaseIE):
}, {
# non-English description
'url': 'http://www.viki.com/videos/158036v-love-in-magic',
- 'md5': 'adf9e321a0ae5d0aace349efaaff7691',
+ 'md5': '41faaba0de90483fb4848952af7c7d0d',
'info_dict': {
'id': '158036v',
'ext': 'mp4',
@@ -220,6 +235,10 @@ class VikiIE(VikiBaseIE):
'title': 'Love In Magic',
'age_limit': 13,
},
+ 'params': {
+ 'format': 'bestvideo',
+ },
+ 'expected_warnings': ['Unknown MIME type image/jpeg in DASH manifest'],
}]
def _real_extract(self, url):
@@ -229,36 +248,33 @@ class VikiIE(VikiBaseIE):
'https://www.viki.com/api/videos/' + video_id,
video_id, 'Downloading video JSON', headers={
'x-client-user-agent': std_headers['User-Agent'],
- 'x-viki-app-ver': '4.0.57',
+ 'x-viki-app-ver': '3.0.0',
})
video = resp['video']
self._check_errors(video)
title = self.dict_selection(video.get('titles', {}), 'en', allow_fallback=False)
+ episode_number = int_or_none(video.get('number'))
if not title:
- title = 'Episode %d' % video.get('number') if video.get('type') == 'episode' else video.get('id') or video_id
- container_titles = video.get('container', {}).get('titles', {})
+ title = 'Episode %d' % episode_number if video.get('type') == 'episode' else video.get('id') or video_id
+ container_titles = try_get(video, lambda x: x['container']['titles'], dict) or {}
container_title = self.dict_selection(container_titles, 'en')
title = '%s - %s' % (container_title, title)
description = self.dict_selection(video.get('descriptions', {}), 'en')
- duration = int_or_none(video.get('duration'))
- timestamp = parse_iso8601(video.get('created_at'))
- uploader = video.get('author')
- like_count = int_or_none(video.get('likes', {}).get('count'))
- age_limit = parse_age_limit(video.get('rating'))
+ like_count = int_or_none(try_get(video, lambda x: x['likes']['count']))
thumbnails = []
- for thumbnail_id, thumbnail in video.get('images', {}).items():
+ for thumbnail_id, thumbnail in (video.get('images') or {}).items():
thumbnails.append({
'id': thumbnail_id,
'url': thumbnail.get('url'),
})
subtitles = {}
- for subtitle_lang, _ in video.get('subtitle_completions', {}).items():
+ for subtitle_lang, _ in (video.get('subtitle_completions') or {}).items():
subtitles[subtitle_lang] = [{
'ext': subtitles_format,
'url': self._prepare_call(
@@ -269,13 +285,15 @@ class VikiIE(VikiBaseIE):
'id': video_id,
'title': title,
'description': description,
- 'duration': duration,
- 'timestamp': timestamp,
- 'uploader': uploader,
+ 'duration': int_or_none(video.get('duration')),
+ 'timestamp': parse_iso8601(video.get('created_at')),
+ 'uploader': video.get('author'),
+ 'uploader_url': video.get('author_url'),
'like_count': like_count,
- 'age_limit': age_limit,
+ 'age_limit': parse_age_limit(video.get('rating')),
'thumbnails': thumbnails,
'subtitles': subtitles,
+ 'episode_number': episode_number,
}
formats = []
@@ -360,7 +378,7 @@ class VikiChannelIE(VikiBaseIE):
'info_dict': {
'id': '50c',
'title': 'Boys Over Flowers',
- 'description': 'md5:ecd3cff47967fe193cff37c0bec52790',
+ 'description': 'md5:804ce6e7837e1fd527ad2f25420f4d59',
},
'playlist_mincount': 71,
}, {
@@ -371,6 +389,7 @@ class VikiChannelIE(VikiBaseIE):
'description': 'md5:05bf5471385aa8b21c18ad450e350525',
},
'playlist_count': 127,
+ 'skip': 'Page not found',
}, {
'url': 'http://www.viki.com/news/24569c-showbiz-korea',
'only_matching': True,
diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py
index 15cd06268..0b386f450 100644
--- a/youtube_dl/extractor/vimeo.py
+++ b/youtube_dl/extractor/vimeo.py
@@ -3,7 +3,6 @@ from __future__ import unicode_literals
import base64
import functools
-import json
import re
import itertools
@@ -17,14 +16,14 @@ from ..compat import (
from ..utils import (
clean_html,
determine_ext,
- dict_get,
ExtractorError,
+ get_element_by_class,
js_to_json,
int_or_none,
merge_dicts,
OnDemandPagedList,
parse_filesize,
- RegexNotFoundError,
+ parse_iso8601,
sanitized_Request,
smuggle_url,
std_headers,
@@ -74,25 +73,28 @@ class VimeoBaseInfoExtractor(InfoExtractor):
expected=True)
raise ExtractorError('Unable to log in')
- def _verify_video_password(self, url, video_id, webpage):
+ def _get_video_password(self):
password = self._downloader.params.get('videopassword')
if password is None:
- raise ExtractorError('This video is protected by a password, use the --video-password option', expected=True)
- token, vuid = self._extract_xsrft_and_vuid(webpage)
- data = urlencode_postdata({
- 'password': password,
- 'token': token,
- })
+ raise ExtractorError(
+ 'This video is protected by a password, use the --video-password option',
+ expected=True)
+ return password
+
+ def _verify_video_password(self, url, video_id, password, token, vuid):
if url.startswith('http://'):
# vimeo only supports https now, but the user can give an http url
url = url.replace('http://', 'https://')
- password_request = sanitized_Request(url + '/password', data)
- password_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
- password_request.add_header('Referer', url)
self._set_vimeo_cookie('vuid', vuid)
return self._download_webpage(
- password_request, video_id,
- 'Verifying the password', 'Wrong password')
+ url + '/password', video_id, 'Verifying the password',
+ 'Wrong password', data=urlencode_postdata({
+ 'password': password,
+ 'token': token,
+ }), headers={
+ 'Content-Type': 'application/x-www-form-urlencoded',
+ 'Referer': url,
+ })
def _extract_xsrft_and_vuid(self, webpage):
xsrft = self._search_regex(
@@ -123,10 +125,11 @@ class VimeoBaseInfoExtractor(InfoExtractor):
video_title = video_data['title']
live_event = video_data.get('live_event') or {}
is_live = live_event.get('status') == 'started'
+ request = config.get('request') or {}
formats = []
- config_files = video_data.get('files') or config['request'].get('files', {})
- for f in config_files.get('progressive', []):
+ config_files = video_data.get('files') or request.get('files') or {}
+ for f in (config_files.get('progressive') or []):
video_url = f.get('url')
if not video_url:
continue
@@ -142,7 +145,7 @@ class VimeoBaseInfoExtractor(InfoExtractor):
# TODO: fix handling of 308 status code returned for live archive manifest requests
sep_pattern = r'/sep/video/'
for files_type in ('hls', 'dash'):
- for cdn_name, cdn_data in config_files.get(files_type, {}).get('cdns', {}).items():
+ for cdn_name, cdn_data in (try_get(config_files, lambda x: x[files_type]['cdns']) or {}).items():
manifest_url = cdn_data.get('url')
if not manifest_url:
continue
@@ -188,17 +191,15 @@ class VimeoBaseInfoExtractor(InfoExtractor):
f['preference'] = -40
subtitles = {}
- text_tracks = config['request'].get('text_tracks')
- if text_tracks:
- for tt in text_tracks:
- subtitles[tt['lang']] = [{
- 'ext': 'vtt',
- 'url': urljoin('https://vimeo.com', tt['url']),
- }]
+ for tt in (request.get('text_tracks') or []):
+ subtitles[tt['lang']] = [{
+ 'ext': 'vtt',
+ 'url': urljoin('https://vimeo.com', tt['url']),
+ }]
thumbnails = []
if not is_live:
- for key, thumb in video_data.get('thumbs', {}).items():
+ for key, thumb in (video_data.get('thumbs') or {}).items():
thumbnails.append({
'id': key,
'width': int_or_none(key),
@@ -226,10 +227,12 @@ class VimeoBaseInfoExtractor(InfoExtractor):
'is_live': is_live,
}
- def _extract_original_format(self, url, video_id):
+ def _extract_original_format(self, url, video_id, unlisted_hash=None):
+ query = {'action': 'load_download_config'}
+ if unlisted_hash:
+ query['unlisted_hash'] = unlisted_hash
download_data = self._download_json(
- url, video_id, fatal=False,
- query={'action': 'load_download_config'},
+ url, video_id, fatal=False, query=query,
headers={'X-Requested-With': 'XMLHttpRequest'})
if download_data:
source_file = download_data.get('source_file')
@@ -276,7 +279,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
)?
(?:videos?/)?
(?P<id>[0-9]+)
- (?:/[\da-f]+)?
+ (?:/(?P<unlisted_hash>[\da-f]{10}))?
/?(?:[?&].*)?(?:[#].*)?$
'''
IE_NAME = 'vimeo'
@@ -316,6 +319,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
'duration': 1595,
'upload_date': '20130610',
'timestamp': 1370893156,
+ 'license': 'by',
},
'params': {
'format': 'best[protocol=https]',
@@ -329,9 +333,9 @@ class VimeoIE(VimeoBaseInfoExtractor):
'id': '54469442',
'ext': 'mp4',
'title': 'Kathy Sierra: Building the minimum Badass User, Business of Software 2012',
- 'uploader': 'The BLN & Business of Software',
- 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/theblnbusinessofsoftware',
- 'uploader_id': 'theblnbusinessofsoftware',
+ 'uploader': 'Business of Software',
+ 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/businessofsoftware',
+ 'uploader_id': 'businessofsoftware',
'duration': 3610,
'description': None,
},
@@ -394,6 +398,12 @@ class VimeoIE(VimeoBaseInfoExtractor):
'uploader_id': 'staff',
'uploader': 'Vimeo Staff',
'duration': 62,
+ 'subtitles': {
+ 'de': [{'ext': 'vtt'}],
+ 'en': [{'ext': 'vtt'}],
+ 'es': [{'ext': 'vtt'}],
+ 'fr': [{'ext': 'vtt'}],
+ },
}
},
{
@@ -466,6 +476,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
'skip_download': True,
},
'expected_warnings': ['Unable to download JSON metadata'],
+ 'skip': 'this page is no longer available.',
},
{
'url': 'http://player.vimeo.com/video/68375962',
@@ -509,6 +520,11 @@ class VimeoIE(VimeoBaseInfoExtractor):
{
'url': 'https://vimeo.com/160743502/abd0e13fb4',
'only_matching': True,
+ },
+ {
+ # requires passing unlisted_hash(a52724358e) to load_download_config request
+ 'url': 'https://vimeo.com/392479337/a52724358e',
+ 'only_matching': True,
}
# https://gettingthingsdone.com/workflowmap/
# vimeo embed with check-password page protected by Referer header
@@ -543,9 +559,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
return urls[0] if urls else None
def _verify_player_video_password(self, url, video_id, headers):
- password = self._downloader.params.get('videopassword')
- if password is None:
- raise ExtractorError('This video is protected by a password, use the --video-password option', expected=True)
+ password = self._get_video_password()
data = urlencode_postdata({
'password': base64.b64encode(password.encode()),
})
@@ -562,6 +576,37 @@ class VimeoIE(VimeoBaseInfoExtractor):
def _real_initialize(self):
self._login()
+ def _extract_from_api(self, video_id, unlisted_hash=None):
+ token = self._download_json(
+ 'https://vimeo.com/_rv/jwt', video_id, headers={
+ 'X-Requested-With': 'XMLHttpRequest'
+ })['token']
+ api_url = 'https://api.vimeo.com/videos/' + video_id
+ if unlisted_hash:
+ api_url += ':' + unlisted_hash
+ video = self._download_json(
+ api_url, video_id, headers={
+ 'Authorization': 'jwt ' + token,
+ }, query={
+ 'fields': 'config_url,created_time,description,license,metadata.connections.comments.total,metadata.connections.likes.total,release_time,stats.plays',
+ })
+ info = self._parse_config(self._download_json(
+ video['config_url'], video_id), video_id)
+ self._vimeo_sort_formats(info['formats'])
+ get_timestamp = lambda x: parse_iso8601(video.get(x + '_time'))
+ info.update({
+ 'description': video.get('description'),
+ 'license': video.get('license'),
+ 'release_timestamp': get_timestamp('release'),
+ 'timestamp': get_timestamp('created'),
+ 'view_count': int_or_none(try_get(video, lambda x: x['stats']['plays'])),
+ })
+ connections = try_get(
+ video, lambda x: x['metadata']['connections'], dict) or {}
+ for k in ('comment', 'like'):
+ info[k + '_count'] = int_or_none(try_get(connections, lambda x: x[k + 's']['total']))
+ return info
+
def _real_extract(self, url):
url, data = unsmuggle_url(url, {})
headers = std_headers.copy()
@@ -570,22 +615,19 @@ class VimeoIE(VimeoBaseInfoExtractor):
if 'Referer' not in headers:
headers['Referer'] = url
- channel_id = self._search_regex(
- r'vimeo\.com/channels/([^/]+)', url, 'channel id', default=None)
+ mobj = re.match(self._VALID_URL, url).groupdict()
+ video_id, unlisted_hash = mobj['id'], mobj.get('unlisted_hash')
+ if unlisted_hash:
+ return self._extract_from_api(video_id, unlisted_hash)
- # Extract ID from URL
- video_id = self._match_id(url)
orig_url = url
is_pro = 'vimeopro.com/' in url
- is_player = '://player.vimeo.com/video/' in url
if is_pro:
# some videos require portfolio_id to be present in player url
# https://github.com/ytdl-org/youtube-dl/issues/20070
url = self._extract_url(url, self._download_webpage(url, video_id))
if not url:
url = 'https://vimeo.com/' + video_id
- elif is_player:
- url = 'https://player.vimeo.com/video/' + video_id
elif any(p in url for p in ('play_redirect_hls', 'moogaloop.swf')):
url = 'https://vimeo.com/' + video_id
@@ -605,14 +647,25 @@ class VimeoIE(VimeoBaseInfoExtractor):
expected=True)
raise
- # Now we begin extracting as much information as we can from what we
- # retrieved. First we extract the information common to all extractors,
- # and latter we extract those that are Vimeo specific.
- self.report_extraction(video_id)
+ if '//player.vimeo.com/video/' in url:
+ config = self._parse_json(self._search_regex(
+ r'\bconfig\s*=\s*({.+?})\s*;', webpage, 'info section'), video_id)
+ if config.get('view') == 4:
+ config = self._verify_player_video_password(
+ redirect_url, video_id, headers)
+ info = self._parse_config(config, video_id)
+ self._vimeo_sort_formats(info['formats'])
+ return info
+
+ if re.search(r'<form[^>]+?id="pw_form"', webpage):
+ video_password = self._get_video_password()
+ token, vuid = self._extract_xsrft_and_vuid(webpage)
+ webpage = self._verify_video_password(
+ redirect_url, video_id, video_password, token, vuid)
vimeo_config = self._extract_vimeo_config(webpage, video_id, default=None)
if vimeo_config:
- seed_status = vimeo_config.get('seed_status', {})
+ seed_status = vimeo_config.get('seed_status') or {}
if seed_status.get('state') == 'failed':
raise ExtractorError(
'%s said: %s' % (self.IE_NAME, seed_status['title']),
@@ -621,66 +674,40 @@ class VimeoIE(VimeoBaseInfoExtractor):
cc_license = None
timestamp = None
video_description = None
+ info_dict = {}
- # Extract the config JSON
- try:
- try:
- config_url = self._html_search_regex(
- r' data-config-url="(.+?)"', webpage,
- 'config URL', default=None)
- if not config_url:
- # Sometimes new react-based page is served instead of old one that require
- # different config URL extraction approach (see
- # https://github.com/ytdl-org/youtube-dl/pull/7209)
- page_config = self._parse_json(self._search_regex(
- r'vimeo\.(?:clip|vod_title)_page_config\s*=\s*({.+?});',
- webpage, 'page config'), video_id)
- config_url = page_config['player']['config_url']
- cc_license = page_config.get('cc_license')
- timestamp = try_get(
- page_config, lambda x: x['clip']['uploaded_on'],
- compat_str)
- video_description = clean_html(dict_get(
- page_config, ('description', 'description_html_escaped')))
- config = self._download_json(config_url, video_id)
- except RegexNotFoundError:
- # For pro videos or player.vimeo.com urls
- # We try to find out to which variable is assigned the config dic
- m_variable_name = re.search(r'(\w)\.video\.id', webpage)
- if m_variable_name is not None:
- config_re = [r'%s=({[^}].+?});' % re.escape(m_variable_name.group(1))]
- else:
- config_re = [r' = {config:({.+?}),assets:', r'(?:[abc])=({.+?});']
- config_re.append(r'\bvar\s+r\s*=\s*({.+?})\s*;')
- config_re.append(r'\bconfig\s*=\s*({.+?})\s*;')
- config = self._search_regex(config_re, webpage, 'info section',
- flags=re.DOTALL)
- config = json.loads(config)
- except Exception as e:
- if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
- raise ExtractorError('The author has restricted the access to this video, try with the "--referer" option')
-
- if re.search(r'<form[^>]+?id="pw_form"', webpage) is not None:
- if '_video_password_verified' in data:
- raise ExtractorError('video password verification failed!')
- self._verify_video_password(redirect_url, video_id, webpage)
- return self._real_extract(
- smuggle_url(redirect_url, {'_video_password_verified': 'verified'}))
- else:
- raise ExtractorError('Unable to extract info section',
- cause=e)
+ channel_id = self._search_regex(
+ r'vimeo\.com/channels/([^/]+)', url, 'channel id', default=None)
+ if channel_id:
+ config_url = self._html_search_regex(
+ r'\bdata-config-url="([^"]+)"', webpage, 'config URL')
+ video_description = clean_html(get_element_by_class('description', webpage))
+ info_dict.update({
+ 'channel_id': channel_id,
+ 'channel_url': 'https://vimeo.com/channels/' + channel_id,
+ })
else:
- if config.get('view') == 4:
- config = self._verify_player_video_password(redirect_url, video_id, headers)
-
- vod = config.get('video', {}).get('vod', {})
+ page_config = self._parse_json(self._search_regex(
+ r'vimeo\.(?:clip|vod_title)_page_config\s*=\s*({.+?});',
+ webpage, 'page config', default='{}'), video_id, fatal=False)
+ if not page_config:
+ return self._extract_from_api(video_id)
+ config_url = page_config['player']['config_url']
+ cc_license = page_config.get('cc_license')
+ clip = page_config.get('clip') or {}
+ timestamp = clip.get('uploaded_on')
+ video_description = clean_html(
+ clip.get('description') or page_config.get('description_html_escaped'))
+ config = self._download_json(config_url, video_id)
+ video = config.get('video') or {}
+ vod = video.get('vod') or {}
def is_rented():
if '>You rented this title.<' in webpage:
return True
- if config.get('user', {}).get('purchased'):
+ if try_get(config, lambda x: x['user']['purchased']):
return True
- for purchase_option in vod.get('purchase_options', []):
+ for purchase_option in (vod.get('purchase_options') or []):
if purchase_option.get('purchased'):
return True
label = purchase_option.get('label_string')
@@ -695,14 +722,10 @@ class VimeoIE(VimeoBaseInfoExtractor):
'https://player.vimeo.com/player/%s' % feature_id,
{'force_feature_id': True}), 'Vimeo')
- # Extract video description
- if not video_description:
- video_description = self._html_search_regex(
- r'(?s)<div\s+class="[^"]*description[^"]*"[^>]*>(.*?)</div>',
- webpage, 'description', default=None)
if not video_description:
video_description = self._html_search_meta(
- 'description', webpage, default=None)
+ ['description', 'og:description', 'twitter:description'],
+ webpage, default=None)
if not video_description and is_pro:
orig_webpage = self._download_webpage(
orig_url, video_id,
@@ -711,29 +734,18 @@ class VimeoIE(VimeoBaseInfoExtractor):
if orig_webpage:
video_description = self._html_search_meta(
'description', orig_webpage, default=None)
- if not video_description and not is_player:
+ if not video_description:
self._downloader.report_warning('Cannot find video description')
- # Extract upload date
if not timestamp:
timestamp = self._search_regex(
r'<time[^>]+datetime="([^"]+)"', webpage,
'timestamp', default=None)
- try:
- view_count = int(self._search_regex(r'UserPlays:(\d+)', webpage, 'view count'))
- like_count = int(self._search_regex(r'UserLikes:(\d+)', webpage, 'like count'))
- comment_count = int(self._search_regex(r'UserComments:(\d+)', webpage, 'comment count'))
- except RegexNotFoundError:
- # This info is only available in vimeo.com/{id} urls
- view_count = None
- like_count = None
- comment_count = None
-
formats = []
source_format = self._extract_original_format(
- 'https://vimeo.com/' + video_id, video_id)
+ 'https://vimeo.com/' + video_id, video_id, video.get('unlisted_hash'))
if source_format:
formats.append(source_format)
@@ -748,29 +760,20 @@ class VimeoIE(VimeoBaseInfoExtractor):
r'<link[^>]+rel=["\']license["\'][^>]+href=(["\'])(?P<license>(?:(?!\1).)+)\1',
webpage, 'license', default=None, group='license')
- channel_url = 'https://vimeo.com/channels/%s' % channel_id if channel_id else None
-
- info_dict = {
+ info_dict.update({
'formats': formats,
'timestamp': unified_timestamp(timestamp),
'description': video_description,
'webpage_url': url,
- 'view_count': view_count,
- 'like_count': like_count,
- 'comment_count': comment_count,
'license': cc_license,
- 'channel_id': channel_id,
- 'channel_url': channel_url,
- }
-
- info_dict = merge_dicts(info_dict, info_dict_config, json_ld)
+ })
- return info_dict
+ return merge_dicts(info_dict, info_dict_config, json_ld)
class VimeoOndemandIE(VimeoIE):
IE_NAME = 'vimeo:ondemand'
- _VALID_URL = r'https?://(?:www\.)?vimeo\.com/ondemand/([^/]+/)?(?P<id>[^/?#&]+)'
+ _VALID_URL = r'https?://(?:www\.)?vimeo\.com/ondemand/(?:[^/]+/)?(?P<id>[^/?#&]+)'
_TESTS = [{
# ondemand video not available via https://vimeo.com/id
'url': 'https://vimeo.com/ondemand/20704',
@@ -931,11 +934,15 @@ class VimeoAlbumIE(VimeoBaseInfoExtractor):
}
if hashed_pass:
query['_hashed_pass'] = hashed_pass
- videos = self._download_json(
- 'https://api.vimeo.com/albums/%s/videos' % album_id,
- album_id, 'Downloading page %d' % api_page, query=query, headers={
- 'Authorization': 'jwt ' + authorization,
- })['data']
+ try:
+ videos = self._download_json(
+ 'https://api.vimeo.com/albums/%s/videos' % album_id,
+ album_id, 'Downloading page %d' % api_page, query=query, headers={
+ 'Authorization': 'jwt ' + authorization,
+ })['data']
+ except ExtractorError as e:
+ if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400:
+ return
for video in videos:
link = video.get('link')
if not link:
@@ -1050,10 +1057,23 @@ class VimeoReviewIE(VimeoBaseInfoExtractor):
def _real_extract(self, url):
page_url, video_id = re.match(self._VALID_URL, url).groups()
- clip_data = self._download_json(
- page_url.replace('/review/', '/review/data/'),
- video_id)['clipData']
- config_url = clip_data['configUrl']
+ data = self._download_json(
+ page_url.replace('/review/', '/review/data/'), video_id)
+ if data.get('isLocked') is True:
+ video_password = self._get_video_password()
+ viewer = self._download_json(
+ 'https://vimeo.com/_rv/viewer', video_id)
+ webpage = self._verify_video_password(
+ 'https://vimeo.com/' + video_id, video_id,
+ video_password, viewer['xsrft'], viewer['vuid'])
+ clip_page_config = self._parse_json(self._search_regex(
+ r'window\.vimeo\.clip_page_config\s*=\s*({.+?});',
+ webpage, 'clip page config'), video_id)
+ config_url = clip_page_config['player']['config_url']
+ clip_data = clip_page_config.get('clip') or {}
+ else:
+ clip_data = data['clipData']
+ config_url = clip_data['configUrl']
config = self._download_json(config_url, video_id)
info_dict = self._parse_config(config, video_id)
source_format = self._extract_original_format(
diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py
index 00ec006c4..6b3513ee0 100644
--- a/youtube_dl/extractor/vk.py
+++ b/youtube_dl/extractor/vk.py
@@ -300,6 +300,13 @@ class VKIE(VKBaseIE):
'only_matching': True,
}]
+ @staticmethod
+ def _extract_sibnet_urls(webpage):
+ # https://help.sibnet.ru/?sibnet_video_embed
+ return [unescapeHTML(mobj.group('url')) for mobj in re.finditer(
+ r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//video\.sibnet\.ru/shell\.php\?.*?\bvideoid=\d+.*?)\1',
+ webpage)]
+
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('videoid')
@@ -408,6 +415,10 @@ class VKIE(VKBaseIE):
if odnoklassniki_url:
return self.url_result(odnoklassniki_url, OdnoklassnikiIE.ie_key())
+ sibnet_urls = self._extract_sibnet_urls(info_page)
+ if sibnet_urls:
+ return self.url_result(sibnet_urls[0])
+
m_opts = re.search(r'(?s)var\s+opts\s*=\s*({.+?});', info_page)
if m_opts:
m_opts_url = re.search(r"url\s*:\s*'((?!/\b)[^']+)", m_opts.group(1))
diff --git a/youtube_dl/extractor/vlive.py b/youtube_dl/extractor/vlive.py
index 6224e6200..42da34d44 100644
--- a/youtube_dl/extractor/vlive.py
+++ b/youtube_dl/extractor/vlive.py
@@ -106,7 +106,7 @@ class VLiveIE(VLiveBaseIE):
raise ExtractorError('Unable to log in', expected=True)
def _call_api(self, path_template, video_id, fields=None):
- query = {'appId': self._APP_ID, 'gcc': 'KR'}
+ query = {'appId': self._APP_ID, 'gcc': 'KR', 'platformType': 'PC'}
if fields:
query['fields'] = fields
try:
@@ -116,7 +116,7 @@ class VLiveIE(VLiveBaseIE):
headers={'Referer': 'https://www.vlive.tv/'}, query=query)
except ExtractorError as e:
if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
- self.raise_login_required(json.loads(e.cause.read().decode())['message'])
+ self.raise_login_required(json.loads(e.cause.read().decode('utf-8'))['message'])
raise
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/voxmedia.py b/youtube_dl/extractor/voxmedia.py
index b318e15d4..661208125 100644
--- a/youtube_dl/extractor/voxmedia.py
+++ b/youtube_dl/extractor/voxmedia.py
@@ -7,6 +7,8 @@ from ..compat import compat_urllib_parse_unquote
from ..utils import (
ExtractorError,
int_or_none,
+ try_get,
+ unified_timestamp,
)
@@ -19,14 +21,17 @@ class VoxMediaVolumeIE(OnceIE):
setup = self._parse_json(self._search_regex(
r'setup\s*=\s*({.+});', webpage, 'setup'), video_id)
- video_data = setup.get('video') or {}
+ player_setup = setup.get('player_setup') or setup
+ video_data = player_setup.get('video') or {}
+ formatted_metadata = video_data.get('formatted_metadata') or {}
info = {
'id': video_id,
- 'title': video_data.get('title_short'),
+ 'title': player_setup.get('title') or video_data.get('title_short'),
'description': video_data.get('description_long') or video_data.get('description_short'),
- 'thumbnail': video_data.get('brightcove_thumbnail')
+ 'thumbnail': formatted_metadata.get('thumbnail') or video_data.get('brightcove_thumbnail'),
+ 'timestamp': unified_timestamp(formatted_metadata.get('video_publish_date')),
}
- asset = setup.get('asset') or setup.get('params') or {}
+ asset = try_get(setup, lambda x: x['embed_assets']['chorus'], dict) or {}
formats = []
hls_url = asset.get('hls_url')
@@ -47,6 +52,7 @@ class VoxMediaVolumeIE(OnceIE):
if formats:
self._sort_formats(formats)
info['formats'] = formats
+ info['duration'] = int_or_none(asset.get('duration'))
return info
for provider_video_type in ('ooyala', 'youtube', 'brightcove'):
@@ -84,7 +90,7 @@ class VoxMediaIE(InfoExtractor):
}, {
# Volume embed, Youtube
'url': 'http://www.theverge.com/2014/10/21/7025853/google-nexus-6-hands-on-photos-video-android-phablet',
- 'md5': '4c8f4a0937752b437c3ebc0ed24802b5',
+ 'md5': 'fd19aa0cf3a0eea515d4fd5c8c0e9d68',
'info_dict': {
'id': 'Gy8Md3Eky38',
'ext': 'mp4',
@@ -93,6 +99,7 @@ class VoxMediaIE(InfoExtractor):
'uploader_id': 'TheVerge',
'upload_date': '20141021',
'uploader': 'The Verge',
+ 'timestamp': 1413907200,
},
'add_ie': ['Youtube'],
'skip': 'similar to the previous test',
@@ -100,13 +107,13 @@ class VoxMediaIE(InfoExtractor):
# Volume embed, Youtube
'url': 'http://www.vox.com/2016/3/31/11336640/mississippi-lgbt-religious-freedom-bill',
'info_dict': {
- 'id': 'YCjDnX-Xzhg',
+ 'id': '22986359b',
'ext': 'mp4',
'title': "Mississippi's laws are so bad that its anti-LGBTQ law isn't needed to allow discrimination",
'description': 'md5:fc1317922057de31cd74bce91eb1c66c',
- 'uploader_id': 'voxdotcom',
'upload_date': '20150915',
- 'uploader': 'Vox',
+ 'timestamp': 1442332800,
+ 'duration': 285,
},
'add_ie': ['Youtube'],
'skip': 'similar to the previous test',
@@ -160,6 +167,9 @@ class VoxMediaIE(InfoExtractor):
'ext': 'mp4',
'title': 'Post-Post-PC CEO: The Full Code Conference Video of Microsoft\'s Satya Nadella',
'description': 'The longtime veteran was chosen earlier this year as the software giant\'s third leader in its history.',
+ 'timestamp': 1402938000,
+ 'upload_date': '20140616',
+ 'duration': 4114,
},
'add_ie': ['VoxMediaVolume'],
}]
diff --git a/youtube_dl/extractor/vtm.py b/youtube_dl/extractor/vtm.py
new file mode 100644
index 000000000..093f1aa69
--- /dev/null
+++ b/youtube_dl/extractor/vtm.py
@@ -0,0 +1,62 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ int_or_none,
+ parse_iso8601,
+ try_get,
+)
+
+
+class VTMIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?vtm\.be/([^/?&#]+)~v(?P<id>[0-9a-f]{8}(?:-[0-9a-f]{4}){3}-[0-9a-f]{12})'
+ _TEST = {
+ 'url': 'https://vtm.be/gast-vernielt-genkse-hotelkamer~ve7534523-279f-4b4d-a5c9-a33ffdbe23e1',
+ 'md5': '37dca85fbc3a33f2de28ceb834b071f8',
+ 'info_dict': {
+ 'id': '192445',
+ 'ext': 'mp4',
+ 'title': 'Gast vernielt Genkse hotelkamer',
+ 'timestamp': 1611060180,
+ 'upload_date': '20210119',
+ 'duration': 74,
+ # TODO: fix url _type result processing
+ # 'series': 'Op Interventie',
+ }
+ }
+
+ def _real_extract(self, url):
+ uuid = self._match_id(url)
+ video = self._download_json(
+ 'https://omc4vm23offuhaxx6hekxtzspi.appsync-api.eu-west-1.amazonaws.com/graphql',
+ uuid, query={
+ 'query': '''{
+ getComponent(type: Video, uuid: "%s") {
+ ... on Video {
+ description
+ duration
+ myChannelsVideo
+ program {
+ title
+ }
+ publishedAt
+ title
+ }
+ }
+}''' % uuid,
+ }, headers={
+ 'x-api-key': 'da2-lz2cab4tfnah3mve6wiye4n77e',
+ })['data']['getComponent']
+
+ return {
+ '_type': 'url',
+ 'id': uuid,
+ 'title': video.get('title'),
+ 'url': 'http://mychannels.video/embed/%d' % video['myChannelsVideo'],
+ 'description': video.get('description'),
+ 'timestamp': parse_iso8601(video.get('publishedAt')),
+ 'duration': int_or_none(video.get('duration')),
+ 'series': try_get(video, lambda x: x['program']['title']),
+ 'ie_key': 'Medialaan',
+ }
diff --git a/youtube_dl/extractor/vvvvid.py b/youtube_dl/extractor/vvvvid.py
index f4cae7fe9..bc196f8a0 100644
--- a/youtube_dl/extractor/vvvvid.py
+++ b/youtube_dl/extractor/vvvvid.py
@@ -4,6 +4,7 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from .youtube import YoutubeIE
from ..utils import (
ExtractorError,
int_or_none,
@@ -48,6 +49,22 @@ class VVVVIDIE(InfoExtractor):
'skip_download': True,
},
}, {
+ # video_type == 'video/youtube'
+ 'url': 'https://www.vvvvid.it/show/404/one-punch-man/406/486683/trailer',
+ 'md5': '33e0edfba720ad73a8782157fdebc648',
+ 'info_dict': {
+ 'id': 'RzmFKUDOUgw',
+ 'ext': 'mp4',
+ 'title': 'Trailer',
+ 'upload_date': '20150906',
+ 'description': 'md5:a5e802558d35247fee285875328c0b80',
+ 'uploader_id': 'BandaiVisual',
+ 'uploader': 'BANDAI NAMCO Arts Channel',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
'url': 'https://www.vvvvid.it/show/434/perche-dovrei-guardarlo-di-dario-moccia/437/489048',
'only_matching': True
}]
@@ -58,12 +75,15 @@ class VVVVIDIE(InfoExtractor):
'https://www.vvvvid.it/user/login',
None, headers=self.geo_verification_headers())['data']['conn_id']
- def _download_info(self, show_id, path, video_id, fatal=True):
+ def _download_info(self, show_id, path, video_id, fatal=True, query=None):
+ q = {
+ 'conn_id': self._conn_id,
+ }
+ if query:
+ q.update(query)
response = self._download_json(
'https://www.vvvvid.it/vvvvid/ondemand/%s/%s' % (show_id, path),
- video_id, headers=self.geo_verification_headers(), query={
- 'conn_id': self._conn_id,
- }, fatal=fatal)
+ video_id, headers=self.geo_verification_headers(), query=q, fatal=fatal)
if not (response or fatal):
return
if response.get('result') == 'error':
@@ -81,7 +101,8 @@ class VVVVIDIE(InfoExtractor):
show_id, season_id, video_id = re.match(self._VALID_URL, url).groups()
response = self._download_info(
- show_id, 'season/%s' % season_id, video_id)
+ show_id, 'season/%s' % season_id,
+ video_id, query={'video_id': video_id})
vid = int(video_id)
video_data = list(filter(
@@ -154,37 +175,50 @@ class VVVVIDIE(InfoExtractor):
if season_number:
info['season_number'] = int(season_number)
- for quality in ('_sd', ''):
+ video_type = video_data.get('video_type')
+ is_youtube = False
+ for quality in ('', '_sd'):
embed_code = video_data.get('embed_info' + quality)
if not embed_code:
continue
embed_code = ds(embed_code)
- video_type = video_data.get('video_type')
- if video_type in ('video/rcs', 'video/kenc'):
- if video_type == 'video/kenc':
- kenc = self._download_json(
- 'https://www.vvvvid.it/kenc', video_id, query={
- 'action': 'kt',
- 'conn_id': self._conn_id,
- 'url': embed_code,
- }, fatal=False) or {}
- kenc_message = kenc.get('message')
- if kenc_message:
- embed_code += '?' + ds(kenc_message)
+ if video_type == 'video/kenc':
+ embed_code = re.sub(r'https?(://[^/]+)/z/', r'https\1/i/', embed_code).replace('/manifest.f4m', '/master.m3u8')
+ kenc = self._download_json(
+ 'https://www.vvvvid.it/kenc', video_id, query={
+ 'action': 'kt',
+ 'conn_id': self._conn_id,
+ 'url': embed_code,
+ }, fatal=False) or {}
+ kenc_message = kenc.get('message')
+ if kenc_message:
+ embed_code += '?' + ds(kenc_message)
+ formats.extend(self._extract_m3u8_formats(
+ embed_code, video_id, 'mp4', m3u8_id='hls', fatal=False))
+ elif video_type == 'video/rcs':
formats.extend(self._extract_akamai_formats(embed_code, video_id))
+ elif video_type == 'video/youtube':
+ info.update({
+ '_type': 'url_transparent',
+ 'ie_key': YoutubeIE.ie_key(),
+ 'url': embed_code,
+ })
+ is_youtube = True
+ break
else:
formats.extend(self._extract_wowza_formats(
'http://sb.top-ix.org/videomg/_definst_/mp4:%s/playlist.m3u8' % embed_code, video_id))
metadata_from_url(embed_code)
- self._sort_formats(formats)
+ if not is_youtube:
+ self._sort_formats(formats)
+ info['formats'] = formats
metadata_from_url(video_data.get('thumbnail'))
info.update(self._extract_common_video_info(video_data))
info.update({
'id': video_id,
'title': title,
- 'formats': formats,
'duration': int_or_none(video_data.get('length')),
'series': video_data.get('show_title'),
'season_id': season_id,
@@ -220,9 +254,13 @@ class VVVVIDShowIE(VVVVIDIE):
show_info = self._download_info(
show_id, 'info/', show_title, fatal=False)
+ if not show_title:
+ base_url += "/title"
+
entries = []
for season in (seasons or []):
episodes = season.get('episodes') or []
+ playlist_title = season.get('name') or show_info.get('title')
for episode in episodes:
if episode.get('playable') is False:
continue
@@ -232,12 +270,13 @@ class VVVVIDShowIE(VVVVIDIE):
continue
info = self._extract_common_video_info(episode)
info.update({
- '_type': 'url',
+ '_type': 'url_transparent',
'ie_key': VVVVIDIE.ie_key(),
'url': '/'.join([base_url, season_id, video_id]),
'title': episode.get('title'),
'description': episode.get('description'),
'season_id': season_id,
+ 'playlist_title': playlist_title,
})
entries.append(info)
diff --git a/youtube_dl/extractor/wat.py b/youtube_dl/extractor/wat.py
index 8ef3e0906..f1bccc2d6 100644
--- a/youtube_dl/extractor/wat.py
+++ b/youtube_dl/extractor/wat.py
@@ -1,15 +1,13 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
ExtractorError,
- unified_strdate,
- HEADRequest,
int_or_none,
+ try_get,
+ unified_strdate,
)
@@ -32,6 +30,7 @@ class WatIE(InfoExtractor):
'skip_download': True,
},
'expected_warnings': ['HTTP Error 404'],
+ 'skip': 'This content is no longer available',
},
{
'url': 'http://www.wat.tv/video/gregory-lemarchal-voix-ange-6z1v7_6ygkj_.html',
@@ -43,17 +42,10 @@ class WatIE(InfoExtractor):
'upload_date': '20140816',
},
'expected_warnings': ["Ce contenu n'est pas disponible pour l'instant."],
+ 'skip': 'This content is no longer available',
},
]
-
- _FORMATS = (
- (200, 416, 234),
- (400, 480, 270),
- (600, 640, 360),
- (1200, 640, 360),
- (1800, 960, 540),
- (2500, 1280, 720),
- )
+ _GEO_BYPASS = False
def _real_extract(self, url):
video_id = self._match_id(url)
@@ -61,97 +53,54 @@ class WatIE(InfoExtractor):
# 'contentv4' is used in the website, but it also returns the related
# videos, we don't need them
+ # video_data = self._download_json(
+ # 'http://www.wat.tv/interface/contentv4s/' + video_id, video_id)
video_data = self._download_json(
- 'http://www.wat.tv/interface/contentv4s/' + video_id, video_id)
+ 'https://mediainfo.tf1.fr/mediainfocombo/' + video_id,
+ video_id, query={'context': 'MYTF1'})
video_info = video_data['media']
error_desc = video_info.get('error_desc')
if error_desc:
- self.report_warning(
- '%s returned error: %s' % (self.IE_NAME, error_desc))
-
- chapters = video_info['chapters']
- if chapters:
- first_chapter = chapters[0]
-
- def video_id_for_chapter(chapter):
- return chapter['tc_start'].split('-')[0]
-
- if video_id_for_chapter(first_chapter) != video_id:
- self.to_screen('Multipart video detected')
- entries = [self.url_result('wat:%s' % video_id_for_chapter(chapter)) for chapter in chapters]
- return self.playlist_result(entries, video_id, video_info['title'])
- # Otherwise we can continue and extract just one part, we have to use
- # the video id for getting the video url
- else:
- first_chapter = video_info
-
- title = first_chapter['title']
+ if video_info.get('error_code') == 'GEOBLOCKED':
+ self.raise_geo_restricted(error_desc, video_info.get('geoList'))
+ raise ExtractorError(error_desc, expected=True)
- def extract_url(path_template, url_type):
- req_url = 'http://www.wat.tv/get/%s' % (path_template % video_id)
- head = self._request_webpage(HEADRequest(req_url), video_id, 'Extracting %s url' % url_type, fatal=False)
- if head:
- red_url = head.geturl()
- if req_url != red_url:
- return red_url
- return None
-
- def remove_bitrate_limit(manifest_url):
- return re.sub(r'(?:max|min)_bitrate=\d+&?', '', manifest_url)
+ title = video_info['title']
formats = []
- try:
- alt_urls = lambda manifest_url: [re.sub(r'(?:wdv|ssm)?\.ism/', repl + '.ism/', manifest_url) for repl in ('', 'ssm')]
- manifest_urls = self._download_json(
- 'http://www.wat.tv/get/webhtml/' + video_id, video_id)
- m3u8_url = manifest_urls.get('hls')
- if m3u8_url:
- m3u8_url = remove_bitrate_limit(m3u8_url)
- for m3u8_alt_url in alt_urls(m3u8_url):
+
+ def extract_formats(manifest_urls):
+ for f, f_url in manifest_urls.items():
+ if not f_url:
+ continue
+ if f in ('dash', 'mpd'):
+ formats.extend(self._extract_mpd_formats(
+ f_url.replace('://das-q1.tf1.fr/', '://das-q1-ssl.tf1.fr/'),
+ video_id, mpd_id='dash', fatal=False))
+ elif f == 'hls':
formats.extend(self._extract_m3u8_formats(
- m3u8_alt_url, video_id, 'mp4',
+ f_url, video_id, 'mp4',
'm3u8_native', m3u8_id='hls', fatal=False))
- formats.extend(self._extract_f4m_formats(
- m3u8_alt_url.replace('ios', 'web').replace('.m3u8', '.f4m'),
- video_id, f4m_id='hds', fatal=False))
- mpd_url = manifest_urls.get('mpd')
- if mpd_url:
- mpd_url = remove_bitrate_limit(mpd_url)
- for mpd_alt_url in alt_urls(mpd_url):
- formats.extend(self._extract_mpd_formats(
- mpd_alt_url, video_id, mpd_id='dash', fatal=False))
- self._sort_formats(formats)
- except ExtractorError:
- abr = 64
- for vbr, width, height in self._FORMATS:
- tbr = vbr + abr
- format_id = 'http-%s' % tbr
- fmt_url = 'http://dnl.adv.tf1.fr/2/USP-0x0/%s/%s/%s/ssm/%s-%s-64k.mp4' % (video_id[-4:-2], video_id[-2:], video_id, video_id, vbr)
- if self._is_valid_url(fmt_url, video_id, format_id):
- formats.append({
- 'format_id': format_id,
- 'url': fmt_url,
- 'vbr': vbr,
- 'abr': abr,
- 'width': width,
- 'height': height,
- })
- date_diffusion = first_chapter.get('date_diffusion') or video_data.get('configv4', {}).get('estatS4')
- upload_date = unified_strdate(date_diffusion) if date_diffusion else None
- duration = None
- files = video_info['files']
- if files:
- duration = int_or_none(files[0].get('duration'))
+ delivery = video_data.get('delivery') or {}
+ extract_formats({delivery.get('format'): delivery.get('url')})
+ if not formats:
+ if delivery.get('drm'):
+ raise ExtractorError('This video is DRM protected.', expected=True)
+ manifest_urls = self._download_json(
+ 'http://www.wat.tv/get/webhtml/' + video_id, video_id, fatal=False)
+ if manifest_urls:
+ extract_formats(manifest_urls)
+
+ self._sort_formats(formats)
return {
'id': video_id,
'title': title,
- 'thumbnail': first_chapter.get('preview'),
- 'description': first_chapter.get('description'),
- 'view_count': int_or_none(video_info.get('views')),
- 'upload_date': upload_date,
- 'duration': duration,
+ 'thumbnail': video_info.get('preview'),
+ 'upload_date': unified_strdate(try_get(
+ video_data, lambda x: x['mediametrie']['chapters'][0]['estatS4'])),
+ 'duration': int_or_none(video_info.get('duration')),
'formats': formats,
}
diff --git a/youtube_dl/extractor/xboxclips.py b/youtube_dl/extractor/xboxclips.py
index d9c277bc3..25f487e1e 100644
--- a/youtube_dl/extractor/xboxclips.py
+++ b/youtube_dl/extractor/xboxclips.py
@@ -1,40 +1,55 @@
# coding: utf-8
from __future__ import unicode_literals
+import re
+
from .common import InfoExtractor
+from ..compat import (
+ compat_parse_qs,
+ compat_urllib_parse_urlparse,
+)
from ..utils import (
int_or_none,
+ month_by_abbreviation,
parse_filesize,
- unified_strdate,
)
class XboxClipsIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?xboxclips\.com/(?:video\.php\?.*vid=|[^/]+/)(?P<id>[\w-]{36})'
- _TEST = {
+ _VALID_URL = r'https?://(?:www\.)?(?:xboxclips\.com|gameclips\.io)/(?:video\.php\?.*vid=|[^/]+/)(?P<id>[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12})'
+ _TESTS = [{
'url': 'http://xboxclips.com/video.php?uid=2533274823424419&gamertag=Iabdulelah&vid=074a69a9-5faf-46aa-b93b-9909c1720325',
'md5': 'fbe1ec805e920aeb8eced3c3e657df5d',
'info_dict': {
'id': '074a69a9-5faf-46aa-b93b-9909c1720325',
'ext': 'mp4',
- 'title': 'Iabdulelah playing Titanfall',
+ 'title': 'iAbdulElah playing Titanfall',
'filesize_approx': 26800000,
'upload_date': '20140807',
'duration': 56,
}
- }
+ }, {
+ 'url': 'https://gameclips.io/iAbdulElah/074a69a9-5faf-46aa-b93b-9909c1720325',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
video_id = self._match_id(url)
+ if '/video.php' in url:
+ qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
+ url = 'https://gameclips.io/%s/%s' % (qs['gamertag'][0], qs['vid'][0])
+
webpage = self._download_webpage(url, video_id)
+ info = self._parse_html5_media_entries(url, webpage, video_id)[0]
- video_url = self._html_search_regex(
- r'>(?:Link|Download): <a[^>]+href="([^"]+)"', webpage, 'video URL')
- title = self._html_search_regex(
- r'<title>XboxClips \| ([^<]+)</title>', webpage, 'title')
- upload_date = unified_strdate(self._html_search_regex(
- r'>Recorded: ([^<]+)<', webpage, 'upload date', fatal=False))
+ title = self._html_search_meta(['og:title', 'twitter:title'], webpage)
+ upload_date = None
+ mobj = re.search(
+ r'>Recorded: (\d{2})-(Jan|Feb|Mar|Apr|May|Ju[nl]|Aug|Sep|Oct|Nov|Dec)-(\d{4})',
+ webpage)
+ if mobj:
+ upload_date = '%s%.2d%s' % (mobj.group(3), month_by_abbreviation(mobj.group(2)), mobj.group(1))
filesize = parse_filesize(self._html_search_regex(
r'>Size: ([^<]+)<', webpage, 'file size', fatal=False))
duration = int_or_none(self._html_search_regex(
@@ -42,12 +57,12 @@ class XboxClipsIE(InfoExtractor):
view_count = int_or_none(self._html_search_regex(
r'>Views: (\d+)<', webpage, 'view count', fatal=False))
- return {
+ info.update({
'id': video_id,
- 'url': video_url,
'title': title,
'upload_date': upload_date,
'filesize_approx': filesize,
'duration': duration,
'view_count': view_count,
- }
+ })
+ return info
diff --git a/youtube_dl/extractor/xfileshare.py b/youtube_dl/extractor/xfileshare.py
index cbd5d1cbb..df9efa9fa 100644
--- a/youtube_dl/extractor/xfileshare.py
+++ b/youtube_dl/extractor/xfileshare.py
@@ -58,6 +58,7 @@ class XFileShareIE(InfoExtractor):
(r'vidlocker\.xyz', 'VidLocker'),
(r'vidshare\.tv', 'VidShare'),
(r'vup\.to', 'VUp'),
+ (r'wolfstream\.tv', 'WolfStream'),
(r'xvideosharing\.com', 'XVideoSharing'),
)
@@ -82,6 +83,9 @@ class XFileShareIE(InfoExtractor):
}, {
'url': 'https://aparat.cam/n4d6dh0wvlpr',
'only_matching': True,
+ }, {
+ 'url': 'https://wolfstream.tv/nthme29v9u2x',
+ 'only_matching': True,
}]
@staticmethod
diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py
index 76aeaf9a4..f73b9778f 100644
--- a/youtube_dl/extractor/xhamster.py
+++ b/youtube_dl/extractor/xhamster.py
@@ -11,11 +11,14 @@ from ..utils import (
dict_get,
extract_attributes,
ExtractorError,
+ float_or_none,
int_or_none,
parse_duration,
+ str_or_none,
try_get,
unified_strdate,
url_or_none,
+ urljoin,
)
@@ -146,36 +149,89 @@ class XHamsterIE(InfoExtractor):
video = initials['videoModel']
title = video['title']
formats = []
- for format_id, formats_dict in video['sources'].items():
+ format_urls = set()
+ format_sizes = {}
+ sources = try_get(video, lambda x: x['sources'], dict) or {}
+ for format_id, formats_dict in sources.items():
if not isinstance(formats_dict, dict):
continue
+ download_sources = try_get(sources, lambda x: x['download'], dict) or {}
+ for quality, format_dict in download_sources.items():
+ if not isinstance(format_dict, dict):
+ continue
+ format_sizes[quality] = float_or_none(format_dict.get('size'))
for quality, format_item in formats_dict.items():
if format_id == 'download':
# Download link takes some time to be generated,
# skipping for now
continue
- if not isinstance(format_item, dict):
- continue
- format_url = format_item.get('link')
- filesize = int_or_none(
- format_item.get('size'), invscale=1000000)
- else:
- format_url = format_item
- filesize = None
+ format_url = format_item
format_url = url_or_none(format_url)
- if not format_url:
+ if not format_url or format_url in format_urls:
continue
+ format_urls.add(format_url)
formats.append({
'format_id': '%s-%s' % (format_id, quality),
'url': format_url,
'ext': determine_ext(format_url, 'mp4'),
'height': get_height(quality),
- 'filesize': filesize,
+ 'filesize': format_sizes.get(quality),
'http_headers': {
'Referer': urlh.geturl(),
},
})
- self._sort_formats(formats)
+ xplayer_sources = try_get(
+ initials, lambda x: x['xplayerSettings']['sources'], dict)
+ if xplayer_sources:
+ hls_sources = xplayer_sources.get('hls')
+ if isinstance(hls_sources, dict):
+ for hls_format_key in ('url', 'fallback'):
+ hls_url = hls_sources.get(hls_format_key)
+ if not hls_url:
+ continue
+ hls_url = urljoin(url, hls_url)
+ if not hls_url or hls_url in format_urls:
+ continue
+ format_urls.add(hls_url)
+ formats.extend(self._extract_m3u8_formats(
+ hls_url, video_id, 'mp4', entry_protocol='m3u8_native',
+ m3u8_id='hls', fatal=False))
+ standard_sources = xplayer_sources.get('standard')
+ if isinstance(standard_sources, dict):
+ for format_id, formats_list in standard_sources.items():
+ if not isinstance(formats_list, list):
+ continue
+ for standard_format in formats_list:
+ if not isinstance(standard_format, dict):
+ continue
+ for standard_format_key in ('url', 'fallback'):
+ standard_url = standard_format.get(standard_format_key)
+ if not standard_url:
+ continue
+ standard_url = urljoin(url, standard_url)
+ if not standard_url or standard_url in format_urls:
+ continue
+ format_urls.add(standard_url)
+ ext = determine_ext(standard_url, 'mp4')
+ if ext == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ standard_url, video_id, 'mp4', entry_protocol='m3u8_native',
+ m3u8_id='hls', fatal=False))
+ continue
+ quality = (str_or_none(standard_format.get('quality'))
+ or str_or_none(standard_format.get('label'))
+ or '')
+ formats.append({
+ 'format_id': '%s-%s' % (format_id, quality),
+ 'url': standard_url,
+ 'ext': ext,
+ 'height': get_height(quality),
+ 'filesize': format_sizes.get(quality),
+ 'http_headers': {
+ 'Referer': standard_url,
+ },
+ })
+ self._sort_formats(formats, field_preference=('height', 'width', 'tbr', 'format_id'))
categories_list = video.get('categories')
if isinstance(categories_list, list):
diff --git a/youtube_dl/extractor/xtube.py b/youtube_dl/extractor/xtube.py
index 18969058f..7246409e3 100644
--- a/youtube_dl/extractor/xtube.py
+++ b/youtube_dl/extractor/xtube.py
@@ -11,6 +11,7 @@ from ..utils import (
parse_duration,
sanitized_Request,
str_to_int,
+ url_or_none,
)
@@ -87,10 +88,10 @@ class XTubeIE(InfoExtractor):
'Cookie': 'age_verified=1; cookiesAccepted=1',
})
- title, thumbnail, duration = [None] * 3
+ title, thumbnail, duration, sources, media_definition = [None] * 5
config = self._parse_json(self._search_regex(
- r'playerConf\s*=\s*({.+?})\s*,\s*(?:\n|loaderConf)', webpage, 'config',
+ r'playerConf\s*=\s*({.+?})\s*,\s*(?:\n|loaderConf|playerWrapper)', webpage, 'config',
default='{}'), video_id, transform_source=js_to_json, fatal=False)
if config:
config = config.get('mainRoll')
@@ -99,20 +100,52 @@ class XTubeIE(InfoExtractor):
thumbnail = config.get('poster')
duration = int_or_none(config.get('duration'))
sources = config.get('sources') or config.get('format')
+ media_definition = config.get('mediaDefinition')
- if not isinstance(sources, dict):
+ if not isinstance(sources, dict) and not media_definition:
sources = self._parse_json(self._search_regex(
r'(["\'])?sources\1?\s*:\s*(?P<sources>{.+?}),',
webpage, 'sources', group='sources'), video_id,
transform_source=js_to_json)
formats = []
- for format_id, format_url in sources.items():
- formats.append({
- 'url': format_url,
- 'format_id': format_id,
- 'height': int_or_none(format_id),
- })
+ format_urls = set()
+
+ if isinstance(sources, dict):
+ for format_id, format_url in sources.items():
+ format_url = url_or_none(format_url)
+ if not format_url:
+ continue
+ if format_url in format_urls:
+ continue
+ format_urls.add(format_url)
+ formats.append({
+ 'url': format_url,
+ 'format_id': format_id,
+ 'height': int_or_none(format_id),
+ })
+
+ if isinstance(media_definition, list):
+ for media in media_definition:
+ video_url = url_or_none(media.get('videoUrl'))
+ if not video_url:
+ continue
+ if video_url in format_urls:
+ continue
+ format_urls.add(video_url)
+ format_id = media.get('format')
+ if format_id == 'hls':
+ formats.extend(self._extract_m3u8_formats(
+ video_url, video_id, 'mp4', entry_protocol='m3u8_native',
+ m3u8_id='hls', fatal=False))
+ elif format_id == 'mp4':
+ height = int_or_none(media.get('quality'))
+ formats.append({
+ 'url': video_url,
+ 'format_id': '%s-%d' % (format_id, height) if height else format_id,
+ 'height': height,
+ })
+
self._remove_duplicate_formats(formats)
self._sort_formats(formats)
diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py
index e4615376c..a17b10d6e 100644
--- a/youtube_dl/extractor/yahoo.py
+++ b/youtube_dl/extractor/yahoo.py
@@ -177,46 +177,9 @@ class YahooIE(InfoExtractor):
'only_matching': True,
}]
- def _real_extract(self, url):
- url, country, display_id = re.match(self._VALID_URL, url).groups()
- if not country:
- country = 'us'
- else:
- country = country.split('-')[0]
- api_base = 'https://%s.yahoo.com/_td/api/resource/' % country
-
- for i, uuid in enumerate(['url=' + url, 'ymedia-alias=' + display_id]):
- content = self._download_json(
- api_base + 'content;getDetailView=true;uuids=["%s"]' % uuid,
- display_id, 'Downloading content JSON metadata', fatal=i == 1)
- if content:
- item = content['items'][0]
- break
-
- if item.get('type') != 'video':
- entries = []
-
- cover = item.get('cover') or {}
- if cover.get('type') == 'yvideo':
- cover_url = cover.get('url')
- if cover_url:
- entries.append(self.url_result(
- cover_url, 'Yahoo', cover.get('uuid')))
-
- for e in item.get('body', []):
- if e.get('type') == 'videoIframe':
- iframe_url = e.get('url')
- if not iframe_url:
- continue
- entries.append(self.url_result(iframe_url))
-
- return self.playlist_result(
- entries, item.get('uuid'),
- item.get('title'), item.get('summary'))
-
- video_id = item['uuid']
+ def _extract_yahoo_video(self, video_id, country):
video = self._download_json(
- api_base + 'VideoService.videos;view=full;video_ids=["%s"]' % video_id,
+ 'https://%s.yahoo.com/_td/api/resource/VideoService.videos;view=full;video_ids=["%s"]' % (country, video_id),
video_id, 'Downloading video JSON metadata')[0]
title = video['title']
@@ -298,7 +261,6 @@ class YahooIE(InfoExtractor):
'id': video_id,
'title': self._live_title(title) if is_live else title,
'formats': formats,
- 'display_id': display_id,
'thumbnails': thumbnails,
'description': clean_html(video.get('description')),
'timestamp': parse_iso8601(video.get('publish_time')),
@@ -311,6 +273,44 @@ class YahooIE(InfoExtractor):
'episode_number': int_or_none(series_info.get('episode_number')),
}
+ def _real_extract(self, url):
+ url, country, display_id = re.match(self._VALID_URL, url).groups()
+ if not country:
+ country = 'us'
+ else:
+ country = country.split('-')[0]
+
+ item = self._download_json(
+ 'https://%s.yahoo.com/caas/content/article' % country, display_id,
+ 'Downloading content JSON metadata', query={
+ 'url': url
+ })['items'][0]['data']['partnerData']
+
+ if item.get('type') != 'video':
+ entries = []
+
+ cover = item.get('cover') or {}
+ if cover.get('type') == 'yvideo':
+ cover_url = cover.get('url')
+ if cover_url:
+ entries.append(self.url_result(
+ cover_url, 'Yahoo', cover.get('uuid')))
+
+ for e in (item.get('body') or []):
+ if e.get('type') == 'videoIframe':
+ iframe_url = e.get('url')
+ if not iframe_url:
+ continue
+ entries.append(self.url_result(iframe_url))
+
+ return self.playlist_result(
+ entries, item.get('uuid'),
+ item.get('title'), item.get('summary'))
+
+ info = self._extract_yahoo_video(item['uuid'], country)
+ info['display_id'] = display_id
+ return info
+
class YahooSearchIE(SearchInfoExtractor):
IE_DESC = 'Yahoo screen search'
diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py
index 7893f363e..84969f8e1 100644
--- a/youtube_dl/extractor/yandexmusic.py
+++ b/youtube_dl/extractor/yandexmusic.py
@@ -1,8 +1,9 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
import hashlib
+import itertools
+import re
from .common import InfoExtractor
from ..compat import compat_str
@@ -209,17 +210,27 @@ class YandexMusicPlaylistBaseIE(YandexMusicBaseIE):
missing_track_ids = [
track_id for track_id in track_ids
if track_id not in present_track_ids]
- missing_tracks = self._call_api(
- 'track-entries', tld, url, item_id,
- 'Downloading missing tracks JSON', {
- 'entries': ','.join(missing_track_ids),
- 'lang': tld,
- 'external-domain': 'music.yandex.%s' % tld,
- 'overembed': 'false',
- 'strict': 'true',
- })
- if missing_tracks:
- tracks.extend(missing_tracks)
+ # Request missing tracks in chunks to avoid exceeding max HTTP header size,
+ # see https://github.com/ytdl-org/youtube-dl/issues/27355
+ _TRACKS_PER_CHUNK = 250
+ for chunk_num in itertools.count(0):
+ start = chunk_num * _TRACKS_PER_CHUNK
+ end = start + _TRACKS_PER_CHUNK
+ missing_track_ids_req = missing_track_ids[start:end]
+ assert missing_track_ids_req
+ missing_tracks = self._call_api(
+ 'track-entries', tld, url, item_id,
+ 'Downloading missing tracks JSON chunk %d' % (chunk_num + 1), {
+ 'entries': ','.join(missing_track_ids_req),
+ 'lang': tld,
+ 'external-domain': 'music.yandex.%s' % tld,
+ 'overembed': 'false',
+ 'strict': 'true',
+ })
+ if missing_tracks:
+ tracks.extend(missing_tracks)
+ if end >= len(missing_track_ids):
+ break
return tracks
diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py
index 61d1ab209..880c89687 100644
--- a/youtube_dl/extractor/youku.py
+++ b/youtube_dl/extractor/youku.py
@@ -154,7 +154,7 @@ class YoukuIE(InfoExtractor):
# request basic data
basic_data_params = {
'vid': video_id,
- 'ccode': '0590',
+ 'ccode': '0532',
'client_ip': '192.168.1.1',
'utid': cna,
'client_ts': time.time() / 1000,
diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py
index 7b9feafeb..7084d3d12 100644
--- a/youtube_dl/extractor/youporn.py
+++ b/youtube_dl/extractor/youporn.py
@@ -4,13 +4,12 @@ import re
from .common import InfoExtractor
from ..utils import (
+ extract_attributes,
int_or_none,
str_to_int,
- unescapeHTML,
unified_strdate,
url_or_none,
)
-from ..aes import aes_decrypt_text
class YouPornIE(InfoExtractor):
@@ -25,6 +24,7 @@ class YouPornIE(InfoExtractor):
'title': 'Sex Ed: Is It Safe To Masturbate Daily?',
'description': 'Love & Sex Answers: http://bit.ly/DanAndJenn -- Is It Unhealthy To Masturbate Daily?',
'thumbnail': r're:^https?://.*\.jpg$',
+ 'duration': 210,
'uploader': 'Ask Dan And Jennifer',
'upload_date': '20101217',
'average_rating': int,
@@ -33,6 +33,7 @@ class YouPornIE(InfoExtractor):
'tags': list,
'age_limit': 18,
},
+ 'skip': 'This video has been disabled',
}, {
# Unknown uploader
'url': 'http://www.youporn.com/watch/561726/big-tits-awesome-brunette-on-amazing-webcam-show/?from=related3&al=2&from_id=561726&pos=4',
@@ -54,12 +55,16 @@ class YouPornIE(InfoExtractor):
'params': {
'skip_download': True,
},
+ 'skip': '404',
}, {
'url': 'https://www.youporn.com/embed/505835/sex-ed-is-it-safe-to-masturbate-daily/',
'only_matching': True,
}, {
'url': 'http://www.youporn.com/watch/505835',
'only_matching': True,
+ }, {
+ 'url': 'https://www.youporn.com/watch/13922959/femdom-principal/',
+ 'only_matching': True,
}]
@staticmethod
@@ -73,74 +78,50 @@ class YouPornIE(InfoExtractor):
video_id = mobj.group('id')
display_id = mobj.group('display_id') or video_id
- webpage = self._download_webpage(
- 'http://www.youporn.com/watch/%s' % video_id, display_id,
- headers={'Cookie': 'age_verified=1'})
-
- title = self._html_search_regex(
- r'(?s)<div[^>]+class=["\']watchVideoTitle[^>]+>(.+?)</div>',
- webpage, 'title', default=None) or self._og_search_title(
- webpage, default=None) or self._html_search_meta(
- 'title', webpage, fatal=True)
-
- links = []
-
- # Main source
- definitions = self._parse_json(
- self._search_regex(
- r'mediaDefinition\s*=\s*(\[.+?\]);', webpage,
- 'media definitions', default='[]'),
- video_id, fatal=False)
- if definitions:
- for definition in definitions:
- if not isinstance(definition, dict):
- continue
- video_url = url_or_none(definition.get('videoUrl'))
- if video_url:
- links.append(video_url)
-
- # Fallback #1, this also contains extra low quality 180p format
- for _, link in re.findall(r'<a[^>]+href=(["\'])(http.+?)\1[^>]+title=["\']Download [Vv]ideo', webpage):
- links.append(link)
-
- # Fallback #2 (unavailable as at 22.06.2017)
- sources = self._search_regex(
- r'(?s)sources\s*:\s*({.+?})', webpage, 'sources', default=None)
- if sources:
- for _, link in re.findall(r'[^:]+\s*:\s*(["\'])(http.+?)\1', sources):
- links.append(link)
-
- # Fallback #3 (unavailable as at 22.06.2017)
- for _, link in re.findall(
- r'(?:videoSrc|videoIpadUrl|html5PlayerSrc)\s*[:=]\s*(["\'])(http.+?)\1', webpage):
- links.append(link)
-
- # Fallback #4, encrypted links (unavailable as at 22.06.2017)
- for _, encrypted_link in re.findall(
- r'encryptedQuality\d{3,4}URL\s*=\s*(["\'])([\da-zA-Z+/=]+)\1', webpage):
- links.append(aes_decrypt_text(encrypted_link, title, 32).decode('utf-8'))
+ definitions = self._download_json(
+ 'https://www.youporn.com/api/video/media_definitions/%s/' % video_id,
+ display_id)
formats = []
- for video_url in set(unescapeHTML(link) for link in links):
+ for definition in definitions:
+ if not isinstance(definition, dict):
+ continue
+ video_url = url_or_none(definition.get('videoUrl'))
+ if not video_url:
+ continue
f = {
'url': video_url,
+ 'filesize': int_or_none(definition.get('videoSize')),
}
+ height = int_or_none(definition.get('quality'))
# Video URL's path looks like this:
# /201012/17/505835/720p_1500k_505835/YouPorn%20-%20Sex%20Ed%20Is%20It%20Safe%20To%20Masturbate%20Daily.mp4
# /201012/17/505835/vl_240p_240k_505835/YouPorn%20-%20Sex%20Ed%20Is%20It%20Safe%20To%20Masturbate%20Daily.mp4
+ # /videos/201703/11/109285532/1080P_4000K_109285532.mp4
# We will benefit from it by extracting some metadata
- mobj = re.search(r'(?P<height>\d{3,4})[pP]_(?P<bitrate>\d+)[kK]_\d+/', video_url)
+ mobj = re.search(r'(?P<height>\d{3,4})[pP]_(?P<bitrate>\d+)[kK]_\d+', video_url)
if mobj:
- height = int(mobj.group('height'))
+ if not height:
+ height = int(mobj.group('height'))
bitrate = int(mobj.group('bitrate'))
f.update({
'format_id': '%dp-%dk' % (height, bitrate),
- 'height': height,
'tbr': bitrate,
})
+ f['height'] = height
formats.append(f)
self._sort_formats(formats)
+ webpage = self._download_webpage(
+ 'http://www.youporn.com/watch/%s' % video_id, display_id,
+ headers={'Cookie': 'age_verified=1'})
+
+ title = self._html_search_regex(
+ r'(?s)<div[^>]+class=["\']watchVideoTitle[^>]+>(.+?)</div>',
+ webpage, 'title', default=None) or self._og_search_title(
+ webpage, default=None) or self._html_search_meta(
+ 'title', webpage, fatal=True)
+
description = self._html_search_regex(
r'(?s)<div[^>]+\bid=["\']description["\'][^>]*>(.+?)</div>',
webpage, 'description',
@@ -149,6 +130,8 @@ class YouPornIE(InfoExtractor):
thumbnail = self._search_regex(
r'(?:imageurl\s*=|poster\s*:)\s*(["\'])(?P<thumbnail>.+?)\1',
webpage, 'thumbnail', fatal=False, group='thumbnail')
+ duration = int_or_none(self._html_search_meta(
+ 'video:duration', webpage, 'duration', fatal=False))
uploader = self._html_search_regex(
r'(?s)<div[^>]+class=["\']submitByLink["\'][^>]*>(.+?)</div>',
@@ -161,13 +144,12 @@ class YouPornIE(InfoExtractor):
age_limit = self._rta_search(webpage)
- average_rating = int_or_none(self._search_regex(
- r'<div[^>]+class=["\']videoRatingPercentage["\'][^>]*>(\d+)%</div>',
- webpage, 'average rating', fatal=False))
-
- view_count = str_to_int(self._search_regex(
- r'(?s)<div[^>]+class=(["\']).*?\bvideoInfoViews\b.*?\1[^>]*>.*?(?P<count>[\d,.]+)<',
- webpage, 'view count', fatal=False, group='count'))
+ view_count = None
+ views = self._search_regex(
+ r'(<div[^>]+\bclass=["\']js_videoInfoViews["\']>)', webpage,
+ 'views', default=None)
+ if views:
+ view_count = str_to_int(extract_attributes(views).get('data-value'))
comment_count = str_to_int(self._search_regex(
r'>All [Cc]omments? \(([\d,.]+)\)',
webpage, 'comment count', default=None))
@@ -190,9 +172,9 @@ class YouPornIE(InfoExtractor):
'title': title,
'description': description,
'thumbnail': thumbnail,
+ 'duration': duration,
'uploader': uploader,
'upload_date': upload_date,
- 'average_rating': average_rating,
'view_count': view_count,
'comment_count': comment_count,
'categories': categories,
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index bbd3e80d8..7cd651c67 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -2,41 +2,35 @@
from __future__ import unicode_literals
-
import itertools
import json
import os.path
import random
import re
-import time
import traceback
from .common import InfoExtractor, SearchInfoExtractor
-from ..jsinterp import JSInterpreter
-from ..swfinterp import SWFInterpreter
from ..compat import (
compat_chr,
compat_HTTPError,
compat_parse_qs,
- compat_urllib_parse_unquote,
+ compat_str,
compat_urllib_parse_unquote_plus,
compat_urllib_parse_urlencode,
compat_urllib_parse_urlparse,
compat_urlparse,
- compat_str,
)
+from ..jsinterp import JSInterpreter
from ..utils import (
- bool_or_none,
- clean_html,
- error_to_compat_str,
ExtractorError,
+ clean_html,
+ dict_get,
float_or_none,
- get_element_by_id,
int_or_none,
mimetype2ext,
parse_codecs,
parse_duration,
- remove_quotes,
+ qualities,
remove_start,
smuggle_url,
str_or_none,
@@ -46,13 +40,16 @@ from ..utils import (
unified_strdate,
unsmuggle_url,
update_url_query,
- uppercase_escape,
url_or_none,
urlencode_postdata,
urljoin,
)
+def parse_qs(url):
+ return compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
+
+
class YoutubeBaseInfoExtractor(InfoExtractor):
"""Provide base functions for Youtube extractors"""
_LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
@@ -68,17 +65,6 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
_PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}|RDMM)'
- def _set_language(self):
- self._set_cookie(
- '.youtube.com', 'PREF', 'f1=50000000&f6=8&hl=en',
- # YouTube sets the expire time to about two months
- expire_time=time.time() + 2 * 30 * 24 * 3600)
-
- def _ids_to_results(self, ids):
- return [
- self.url_result(vid_id, 'Youtube', video_id=vid_id)
- for vid_id in ids]
-
def _login(self):
"""
Attempt to log in to YouTube.
@@ -262,10 +248,25 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
return True
+ def _initialize_consent(self):
+ cookies = self._get_cookies('https://www.youtube.com/')
+ if cookies.get('__Secure-3PSID'):
+ return
+ consent_id = None
+ consent = cookies.get('CONSENT')
+ if consent:
+ if 'YES' in consent.value:
+ return
+ consent_id = self._search_regex(
+ r'PENDING\+(\d+)', consent.value, 'consent', default=None)
+ if not consent_id:
+ consent_id = random.randint(100, 999)
+ self._set_cookie('.youtube.com', 'CONSENT', 'YES+cb.20210328-17-p0.en+FX+%s' % consent_id)
+
def _real_initialize(self):
+ self._initialize_consent()
if self._downloader is None:
return
- self._set_language()
if not self._login():
return
@@ -282,19 +283,17 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
_YT_INITIAL_PLAYER_RESPONSE_RE = r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;'
_YT_INITIAL_BOUNDARY_RE = r'(?:var\s+meta|</script|\n)'
- def _call_api(self, ep, query, video_id):
+ def _call_api(self, ep, query, video_id, fatal=True):
data = self._DEFAULT_API_DATA.copy()
data.update(query)
- response = self._download_json(
+ return self._download_json(
'https://www.youtube.com/youtubei/v1/%s' % ep, video_id=video_id,
note='Downloading API JSON', errnote='Unable to download API page',
- data=json.dumps(data).encode('utf8'),
+ data=json.dumps(data).encode('utf8'), fatal=fatal,
headers={'content-type': 'application/json'},
query={'key': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8'})
- return response
-
def _extract_yt_initial_data(self, video_id, webpage):
return self._parse_json(
self._search_regex(
@@ -306,7 +305,39 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
return self._parse_json(
self._search_regex(
r'ytcfg\.set\s*\(\s*({.+?})\s*\)\s*;', webpage, 'ytcfg',
- default='{}'), video_id, fatal=False)
+ default='{}'), video_id, fatal=False) or {}
+
+ def _extract_video(self, renderer):
+ video_id = renderer['videoId']
+ title = try_get(
+ renderer,
+ (lambda x: x['title']['runs'][0]['text'],
+ lambda x: x['title']['simpleText']), compat_str)
+ description = try_get(
+ renderer, lambda x: x['descriptionSnippet']['runs'][0]['text'],
+ compat_str)
+ duration = parse_duration(try_get(
+ renderer, lambda x: x['lengthText']['simpleText'], compat_str))
+ view_count_text = try_get(
+ renderer, lambda x: x['viewCountText']['simpleText'], compat_str) or ''
+ view_count = str_to_int(self._search_regex(
+ r'^([\d,]+)', re.sub(r'\s', '', view_count_text),
+ 'view count', default=None))
+ uploader = try_get(
+ renderer,
+ (lambda x: x['ownerText']['runs'][0]['text'],
+ lambda x: x['shortBylineText']['runs'][0]['text']), compat_str)
+ return {
+ '_type': 'url',
+ 'ie_key': YoutubeIE.ie_key(),
+ 'id': video_id,
+ 'url': video_id,
+ 'title': title,
+ 'description': description,
+ 'duration': duration,
+ 'view_count': view_count,
+ 'uploader': uploader,
+ }
def _search_results(self, query, params):
data = {
@@ -337,43 +368,26 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
list)
if not slr_contents:
break
- isr_contents = try_get(
- slr_contents,
- lambda x: x[0]['itemSectionRenderer']['contents'],
- list)
- if not isr_contents:
- break
- for content in isr_contents:
- if not isinstance(content, dict):
- continue
- video = content.get('videoRenderer')
- if not isinstance(video, dict):
- continue
- video_id = video.get('videoId')
- if not video_id:
+ for slr_content in slr_contents:
+ isr_contents = try_get(
+ slr_content,
+ lambda x: x['itemSectionRenderer']['contents'],
+ list)
+ if not isr_contents:
continue
- title = try_get(video, lambda x: x['title']['runs'][0]['text'], compat_str)
- description = try_get(video, lambda x: x['descriptionSnippet']['runs'][0]['text'], compat_str)
- duration = parse_duration(try_get(video, lambda x: x['lengthText']['simpleText'], compat_str))
- view_count_text = try_get(video, lambda x: x['viewCountText']['simpleText'], compat_str) or ''
- view_count = int_or_none(self._search_regex(
- r'^(\d+)', re.sub(r'\s', '', view_count_text),
- 'view count', default=None))
- uploader = try_get(video, lambda x: x['ownerText']['runs'][0]['text'], compat_str)
- yield {
- '_type': 'url_transparent',
- 'ie_key': YoutubeIE.ie_key(),
- 'id': video_id,
- 'url': video_id,
- 'title': title,
- 'description': description,
- 'duration': duration,
- 'view_count': view_count,
- 'uploader': uploader,
- }
+ for content in isr_contents:
+ if not isinstance(content, dict):
+ continue
+ video = content.get('videoRenderer')
+ if not isinstance(video, dict):
+ continue
+ video_id = video.get('videoId')
+ if not video_id:
+ continue
+ yield self._extract_video(video)
token = try_get(
slr_contents,
- lambda x: x[1]['continuationItemRenderer']['continuationEndpoint']['continuationCommand']['token'],
+ lambda x: x[-1]['continuationItemRenderer']['continuationEndpoint']['continuationCommand']['token'],
compat_str)
if not token:
break
@@ -382,52 +396,76 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
class YoutubeIE(YoutubeBaseInfoExtractor):
IE_DESC = 'YouTube.com'
+ _INVIDIOUS_SITES = (
+ # invidious-redirect websites
+ r'(?:www\.)?redirect\.invidious\.io',
+ r'(?:(?:www|dev)\.)?invidio\.us',
+ # Invidious instances taken from https://github.com/iv-org/documentation/blob/master/Invidious-Instances.md
+ r'(?:(?:www|no)\.)?invidiou\.sh',
+ r'(?:(?:www|fi)\.)?invidious\.snopyta\.org',
+ r'(?:www\.)?invidious\.kabi\.tk',
+ r'(?:www\.)?invidious\.13ad\.de',
+ r'(?:www\.)?invidious\.mastodon\.host',
+ r'(?:www\.)?invidious\.zapashcanon\.fr',
+ r'(?:www\.)?(?:invidious(?:-us)?|piped)\.kavin\.rocks',
+ r'(?:www\.)?invidious\.tinfoil-hat\.net',
+ r'(?:www\.)?invidious\.himiko\.cloud',
+ r'(?:www\.)?invidious\.reallyancient\.tech',
+ r'(?:www\.)?invidious\.tube',
+ r'(?:www\.)?invidiou\.site',
+ r'(?:www\.)?invidious\.site',
+ r'(?:www\.)?invidious\.xyz',
+ r'(?:www\.)?invidious\.nixnet\.xyz',
+ r'(?:www\.)?invidious\.048596\.xyz',
+ r'(?:www\.)?invidious\.drycat\.fr',
+ r'(?:www\.)?inv\.skyn3t\.in',
+ r'(?:www\.)?tube\.poal\.co',
+ r'(?:www\.)?tube\.connect\.cafe',
+ r'(?:www\.)?vid\.wxzm\.sx',
+ r'(?:www\.)?vid\.mint\.lgbt',
+ r'(?:www\.)?vid\.puffyan\.us',
+ r'(?:www\.)?yewtu\.be',
+ r'(?:www\.)?yt\.elukerio\.org',
+ r'(?:www\.)?yt\.lelux\.fi',
+ r'(?:www\.)?invidious\.ggc-project\.de',
+ r'(?:www\.)?yt\.maisputain\.ovh',
+ r'(?:www\.)?ytprivate\.com',
+ r'(?:www\.)?invidious\.13ad\.de',
+ r'(?:www\.)?invidious\.toot\.koeln',
+ r'(?:www\.)?invidious\.fdn\.fr',
+ r'(?:www\.)?watch\.nettohikari\.com',
+ r'(?:www\.)?invidious\.namazso\.eu',
+ r'(?:www\.)?invidious\.silkky\.cloud',
+ r'(?:www\.)?invidious\.exonip\.de',
+ r'(?:www\.)?invidious\.riverside\.rocks',
+ r'(?:www\.)?invidious\.blamefran\.net',
+ r'(?:www\.)?invidious\.moomoo\.de',
+ r'(?:www\.)?ytb\.trom\.tf',
+ r'(?:www\.)?yt\.cyberhost\.uk',
+ r'(?:www\.)?kgg2m7yk5aybusll\.onion',
+ r'(?:www\.)?qklhadlycap4cnod\.onion',
+ r'(?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion',
+ r'(?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion',
+ r'(?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion',
+ r'(?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion',
+ r'(?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p',
+ r'(?:www\.)?4l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd\.onion',
+ r'(?:www\.)?w6ijuptxiku4xpnnaetxvnkc5vqcdu7mgns2u77qefoixi63vbvnpnqd\.onion',
+ r'(?:www\.)?kbjggqkzv65ivcqj6bumvp337z6264huv5kpkwuv6gu5yjiskvan7fad\.onion',
+ r'(?:www\.)?grwp24hodrefzvjjuccrkw3mjq4tzhaaq32amf33dzpmuxe7ilepcmad\.onion',
+ r'(?:www\.)?hpniueoejy4opn7bc4ftgazyqjoeqwlvh2uiku2xqku6zpoa4bf5ruid\.onion',
+ )
_VALID_URL = r"""(?x)^
(
(?:https?://|//) # http(s):// or protocol-independent URL
- (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie|kids)?\.com/|
- (?:www\.)?deturl\.com/www\.youtube\.com/|
- (?:www\.)?pwnyoutube\.com/|
- (?:www\.)?hooktube\.com/|
- (?:www\.)?yourepeat\.com/|
- tube\.majestyc\.net/|
- # Invidious instances taken from https://github.com/omarroth/invidious/wiki/Invidious-Instances
- (?:(?:www|dev)\.)?invidio\.us/|
- (?:(?:www|no)\.)?invidiou\.sh/|
- (?:(?:www|fi)\.)?invidious\.snopyta\.org/|
- (?:www\.)?invidious\.kabi\.tk/|
- (?:www\.)?invidious\.13ad\.de/|
- (?:www\.)?invidious\.mastodon\.host/|
- (?:www\.)?invidious\.zapashcanon\.fr/|
- (?:www\.)?invidious\.kavin\.rocks/|
- (?:www\.)?invidious\.tube/|
- (?:www\.)?invidiou\.site/|
- (?:www\.)?invidious\.site/|
- (?:www\.)?invidious\.xyz/|
- (?:www\.)?invidious\.nixnet\.xyz/|
- (?:www\.)?invidious\.drycat\.fr/|
- (?:www\.)?tube\.poal\.co/|
- (?:www\.)?tube\.connect\.cafe/|
- (?:www\.)?vid\.wxzm\.sx/|
- (?:www\.)?vid\.mint\.lgbt/|
- (?:www\.)?yewtu\.be/|
- (?:www\.)?yt\.elukerio\.org/|
- (?:www\.)?yt\.lelux\.fi/|
- (?:www\.)?invidious\.ggc-project\.de/|
- (?:www\.)?yt\.maisputain\.ovh/|
- (?:www\.)?invidious\.13ad\.de/|
- (?:www\.)?invidious\.toot\.koeln/|
- (?:www\.)?invidious\.fdn\.fr/|
- (?:www\.)?watch\.nettohikari\.com/|
- (?:www\.)?kgg2m7yk5aybusll\.onion/|
- (?:www\.)?qklhadlycap4cnod\.onion/|
- (?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion/|
- (?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion/|
- (?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion/|
- (?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion/|
- (?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p/|
- (?:www\.)?4l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd\.onion/|
- youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
+ (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie|kids)?\.com|
+ (?:www\.)?deturl\.com/www\.youtube\.com|
+ (?:www\.)?pwnyoutube\.com|
+ (?:www\.)?hooktube\.com|
+ (?:www\.)?yourepeat\.com|
+ tube\.majestyc\.net|
+ %(invidious)s|
+ youtube\.googleapis\.com)/ # the various hostnames, with wildcard subdomains
(?:.*?\#/)? # handle anchor (#/) redirect urls
(?: # the various things that can precede the ID:
(?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
@@ -442,129 +480,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
youtu\.be| # just youtu.be/xxxx
vid\.plus| # or vid.plus/xxxx
zwearz\.com/watch| # or zwearz.com/watch/xxxx
+ %(invidious)s
)/
|(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
)
)? # all until now is optional -> you can pass the naked ID
- (?P<id>[0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
- (?!.*?\blist=
- (?:
- %(playlist_id)s| # combined list/video URLs are handled by the playlist IE
- WL # WL are handled by the watch later IE
- )
- )
+ (?P<id>[0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
(?(1).+)? # if we found the ID, everything can follow
- $""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
- _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
+ $""" % {
+ 'invidious': '|'.join(_INVIDIOUS_SITES),
+ }
_PLAYER_INFO_RE = (
- r'/(?P<id>[a-zA-Z0-9_-]{8,})/player_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?/base\.(?P<ext>[a-z]+)$',
- r'\b(?P<id>vfl[a-zA-Z0-9_-]+)\b.*?\.(?P<ext>[a-z]+)$',
+ r'/s/player/(?P<id>[a-zA-Z0-9_-]{8,})/player',
+ r'/(?P<id>[a-zA-Z0-9_-]{8,})/player(?:_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?|-plasma-ias-(?:phone|tablet)-[a-z]{2}_[A-Z]{2}\.vflset)/base\.js$',
+ r'\b(?P<id>vfl[a-zA-Z0-9_-]+)\b.*?\.js$',
)
- _formats = {
- '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
- '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
- '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},
- '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'},
- '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'},
- '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
- '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
- '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
- # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well
- '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},
- '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
- '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
- '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
- '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
- '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
- '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
- '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
- '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
-
-
- # 3D videos
- '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
- '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
- '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
- '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
- '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20},
- '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
- '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
-
- # Apple HTTP Live Streaming
- '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
- '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
- '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
- '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
- '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
- '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
- '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
- '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10},
-
- # DASH mp4 video
- '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'},
- '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'},
- '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
- '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'},
- '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'},
- '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559)
- '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'},
- '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
- '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'},
- '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
- '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
- '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'},
-
- # Dash mp4 audio
- '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'},
- '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'},
- '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'},
- '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
- '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
- '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'},
- '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'},
-
- # Dash webm
- '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
- '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
- '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
- '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
- '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
- '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
- '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'},
- '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'},
- '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'},
- '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
- '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
- '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
- '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'},
- '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'},
- '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'},
- # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
- '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
- '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
- '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
- '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
- '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
- '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
-
- # Dash webm audio
- '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128},
- '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256},
-
- # Dash webm audio with opus inside
- '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50},
- '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70},
- '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160},
-
- # RTMP (unnamed)
- '_rtmp': {'protocol': 'rtmp'},
-
- # av01 video only formats sometimes served with "unknown" codecs
- '394': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
- '395': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
- '396': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
- '397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
- }
_SUBTITLE_FORMATS = ('srv1', 'srv2', 'srv3', 'ttml', 'vtt')
_GEO_BYPASS = False
@@ -607,7 +537,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader_id': 'setindia',
'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia',
'age_limit': 18,
- }
+ },
+ 'skip': 'Private video',
},
{
'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=yZIXLfi8CZQ',
@@ -663,6 +594,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader': 'AfrojackVEVO',
'uploader_id': 'AfrojackVEVO',
'upload_date': '20131011',
+ 'abr': 129.495,
},
'params': {
'youtube_include_dash_manifest': True,
@@ -681,7 +613,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader_id': 'TheAmazingAtheist',
'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist',
'title': 'Burning Everyone\'s Koran',
- 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
+ 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms \r\n\r\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
}
},
# Normal age-gate video (No vevo, embed allowed), available via embed page
@@ -717,11 +649,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'upload_date': '20100430',
'uploader_id': 'deadmau5',
'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5',
- 'creator': 'Dada Life, deadmau5',
- 'description': 'md5:12c56784b8032162bb936a5f76d55360',
+ 'creator': 'deadmau5',
+ 'description': 'md5:6cbcd3a92ce1bc676fc4d6ab4ace2336',
'uploader': 'deadmau5',
'title': 'Deadmau5 - Some Chords (HD)',
- 'alt_title': 'This Machine Kills Some Chords',
+ 'alt_title': 'Some Chords',
},
'expected_warnings': [
'DASH manifest missing',
@@ -816,69 +748,64 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
},
{
# Multifeed videos (multiple cameras), URL is for Main Camera
- 'url': 'https://www.youtube.com/watch?v=jqWvoWXjCVs',
+ 'url': 'https://www.youtube.com/watch?v=jvGDaLqkpTg',
'info_dict': {
- 'id': 'jqWvoWXjCVs',
- 'title': 'teamPGP: Rocket League Noob Stream',
- 'description': 'md5:dc7872fb300e143831327f1bae3af010',
+ 'id': 'jvGDaLqkpTg',
+ 'title': 'Tom Clancy Free Weekend Rainbow Whatever',
+ 'description': 'md5:e03b909557865076822aa169218d6a5d',
},
'playlist': [{
'info_dict': {
- 'id': 'jqWvoWXjCVs',
+ 'id': 'jvGDaLqkpTg',
'ext': 'mp4',
- 'title': 'teamPGP: Rocket League Noob Stream (Main Camera)',
- 'description': 'md5:dc7872fb300e143831327f1bae3af010',
- 'duration': 7335,
- 'upload_date': '20150721',
- 'uploader': 'Beer Games Beer',
- 'uploader_id': 'beergamesbeer',
- 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
- 'license': 'Standard YouTube License',
+ 'title': 'Tom Clancy Free Weekend Rainbow Whatever (Main Camera)',
+ 'description': 'md5:e03b909557865076822aa169218d6a5d',
+ 'duration': 10643,
+ 'upload_date': '20161111',
+ 'uploader': 'Team PGP',
+ 'uploader_id': 'UChORY56LMMETTuGjXaJXvLg',
+ 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UChORY56LMMETTuGjXaJXvLg',
},
}, {
'info_dict': {
- 'id': '6h8e8xoXJzg',
+ 'id': '3AKt1R1aDnw',
'ext': 'mp4',
- 'title': 'teamPGP: Rocket League Noob Stream (kreestuh)',
- 'description': 'md5:dc7872fb300e143831327f1bae3af010',
- 'duration': 7337,
- 'upload_date': '20150721',
- 'uploader': 'Beer Games Beer',
- 'uploader_id': 'beergamesbeer',
- 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
- 'license': 'Standard YouTube License',
+ 'title': 'Tom Clancy Free Weekend Rainbow Whatever (Camera 2)',
+ 'description': 'md5:e03b909557865076822aa169218d6a5d',
+ 'duration': 10991,
+ 'upload_date': '20161111',
+ 'uploader': 'Team PGP',
+ 'uploader_id': 'UChORY56LMMETTuGjXaJXvLg',
+ 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UChORY56LMMETTuGjXaJXvLg',
},
}, {
'info_dict': {
- 'id': 'PUOgX5z9xZw',
+ 'id': 'RtAMM00gpVc',
'ext': 'mp4',
- 'title': 'teamPGP: Rocket League Noob Stream (grizzle)',
- 'description': 'md5:dc7872fb300e143831327f1bae3af010',
- 'duration': 7337,
- 'upload_date': '20150721',
- 'uploader': 'Beer Games Beer',
- 'uploader_id': 'beergamesbeer',
- 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
- 'license': 'Standard YouTube License',
+ 'title': 'Tom Clancy Free Weekend Rainbow Whatever (Camera 3)',
+ 'description': 'md5:e03b909557865076822aa169218d6a5d',
+ 'duration': 10995,
+ 'upload_date': '20161111',
+ 'uploader': 'Team PGP',
+ 'uploader_id': 'UChORY56LMMETTuGjXaJXvLg',
+ 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UChORY56LMMETTuGjXaJXvLg',
},
}, {
'info_dict': {
- 'id': 'teuwxikvS5k',
+ 'id': '6N2fdlP3C5U',
'ext': 'mp4',
- 'title': 'teamPGP: Rocket League Noob Stream (zim)',
- 'description': 'md5:dc7872fb300e143831327f1bae3af010',
- 'duration': 7334,
- 'upload_date': '20150721',
- 'uploader': 'Beer Games Beer',
- 'uploader_id': 'beergamesbeer',
- 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/beergamesbeer',
- 'license': 'Standard YouTube License',
+ 'title': 'Tom Clancy Free Weekend Rainbow Whatever (Camera 4)',
+ 'description': 'md5:e03b909557865076822aa169218d6a5d',
+ 'duration': 10990,
+ 'upload_date': '20161111',
+ 'uploader': 'Team PGP',
+ 'uploader_id': 'UChORY56LMMETTuGjXaJXvLg',
+ 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UChORY56LMMETTuGjXaJXvLg',
},
}],
'params': {
'skip_download': True,
},
- 'skip': 'This video is not available.',
},
{
# Multifeed video with comma in title (see https://github.com/ytdl-org/youtube-dl/issues/8536)
@@ -947,6 +874,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'skip': 'This video does not exist.',
},
{
+ # Video with incomplete 'yt:stretch=16:'
+ 'url': 'https://www.youtube.com/watch?v=FRhJzUSJbGI',
+ 'only_matching': True,
+ },
+ {
# Video licensed under Creative Commons
'url': 'https://www.youtube.com/watch?v=M4gD1WSo5mA',
'info_dict': {
@@ -972,7 +904,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'id': 'eQcmzGIKrzg',
'ext': 'mp4',
'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders',
- 'description': 'md5:dda0d780d5a6e120758d1711d062a867',
+ 'description': 'md5:13a2503d7b5904ef4b223aa101628f39',
'duration': 4060,
'upload_date': '20151119',
'uploader': 'Bernie Sanders',
@@ -1019,7 +951,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'id': 'iqKdEhx-dD4',
'ext': 'mp4',
'title': 'Isolation - Mind Field (Ep 1)',
- 'description': 'md5:46a29be4ceffa65b92d277b93f463c0f',
+ 'description': 'md5:f540112edec5d09fc8cc752d3d4ba3cd',
'duration': 2085,
'upload_date': '20170118',
'uploader': 'Vsauce',
@@ -1054,6 +986,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'params': {
'skip_download': True,
},
+ 'skip': 'This video has been removed for violating YouTube\'s policy on hate speech.',
},
{
# itag 212
@@ -1070,6 +1003,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'only_matching': True,
},
{
+ 'url': 'https://redirect.invidious.io/watch?v=BaW_jenozKc',
+ 'only_matching': True,
+ },
+ {
+ # from https://nitter.pussthecat.org/YouTube/status/1360363141947944964#m
+ 'url': 'https://redirect.invidious.io/Yh0AhrY9GjA',
+ 'only_matching': True,
+ },
+ {
# DRM protected
'url': 'https://www.youtube.com/watch?v=s7_qI6_mIXc',
'only_matching': True,
@@ -1139,6 +1081,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'params': {
'skip_download': True,
},
+ 'skip': 'Video unavailable',
},
{
# empty description results in an empty string
@@ -1184,27 +1127,171 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'url': 'https://www.youtube.com/watch_popup?v=63RmMXCd_bQ',
'only_matching': True,
},
+ {
+ # https://github.com/ytdl-org/youtube-dl/pull/28094
+ 'url': 'OtqTfy26tG0',
+ 'info_dict': {
+ 'id': 'OtqTfy26tG0',
+ 'ext': 'mp4',
+ 'title': 'Burn Out',
+ 'description': 'md5:8d07b84dcbcbfb34bc12a56d968b6131',
+ 'upload_date': '20141120',
+ 'uploader': 'The Cinematic Orchestra - Topic',
+ 'uploader_id': 'UCIzsJBIyo8hhpFm1NK0uLgw',
+ 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCIzsJBIyo8hhpFm1NK0uLgw',
+ 'artist': 'The Cinematic Orchestra',
+ 'track': 'Burn Out',
+ 'album': 'Every Day',
+                'release_date': None,
+ 'release_year': None,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ },
+ {
+ # controversial video, only works with bpctr when authenticated with cookies
+ 'url': 'https://www.youtube.com/watch?v=nGC3D_FkCmg',
+ 'only_matching': True,
+ },
+ {
+ # restricted location, https://github.com/ytdl-org/youtube-dl/issues/28685
+ 'url': 'cBvYw8_A0vQ',
+ 'info_dict': {
+ 'id': 'cBvYw8_A0vQ',
+ 'ext': 'mp4',
+ 'title': '4K Ueno Okachimachi Street Scenes 上野御徒町歩き',
+ 'description': 'md5:ea770e474b7cd6722b4c95b833c03630',
+ 'upload_date': '20201120',
+ 'uploader': 'Walk around Japan',
+ 'uploader_id': 'UC3o_t8PzBmXf5S9b7GLx1Mw',
+ 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UC3o_t8PzBmXf5S9b7GLx1Mw',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ },
]
+ _formats = {
+ '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
+ '6': {'ext': 'flv', 'width': 450, 'height': 270, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'},
+ '13': {'ext': '3gp', 'acodec': 'aac', 'vcodec': 'mp4v'},
+ '17': {'ext': '3gp', 'width': 176, 'height': 144, 'acodec': 'aac', 'abr': 24, 'vcodec': 'mp4v'},
+ '18': {'ext': 'mp4', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 96, 'vcodec': 'h264'},
+ '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
+ '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
+ '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
+ # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well
+ '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'},
+ '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
+ '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'},
+ '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
+ '44': {'ext': 'webm', 'width': 854, 'height': 480, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'},
+ '45': {'ext': 'webm', 'width': 1280, 'height': 720, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
+ '46': {'ext': 'webm', 'width': 1920, 'height': 1080, 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8'},
+ '59': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
+ '78': {'ext': 'mp4', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'},
- def __init__(self, *args, **kwargs):
- super(YoutubeIE, self).__init__(*args, **kwargs)
- self._player_cache = {}
- def report_video_info_webpage_download(self, video_id):
- """Report attempt to download video info webpage."""
- self.to_screen('%s: Downloading video info webpage' % video_id)
+ # 3D videos
+ '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
+ '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -20},
+ '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
+ '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264', 'preference': -20},
+ '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8', 'preference': -20},
+ '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
+ '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},
+
+ # Apple HTTP Live Streaming
+ '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
+ '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
+ '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
+ '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
+ '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
+ '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 256, 'vcodec': 'h264', 'preference': -10},
+ '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
+ '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10},
+
+ # DASH mp4 video
+ '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'},
+ '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'},
+ '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
+ '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'},
+ '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'},
+ '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559)
+ '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'},
+ '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
+ '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'},
+ '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
+ '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
+ '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'},
+
+ # Dash mp4 audio
+ '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'},
+ '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'},
+ '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'},
+ '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
+ '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
+ '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'},
+ '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'},
+
+ # Dash webm
+ '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
+ '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
+ '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
+ '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
+ '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
+ '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
+ '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'},
+ '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'},
+ '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'},
+ '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
+ '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
+ '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
+ '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'},
+ '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'},
+ '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'},
+ # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
+ '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
+ '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
+ '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
+ '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
+ '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
+ '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
+
+ # Dash webm audio
+ '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128},
+ '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256},
- def report_information_extraction(self, video_id):
- """Report attempt to extract video information."""
- self.to_screen('%s: Extracting video information' % video_id)
+ # Dash webm audio with opus inside
+ '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50},
+ '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70},
+ '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160},
- def report_unavailable_format(self, video_id, format):
- """Report extracted video URL."""
- self.to_screen('%s: Format %s not available' % (video_id, format))
+ # RTMP (unnamed)
+ '_rtmp': {'protocol': 'rtmp'},
- def report_rtmp_download(self):
- """Indicate the download will use the RTMP protocol."""
- self.to_screen('RTMP download detected')
+ # av01 video only formats sometimes served with "unknown" codecs
+ '394': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
+ '395': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
+ '396': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
+ '397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
+ }
+
+ @classmethod
+ def suitable(cls, url):
+ # Hack for lazy extractors until more generic solution is implemented
+ # (see #28780)
+ from .youtube import parse_qs
+ qs = parse_qs(url)
+ if qs.get('list', [None])[0]:
+ return False
+ return super(YoutubeIE, cls).suitable(url)
+
+ def __init__(self, *args, **kwargs):
+ super(YoutubeIE, self).__init__(*args, **kwargs)
+ self._code_cache = {}
+ self._player_cache = {}
def _signature_cache_id(self, example_sig):
""" Return a string representation of a signature """
@@ -1218,40 +1305,27 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
break
else:
raise ExtractorError('Cannot identify player %r' % player_url)
- return id_m.group('ext'), id_m.group('id')
+ return id_m.group('id')
def _extract_signature_function(self, video_id, player_url, example_sig):
- player_type, player_id = self._extract_player_info(player_url)
+ player_id = self._extract_player_info(player_url)
# Read from filesystem cache
- func_id = '%s_%s_%s' % (
- player_type, player_id, self._signature_cache_id(example_sig))
+ func_id = 'js_%s_%s' % (
+ player_id, self._signature_cache_id(example_sig))
assert os.path.basename(func_id) == func_id
cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
if cache_spec is not None:
return lambda s: ''.join(s[i] for i in cache_spec)
- download_note = (
- 'Downloading player %s' % player_url
- if self._downloader.params.get('verbose') else
- 'Downloading %s player %s' % (player_type, player_id)
- )
- if player_type == 'js':
- code = self._download_webpage(
- player_url, video_id,
- note=download_note,
- errnote='Download of %s failed' % player_url)
- res = self._parse_sig_js(code)
- elif player_type == 'swf':
- urlh = self._request_webpage(
+ if player_id not in self._code_cache:
+ self._code_cache[player_id] = self._download_webpage(
player_url, video_id,
- note=download_note,
+ note='Downloading player ' + player_id,
errnote='Download of %s failed' % player_url)
- code = urlh.read()
- res = self._parse_sig_swf(code)
- else:
- assert False, 'Invalid player type %r' % player_type
+ code = self._code_cache[player_id]
+ res = self._parse_sig_js(code)
test_string = ''.join(map(compat_chr, range(len(example_sig))))
cache_res = res(test_string)
@@ -1303,7 +1377,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
funcname = self._search_regex(
(r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
- r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
+ r'\bm=(?P<sig>[a-zA-Z0-9$]{2,})\(decodeURIComponent\(h\.s\)\)',
+ r'\bc&&\(c=(?P<sig>[a-zA-Z0-9$]{2,})\(decodeURIComponent\(c\)\)',
+ r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2,})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\);[a-zA-Z0-9$]{2}\.[a-zA-Z0-9$]{2}\(a,\d+\)',
+ r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2,})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
# Obsolete patterns
r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
@@ -1320,14 +1397,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
initial_function = jsi.extract_function(funcname)
return lambda s: initial_function([s])
- def _parse_sig_swf(self, file_contents):
- swfi = SWFInterpreter(file_contents)
- TARGET_CLASSNAME = 'SignatureDecipher'
- searched_class = swfi.extract_class(TARGET_CLASSNAME)
- initial_function = swfi.extract_function(searched_class, 'decipher')
- return lambda s: initial_function([s])
-
- def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
+ def _decrypt_signature(self, s, video_id, player_url):
"""Turn the encrypted s field into a working signature"""
if player_url is None:
@@ -1354,158 +1424,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
raise ExtractorError(
'Signature extraction failed: ' + tb, cause=e)
- def _get_subtitles(self, video_id, webpage):
- try:
- subs_doc = self._download_xml(
- 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
- video_id, note=False)
- except ExtractorError as err:
- self._downloader.report_warning('unable to download video subtitles: %s' % error_to_compat_str(err))
- return {}
-
- sub_lang_list = {}
- for track in subs_doc.findall('track'):
- lang = track.attrib['lang_code']
- if lang in sub_lang_list:
- continue
- sub_formats = []
- for ext in self._SUBTITLE_FORMATS:
- params = compat_urllib_parse_urlencode({
- 'lang': lang,
- 'v': video_id,
- 'fmt': ext,
- 'name': track.attrib['name'].encode('utf-8'),
- })
- sub_formats.append({
- 'url': 'https://www.youtube.com/api/timedtext?' + params,
- 'ext': ext,
- })
- sub_lang_list[lang] = sub_formats
- if not sub_lang_list:
- self._downloader.report_warning('video doesn\'t have subtitles')
- return {}
- return sub_lang_list
-
- def _get_ytplayer_config(self, video_id, webpage):
- patterns = (
- # User data may contain arbitrary character sequences that may affect
- # JSON extraction with regex, e.g. when '};' is contained the second
- # regex won't capture the whole JSON. Yet working around by trying more
- # concrete regex first keeping in mind proper quoted string handling
- # to be implemented in future that will replace this workaround (see
- # https://github.com/ytdl-org/youtube-dl/issues/7468,
- # https://github.com/ytdl-org/youtube-dl/pull/7599)
- r';ytplayer\.config\s*=\s*({.+?});ytplayer',
- r';ytplayer\.config\s*=\s*({.+?});',
- )
- config = self._search_regex(
- patterns, webpage, 'ytplayer.config', default=None)
- if config:
- return self._parse_json(
- uppercase_escape(config), video_id, fatal=False)
-
- def _get_automatic_captions(self, video_id, player_response, player_config):
- """We need the webpage for getting the captions url, pass it as an
- argument to speed up the process."""
- self.to_screen('%s: Looking for automatic captions' % video_id)
- err_msg = 'Couldn\'t find automatic captions for %s' % video_id
- if not (player_response or player_config):
- self._downloader.report_warning(err_msg)
- return {}
- try:
- args = player_config.get('args') if player_config else {}
- caption_url = args.get('ttsurl')
- if caption_url:
- timestamp = args['timestamp']
- # We get the available subtitles
- list_params = compat_urllib_parse_urlencode({
- 'type': 'list',
- 'tlangs': 1,
- 'asrs': 1,
- })
- list_url = caption_url + '&' + list_params
- caption_list = self._download_xml(list_url, video_id)
- original_lang_node = caption_list.find('track')
- if original_lang_node is None:
- self._downloader.report_warning('Video doesn\'t have automatic captions')
- return {}
- original_lang = original_lang_node.attrib['lang_code']
- caption_kind = original_lang_node.attrib.get('kind', '')
-
- sub_lang_list = {}
- for lang_node in caption_list.findall('target'):
- sub_lang = lang_node.attrib['lang_code']
- sub_formats = []
- for ext in self._SUBTITLE_FORMATS:
- params = compat_urllib_parse_urlencode({
- 'lang': original_lang,
- 'tlang': sub_lang,
- 'fmt': ext,
- 'ts': timestamp,
- 'kind': caption_kind,
- })
- sub_formats.append({
- 'url': caption_url + '&' + params,
- 'ext': ext,
- })
- sub_lang_list[sub_lang] = sub_formats
- return sub_lang_list
-
- def make_captions(sub_url, sub_langs):
- parsed_sub_url = compat_urllib_parse_urlparse(sub_url)
- caption_qs = compat_parse_qs(parsed_sub_url.query)
- captions = {}
- for sub_lang in sub_langs:
- sub_formats = []
- for ext in self._SUBTITLE_FORMATS:
- caption_qs.update({
- 'tlang': [sub_lang],
- 'fmt': [ext],
- })
- sub_url = compat_urlparse.urlunparse(parsed_sub_url._replace(
- query=compat_urllib_parse_urlencode(caption_qs, True)))
- sub_formats.append({
- 'url': sub_url,
- 'ext': ext,
- })
- captions[sub_lang] = sub_formats
- return captions
-
- # New captions format as of 22.06.2017
- if player_response:
- renderer = player_response['captions']['playerCaptionsTracklistRenderer']
- base_url = renderer['captionTracks'][0]['baseUrl']
- sub_lang_list = []
- for lang in renderer['translationLanguages']:
- lang_code = lang.get('languageCode')
- if lang_code:
- sub_lang_list.append(lang_code)
- return make_captions(base_url, sub_lang_list)
-
- # Some videos don't provide ttsurl but rather caption_tracks and
- # caption_translation_languages (e.g. 20LmZk1hakA)
- # Does not used anymore as of 22.06.2017
- caption_tracks = args['caption_tracks']
- caption_translation_languages = args['caption_translation_languages']
- caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0]
- sub_lang_list = []
- for lang in caption_translation_languages.split(','):
- lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang))
- sub_lang = lang_qs.get('lc', [None])[0]
- if sub_lang:
- sub_lang_list.append(sub_lang)
- return make_captions(caption_url, sub_lang_list)
- # An extractor error can be raise by the download process if there are
- # no automatic captions but there are subtitles
- except (KeyError, IndexError, ExtractorError):
- self._downloader.report_warning(err_msg)
- return {}
-
- def _mark_watched(self, video_id, video_info, player_response):
+ def _mark_watched(self, video_id, player_response):
playback_url = url_or_none(try_get(
player_response,
- lambda x: x['playbackTracking']['videostatsPlaybackUrl']['baseUrl']) or try_get(
- video_info, lambda x: x['videostats_playback_base_url'][0]))
+ lambda x: x['playbackTracking']['videostatsPlaybackUrl']['baseUrl']))
if not playback_url:
return
parsed_playback_url = compat_urlparse.urlparse(playback_url)
@@ -1572,12 +1494,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
video_id = mobj.group(2)
return video_id
- def _extract_chapters_from_json(self, webpage, video_id, duration):
- if not webpage:
- return
- data = self._extract_yt_initial_data(video_id, webpage)
- if not data or not isinstance(data, dict):
- return
+ def _extract_chapters_from_json(self, data, video_id, duration):
chapters_list = try_get(
data,
lambda x: x['playerOverlays']
@@ -1617,244 +1534,89 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
})
return chapters
- @staticmethod
- def _extract_chapters_from_description(description, duration):
- if not description:
- return None
- chapter_lines = re.findall(
- r'(?:^|<br\s*/>)([^<]*<a[^>]+onclick=["\']yt\.www\.watch\.player\.seekTo[^>]+>(\d{1,2}:\d{1,2}(?::\d{1,2})?)</a>[^>]*)(?=$|<br\s*/>)',
- description)
- if not chapter_lines:
- return None
- chapters = []
- for next_num, (chapter_line, time_point) in enumerate(
- chapter_lines, start=1):
- start_time = parse_duration(time_point)
- if start_time is None:
- continue
- if start_time > duration:
- break
- end_time = (duration if next_num == len(chapter_lines)
- else parse_duration(chapter_lines[next_num][1]))
- if end_time is None:
- continue
- if end_time > duration:
- end_time = duration
- if start_time > end_time:
- break
- chapter_title = re.sub(
- r'<a[^>]+>[^<]+</a>', '', chapter_line).strip(' \t-')
- chapter_title = re.sub(r'\s+', ' ', chapter_title)
- chapters.append({
- 'start_time': start_time,
- 'end_time': end_time,
- 'title': chapter_title,
- })
- return chapters
-
- def _extract_chapters(self, webpage, description, video_id, duration):
- return (self._extract_chapters_from_json(webpage, video_id, duration)
- or self._extract_chapters_from_description(description, duration))
+ def _extract_yt_initial_variable(self, webpage, regex, video_id, name):
+ return self._parse_json(self._search_regex(
+ (r'%s\s*%s' % (regex, self._YT_INITIAL_BOUNDARY_RE),
+ regex), webpage, name, default='{}'), video_id, fatal=False)
def _real_extract(self, url):
url, smuggled_data = unsmuggle_url(url, {})
+ video_id = self._match_id(url)
+ base_url = self.http_scheme() + '//www.youtube.com/'
+ webpage_url = base_url + 'watch?v=' + video_id
+ webpage = self._download_webpage(
+ webpage_url + '&bpctr=9999999999&has_verified=1', video_id, fatal=False)
+
+ player_response = None
+ if webpage:
+ player_response = self._extract_yt_initial_variable(
+ webpage, self._YT_INITIAL_PLAYER_RESPONSE_RE,
+ video_id, 'initial player response')
+ if not player_response:
+ player_response = self._call_api(
+ 'player', {'videoId': video_id}, video_id)
+
+ playability_status = player_response.get('playabilityStatus') or {}
+ if playability_status.get('reason') == 'Sign in to confirm your age':
+ video_info = self._download_webpage(
+ base_url + 'get_video_info', video_id,
+ 'Refetching age-gated info webpage',
+ 'unable to download video info webpage', query={
+ 'video_id': video_id,
+ 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
+ 'html5': 1,
+ # See https://github.com/ytdl-org/youtube-dl/issues/29333#issuecomment-864049544
+ 'c': 'TVHTML5',
+ 'cver': '6.20180913',
+ }, fatal=False)
+ if video_info:
+ pr = self._parse_json(
+ try_get(
+ compat_parse_qs(video_info),
+ lambda x: x['player_response'][0], compat_str) or '{}',
+ video_id, fatal=False)
+ if pr and isinstance(pr, dict):
+ player_response = pr
+
+ trailer_video_id = try_get(
+ playability_status,
+ lambda x: x['errorScreen']['playerLegacyDesktopYpcTrailerRenderer']['trailerVideoId'],
+ compat_str)
+ if trailer_video_id:
+ return self.url_result(
+ trailer_video_id, self.ie_key(), trailer_video_id)
- proto = (
- 'http' if self._downloader.params.get('prefer_insecure', False)
- else 'https')
-
- start_time = None
- end_time = None
- parsed_url = compat_urllib_parse_urlparse(url)
- for component in [parsed_url.fragment, parsed_url.query]:
- query = compat_parse_qs(component)
- if start_time is None and 't' in query:
- start_time = parse_duration(query['t'][0])
- if start_time is None and 'start' in query:
- start_time = parse_duration(query['start'][0])
- if end_time is None and 'end' in query:
- end_time = parse_duration(query['end'][0])
-
- # Extract original video URL from URL with redirection, like age verification, using next_url parameter
- mobj = re.search(self._NEXT_URL_RE, url)
- if mobj:
- url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/')
- video_id = self.extract_id(url)
-
- # Get video webpage
- url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
- video_webpage, urlh = self._download_webpage_handle(url, video_id)
-
- qs = compat_parse_qs(compat_urllib_parse_urlparse(urlh.geturl()).query)
- video_id = qs.get('v', [None])[0] or video_id
-
- # Attempt to extract SWF player URL
- mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
- if mobj is not None:
- player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
- else:
- player_url = None
-
- dash_mpds = []
-
- def add_dash_mpd(video_info):
- dash_mpd = video_info.get('dashmpd')
- if dash_mpd and dash_mpd[0] not in dash_mpds:
- dash_mpds.append(dash_mpd[0])
-
- def add_dash_mpd_pr(pl_response):
- dash_mpd = url_or_none(try_get(
- pl_response, lambda x: x['streamingData']['dashManifestUrl'],
- compat_str))
- if dash_mpd and dash_mpd not in dash_mpds:
- dash_mpds.append(dash_mpd)
-
- is_live = None
- view_count = None
-
- def extract_view_count(v_info):
- return int_or_none(try_get(v_info, lambda x: x['view_count'][0]))
-
- def extract_player_response(player_response, video_id):
- pl_response = str_or_none(player_response)
- if not pl_response:
+ def get_text(x):
+ if not x:
return
- pl_response = self._parse_json(pl_response, video_id, fatal=False)
- if isinstance(pl_response, dict):
- add_dash_mpd_pr(pl_response)
- return pl_response
-
- player_response = {}
-
- # Get video info
- video_info = {}
- embed_webpage = None
- ytplayer_config = None
-
- if re.search(r'["\']status["\']\s*:\s*["\']LOGIN_REQUIRED', video_webpage) is not None:
- age_gate = True
- # We simulate the access to the video from www.youtube.com/v/{video_id}
- # this can be viewed without login into Youtube
- url = proto + '://www.youtube.com/embed/%s' % video_id
- embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')
- data = compat_urllib_parse_urlencode({
- 'video_id': video_id,
- 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
- 'sts': self._search_regex(
- r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
- })
- video_info_url = proto + '://www.youtube.com/get_video_info?' + data
- try:
- video_info_webpage = self._download_webpage(
- video_info_url, video_id,
- note='Refetching age-gated info webpage',
- errnote='unable to download video info webpage')
- except ExtractorError:
- video_info_webpage = None
- if video_info_webpage:
- video_info = compat_parse_qs(video_info_webpage)
- pl_response = video_info.get('player_response', [None])[0]
- player_response = extract_player_response(pl_response, video_id)
- add_dash_mpd(video_info)
- view_count = extract_view_count(video_info)
- else:
- age_gate = False
- # Try looking directly into the video webpage
- ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
- if ytplayer_config:
- args = ytplayer_config['args']
- if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'):
- # Convert to the same format returned by compat_parse_qs
- video_info = dict((k, [v]) for k, v in args.items())
- add_dash_mpd(video_info)
- # Rental video is not rented but preview is available (e.g.
- # https://www.youtube.com/watch?v=yYr8q0y5Jfg,
- # https://github.com/ytdl-org/youtube-dl/issues/10532)
- if not video_info and args.get('ypc_vid'):
- return self.url_result(
- args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid'])
- if args.get('livestream') == '1' or args.get('live_playback') == 1:
- is_live = True
- if not player_response:
- player_response = extract_player_response(args.get('player_response'), video_id)
- if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
- add_dash_mpd_pr(player_response)
-
- if not video_info and not player_response:
- player_response = extract_player_response(
- self._search_regex(
- (r'%s\s*%s' % (self._YT_INITIAL_PLAYER_RESPONSE_RE, self._YT_INITIAL_BOUNDARY_RE),
- self._YT_INITIAL_PLAYER_RESPONSE_RE), video_webpage,
- 'initial player response', default='{}'),
- video_id)
-
- def extract_unavailable_message():
- messages = []
- for tag, kind in (('h1', 'message'), ('div', 'submessage')):
- msg = self._html_search_regex(
- r'(?s)<{tag}[^>]+id=["\']unavailable-{kind}["\'][^>]*>(.+?)</{tag}>'.format(tag=tag, kind=kind),
- video_webpage, 'unavailable %s' % kind, default=None)
- if msg:
- messages.append(msg)
- if messages:
- return '\n'.join(messages)
-
- if not video_info and not player_response:
- unavailable_message = extract_unavailable_message()
- if not unavailable_message:
- unavailable_message = 'Unable to extract video data'
- raise ExtractorError(
- 'YouTube said: %s' % unavailable_message, expected=True, video_id=video_id)
-
- if not isinstance(video_info, dict):
- video_info = {}
+ text = x.get('simpleText')
+ if text and isinstance(text, compat_str):
+ return text
+ runs = x.get('runs')
+ if not isinstance(runs, list):
+ return
+ return ''.join([r['text'] for r in runs if isinstance(r.get('text'), compat_str)])
- video_details = try_get(
- player_response, lambda x: x['videoDetails'], dict) or {}
+ search_meta = (
+ lambda x: self._html_search_meta(x, webpage, default=None)) \
+ if webpage else lambda x: None
+ video_details = player_response.get('videoDetails') or {}
microformat = try_get(
- player_response, lambda x: x['microformat']['playerMicroformatRenderer'], dict) or {}
-
- video_title = video_info.get('title', [None])[0] or video_details.get('title')
- if not video_title:
- self._downloader.report_warning('Unable to extract video title')
- video_title = '_'
-
- description_original = video_description = get_element_by_id("eow-description", video_webpage)
- if video_description:
-
- def replace_url(m):
- redir_url = compat_urlparse.urljoin(url, m.group(1))
- parsed_redir_url = compat_urllib_parse_urlparse(redir_url)
- if re.search(r'^(?:www\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)$', parsed_redir_url.netloc) and parsed_redir_url.path == '/redirect':
- qs = compat_parse_qs(parsed_redir_url.query)
- q = qs.get('q')
- if q and q[0]:
- return q[0]
- return redir_url
-
- description_original = video_description = re.sub(r'''(?x)
- <a\s+
- (?:[a-zA-Z-]+="[^"]*"\s+)*?
- (?:title|href)="([^"]+)"\s+
- (?:[a-zA-Z-]+="[^"]*"\s+)*?
- class="[^"]*"[^>]*>
- [^<]+\.{3}\s*
- </a>
- ''', replace_url, video_description)
- video_description = clean_html(video_description)
- else:
- video_description = video_details.get('shortDescription')
- if video_description is None:
- video_description = self._html_search_meta('description', video_webpage)
+ player_response,
+ lambda x: x['microformat']['playerMicroformatRenderer'],
+ dict) or {}
+ video_title = video_details.get('title') \
+ or get_text(microformat.get('title')) \
+ or search_meta(['og:title', 'twitter:title', 'title'])
+ video_description = video_details.get('shortDescription')
if not smuggled_data.get('force_singlefeed', False):
if not self._downloader.params.get('noplaylist'):
multifeed_metadata_list = try_get(
player_response,
lambda x: x['multicamera']['playerLegacyMulticameraRenderer']['metadataList'],
- compat_str) or try_get(
- video_info, lambda x: x['multifeed_metadata_list'][0], compat_str)
+ compat_str)
if multifeed_metadata_list:
entries = []
feed_ids = []
@@ -1862,10 +1624,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
# Unquote should take place before split on comma (,) since textual
# fields may contain comma as well (see
# https://github.com/ytdl-org/youtube-dl/issues/8536)
- feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed))
+ feed_data = compat_parse_qs(
+ compat_urllib_parse_unquote_plus(feed))
def feed_entry(name):
- return try_get(feed_data, lambda x: x[name][0], compat_str)
+ return try_get(
+ feed_data, lambda x: x[name][0], compat_str)
feed_id = feed_entry('id')
if not feed_id:
@@ -1878,7 +1642,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'_type': 'url_transparent',
'ie_key': 'Youtube',
'url': smuggle_url(
- '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]),
+ base_url + 'watch?v=' + feed_data['id'][0],
{'force_singlefeed': True}),
'title': title,
})
@@ -1886,631 +1650,416 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
self.to_screen(
'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
% (', '.join(feed_ids), video_id))
- return self.playlist_result(entries, video_id, video_title, video_description)
+ return self.playlist_result(
+ entries, video_id, video_title, video_description)
else:
self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
- if view_count is None:
- view_count = extract_view_count(video_info)
- if view_count is None and video_details:
- view_count = int_or_none(video_details.get('viewCount'))
- if view_count is None and microformat:
- view_count = int_or_none(microformat.get('viewCount'))
-
- if is_live is None:
- is_live = bool_or_none(video_details.get('isLive'))
-
- # Check for "rental" videos
- if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
- raise ExtractorError('"rental" videos not supported. See https://github.com/ytdl-org/youtube-dl/issues/359 for more information.', expected=True)
-
- def _extract_filesize(media_url):
- return int_or_none(self._search_regex(
- r'\bclen[=/](\d+)', media_url, 'filesize', default=None))
-
- streaming_formats = try_get(player_response, lambda x: x['streamingData']['formats'], list) or []
- streaming_formats.extend(try_get(player_response, lambda x: x['streamingData']['adaptiveFormats'], list) or [])
-
- if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
- self.report_rtmp_download()
- formats = [{
- 'format_id': '_rtmp',
- 'protocol': 'rtmp',
- 'url': video_info['conn'][0],
- 'player_url': player_url,
- }]
- elif not is_live and (streaming_formats or len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1):
- encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
- if 'rtmpe%3Dyes' in encoded_url_map:
- raise ExtractorError('rtmpe downloads are not supported, see https://github.com/ytdl-org/youtube-dl/issues/343 for more information.', expected=True)
- formats = []
- formats_spec = {}
- fmt_list = video_info.get('fmt_list', [''])[0]
- if fmt_list:
- for fmt in fmt_list.split(','):
- spec = fmt.split('/')
- if len(spec) > 1:
- width_height = spec[1].split('x')
- if len(width_height) == 2:
- formats_spec[spec[0]] = {
- 'resolution': spec[1],
- 'width': int_or_none(width_height[0]),
- 'height': int_or_none(width_height[1]),
- }
- for fmt in streaming_formats:
- itag = str_or_none(fmt.get('itag'))
- if not itag:
- continue
- quality = fmt.get('quality')
- quality_label = fmt.get('qualityLabel') or quality
- formats_spec[itag] = {
- 'asr': int_or_none(fmt.get('audioSampleRate')),
- 'filesize': int_or_none(fmt.get('contentLength')),
- 'format_note': quality_label,
- 'fps': int_or_none(fmt.get('fps')),
- 'height': int_or_none(fmt.get('height')),
- # bitrate for itag 43 is always 2147483647
- 'tbr': float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) if itag != '43' else None,
- 'width': int_or_none(fmt.get('width')),
- }
+ formats = []
+ itags = []
+ itag_qualities = {}
+ player_url = None
+ q = qualities(['tiny', 'small', 'medium', 'large', 'hd720', 'hd1080', 'hd1440', 'hd2160', 'hd2880', 'highres'])
+ streaming_data = player_response.get('streamingData') or {}
+ streaming_formats = streaming_data.get('formats') or []
+ streaming_formats.extend(streaming_data.get('adaptiveFormats') or [])
+ for fmt in streaming_formats:
+ if fmt.get('targetDurationSec') or fmt.get('drmFamilies'):
+ continue
- for fmt in streaming_formats:
- if fmt.get('drmFamilies') or fmt.get('drm_families'):
- continue
- url = url_or_none(fmt.get('url'))
+ itag = str_or_none(fmt.get('itag'))
+ quality = fmt.get('quality')
+ if itag and quality:
+ itag_qualities[itag] = quality
+ # FORMAT_STREAM_TYPE_OTF(otf=1) requires downloading the init fragment
+ # (adding `&sq=0` to the URL) and parsing emsg box to determine the
+ # number of fragment that would subsequently requested with (`&sq=N`)
+ if fmt.get('type') == 'FORMAT_STREAM_TYPE_OTF':
+ continue
- if not url:
- cipher = fmt.get('cipher') or fmt.get('signatureCipher')
- if not cipher:
+ fmt_url = fmt.get('url')
+ if not fmt_url:
+ sc = compat_parse_qs(fmt.get('signatureCipher'))
+ fmt_url = url_or_none(try_get(sc, lambda x: x['url'][0]))
+ encrypted_sig = try_get(sc, lambda x: x['s'][0])
+ if not (sc and fmt_url and encrypted_sig):
+ continue
+ if not player_url:
+ if not webpage:
continue
- url_data = compat_parse_qs(cipher)
- url = url_or_none(try_get(url_data, lambda x: x['url'][0], compat_str))
- if not url:
+ player_url = self._search_regex(
+ r'"(?:PLAYER_JS_URL|jsUrl)"\s*:\s*"([^"]+)"',
+ webpage, 'player URL', fatal=False)
+ if not player_url:
+ continue
+ signature = self._decrypt_signature(sc['s'][0], video_id, player_url)
+ sp = try_get(sc, lambda x: x['sp'][0]) or 'signature'
+ fmt_url += '&' + sp + '=' + signature
+
+ if itag:
+ itags.append(itag)
+ tbr = float_or_none(
+ fmt.get('averageBitrate') or fmt.get('bitrate'), 1000)
+ dct = {
+ 'asr': int_or_none(fmt.get('audioSampleRate')),
+ 'filesize': int_or_none(fmt.get('contentLength')),
+ 'format_id': itag,
+ 'format_note': fmt.get('qualityLabel') or quality,
+ 'fps': int_or_none(fmt.get('fps')),
+ 'height': int_or_none(fmt.get('height')),
+ 'quality': q(quality),
+ 'tbr': tbr,
+ 'url': fmt_url,
+ 'width': fmt.get('width'),
+ }
+ mimetype = fmt.get('mimeType')
+ if mimetype:
+ mobj = re.match(
+ r'((?:[^/]+)/(?:[^;]+))(?:;\s*codecs="([^"]+)")?', mimetype)
+ if mobj:
+ dct['ext'] = mimetype2ext(mobj.group(1))
+ dct.update(parse_codecs(mobj.group(2)))
+ no_audio = dct.get('acodec') == 'none'
+ no_video = dct.get('vcodec') == 'none'
+ if no_audio:
+ dct['vbr'] = tbr
+ if no_video:
+ dct['abr'] = tbr
+ if no_audio or no_video:
+ dct['downloader_options'] = {
+ # Youtube throttles chunks >~10M
+ 'http_chunk_size': 10485760,
+ }
+ if dct.get('ext'):
+ dct['container'] = dct['ext'] + '_dash'
+ formats.append(dct)
+
+ hls_manifest_url = streaming_data.get('hlsManifestUrl')
+ if hls_manifest_url:
+ for f in self._extract_m3u8_formats(
+ hls_manifest_url, video_id, 'mp4', fatal=False):
+ itag = self._search_regex(
+ r'/itag/(\d+)', f['url'], 'itag', default=None)
+ if itag:
+ f['format_id'] = itag
+ formats.append(f)
+
+ if self._downloader.params.get('youtube_include_dash_manifest', True):
+ dash_manifest_url = streaming_data.get('dashManifestUrl')
+ if dash_manifest_url:
+ for f in self._extract_mpd_formats(
+ dash_manifest_url, video_id, fatal=False):
+ itag = f['format_id']
+ if itag in itags:
continue
- else:
- cipher = None
- url_data = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
+ if itag in itag_qualities:
+ f['quality'] = q(itag_qualities[itag])
+ filesize = int_or_none(self._search_regex(
+ r'/clen/(\d+)', f.get('fragment_base_url')
+ or f['url'], 'file size', default=None))
+ if filesize:
+ f['filesize'] = filesize
+ formats.append(f)
- stream_type = int_or_none(try_get(url_data, lambda x: x['stream_type'][0]))
- # Unsupported FORMAT_STREAM_TYPE_OTF
- if stream_type == 3:
- continue
+ if not formats:
+ if streaming_data.get('licenseInfos'):
+ raise ExtractorError(
+ 'This video is DRM protected.', expected=True)
+ pemr = try_get(
+ playability_status,
+ lambda x: x['errorScreen']['playerErrorMessageRenderer'],
+ dict) or {}
+ reason = get_text(pemr.get('reason')) or playability_status.get('reason')
+ subreason = pemr.get('subreason')
+ if subreason:
+ subreason = clean_html(get_text(subreason))
+ if subreason == 'The uploader has not made this video available in your country.':
+ countries = microformat.get('availableCountries')
+ if not countries:
+ regions_allowed = search_meta('regionsAllowed')
+ countries = regions_allowed.split(',') if regions_allowed else None
+ self.raise_geo_restricted(
+ subreason, countries)
+ reason += '\n' + subreason
+ if reason:
+ raise ExtractorError(reason, expected=True)
- format_id = fmt.get('itag') or url_data['itag'][0]
- if not format_id:
- continue
- format_id = compat_str(format_id)
-
- if cipher:
- if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True):
- ASSETS_RE = (
- r'<script[^>]+\bsrc=("[^"]+")[^>]+\bname=["\']player_ias/base',
- r'"jsUrl"\s*:\s*("[^"]+")',
- r'"assets":.+?"js":\s*("[^"]+")')
- jsplayer_url_json = self._search_regex(
- ASSETS_RE,
- embed_webpage if age_gate else video_webpage,
- 'JS player URL (1)', default=None)
- if not jsplayer_url_json and not age_gate:
- # We need the embed website after all
- if embed_webpage is None:
- embed_url = proto + '://www.youtube.com/embed/%s' % video_id
- embed_webpage = self._download_webpage(
- embed_url, video_id, 'Downloading embed webpage')
- jsplayer_url_json = self._search_regex(
- ASSETS_RE, embed_webpage, 'JS player URL')
-
- player_url = json.loads(jsplayer_url_json)
- if player_url is None:
- player_url_json = self._search_regex(
- r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
- video_webpage, 'age gate player URL')
- player_url = json.loads(player_url_json)
-
- if 'sig' in url_data:
- url += '&signature=' + url_data['sig'][0]
- elif 's' in url_data:
- encrypted_sig = url_data['s'][0]
-
- if self._downloader.params.get('verbose'):
- if player_url is None:
- player_desc = 'unknown'
- else:
- player_type, player_version = self._extract_player_info(player_url)
- player_desc = '%s player %s' % ('flash' if player_type == 'swf' else 'html5', player_version)
- parts_sizes = self._signature_cache_id(encrypted_sig)
- self.to_screen('{%s} signature length %s, %s' %
- (format_id, parts_sizes, player_desc))
-
- signature = self._decrypt_signature(
- encrypted_sig, video_id, player_url, age_gate)
- sp = try_get(url_data, lambda x: x['sp'][0], compat_str) or 'signature'
- url += '&%s=%s' % (sp, signature)
- if 'ratebypass' not in url:
- url += '&ratebypass=yes'
-
- dct = {
- 'format_id': format_id,
- 'url': url,
- 'player_url': player_url,
- }
- if format_id in self._formats:
- dct.update(self._formats[format_id])
- if format_id in formats_spec:
- dct.update(formats_spec[format_id])
-
- # Some itags are not included in DASH manifest thus corresponding formats will
- # lack metadata (see https://github.com/ytdl-org/youtube-dl/pull/5993).
- # Trying to extract metadata from url_encoded_fmt_stream_map entry.
- mobj = re.search(r'^(?P<width>\d+)[xX](?P<height>\d+)$', url_data.get('size', [''])[0])
- width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None)
-
- if width is None:
- width = int_or_none(fmt.get('width'))
- if height is None:
- height = int_or_none(fmt.get('height'))
-
- filesize = int_or_none(url_data.get(
- 'clen', [None])[0]) or _extract_filesize(url)
-
- quality = url_data.get('quality', [None])[0] or fmt.get('quality')
- quality_label = url_data.get('quality_label', [None])[0] or fmt.get('qualityLabel')
-
- tbr = (float_or_none(url_data.get('bitrate', [None])[0], 1000)
- or float_or_none(fmt.get('bitrate'), 1000)) if format_id != '43' else None
- fps = int_or_none(url_data.get('fps', [None])[0]) or int_or_none(fmt.get('fps'))
-
- more_fields = {
- 'filesize': filesize,
- 'tbr': tbr,
- 'width': width,
- 'height': height,
- 'fps': fps,
- 'format_note': quality_label or quality,
- }
- for key, value in more_fields.items():
- if value:
- dct[key] = value
- type_ = url_data.get('type', [None])[0] or fmt.get('mimeType')
- if type_:
- type_split = type_.split(';')
- kind_ext = type_split[0].split('/')
- if len(kind_ext) == 2:
- kind, _ = kind_ext
- dct['ext'] = mimetype2ext(type_split[0])
- if kind in ('audio', 'video'):
- codecs = None
- for mobj in re.finditer(
- r'(?P<key>[a-zA-Z_-]+)=(?P<quote>["\']?)(?P<val>.+?)(?P=quote)(?:;|$)', type_):
- if mobj.group('key') == 'codecs':
- codecs = mobj.group('val')
- break
- if codecs:
- dct.update(parse_codecs(codecs))
- if dct.get('acodec') == 'none' or dct.get('vcodec') == 'none':
- dct['downloader_options'] = {
- # Youtube throttles chunks >~10M
- 'http_chunk_size': 10485760,
- }
- formats.append(dct)
- else:
- manifest_url = (
- url_or_none(try_get(
- player_response,
- lambda x: x['streamingData']['hlsManifestUrl'],
- compat_str))
- or url_or_none(try_get(
- video_info, lambda x: x['hlsvp'][0], compat_str)))
- if manifest_url:
- formats = []
- m3u8_formats = self._extract_m3u8_formats(
- manifest_url, video_id, 'mp4', fatal=False)
- for a_format in m3u8_formats:
- itag = self._search_regex(
- r'/itag/(\d+)/', a_format['url'], 'itag', default=None)
- if itag:
- a_format['format_id'] = itag
- if itag in self._formats:
- dct = self._formats[itag].copy()
- dct.update(a_format)
- a_format = dct
- a_format['player_url'] = player_url
- # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming
- a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True'
- formats.append(a_format)
- else:
- error_message = extract_unavailable_message()
- if not error_message:
- reason_list = try_get(
- player_response,
- lambda x: x['playabilityStatus']['errorScreen']['playerErrorMessageRenderer']['subreason']['runs'],
- list) or []
- for reason in reason_list:
- if not isinstance(reason, dict):
- continue
- reason_text = try_get(reason, lambda x: x['text'], compat_str)
- if reason_text:
- if not error_message:
- error_message = ''
- error_message += reason_text
- if error_message:
- error_message = clean_html(error_message)
- if not error_message:
- error_message = clean_html(try_get(
- player_response, lambda x: x['playabilityStatus']['reason'],
- compat_str))
- if not error_message:
- error_message = clean_html(
- try_get(video_info, lambda x: x['reason'][0], compat_str))
- if error_message:
- raise ExtractorError(error_message, expected=True)
- raise ExtractorError('no conn, hlsvp, hlsManifestUrl or url_encoded_fmt_stream_map information found in video info')
-
- # uploader
- video_uploader = try_get(
- video_info, lambda x: x['author'][0],
- compat_str) or str_or_none(video_details.get('author'))
- if video_uploader:
- video_uploader = compat_urllib_parse_unquote_plus(video_uploader)
- else:
- self._downloader.report_warning('unable to extract uploader name')
-
- # uploader_id
- video_uploader_id = None
- video_uploader_url = None
- mobj = re.search(
- r'<link itemprop="url" href="(?P<uploader_url>https?://www\.youtube\.com/(?:user|channel)/(?P<uploader_id>[^"]+))">',
- video_webpage)
- if mobj is not None:
- video_uploader_id = mobj.group('uploader_id')
- video_uploader_url = mobj.group('uploader_url')
- else:
- owner_profile_url = url_or_none(microformat.get('ownerProfileUrl'))
- if owner_profile_url:
- video_uploader_id = self._search_regex(
- r'(?:user|channel)/([^/]+)', owner_profile_url, 'uploader id',
- default=None)
- video_uploader_url = owner_profile_url
-
- channel_id = (
- str_or_none(video_details.get('channelId'))
- or self._html_search_meta(
- 'channelId', video_webpage, 'channel id', default=None)
- or self._search_regex(
- r'data-channel-external-id=(["\'])(?P<id>(?:(?!\1).)+)\1',
- video_webpage, 'channel id', default=None, group='id'))
- channel_url = 'http://www.youtube.com/channel/%s' % channel_id if channel_id else None
+ self._sort_formats(formats)
- thumbnails = []
- thumbnails_list = try_get(
- video_details, lambda x: x['thumbnail']['thumbnails'], list) or []
- for t in thumbnails_list:
- if not isinstance(t, dict):
- continue
- thumbnail_url = url_or_none(t.get('url'))
- if not thumbnail_url:
- continue
- thumbnails.append({
- 'url': thumbnail_url,
- 'width': int_or_none(t.get('width')),
- 'height': int_or_none(t.get('height')),
- })
+ keywords = video_details.get('keywords') or []
+ if not keywords and webpage:
+ keywords = [
+ unescapeHTML(m.group('content'))
+ for m in re.finditer(self._meta_regex('og:video:tag'), webpage)]
+ for keyword in keywords:
+ if keyword.startswith('yt:stretch='):
+ mobj = re.search(r'(\d+)\s*:\s*(\d+)', keyword)
+ if mobj:
+ # NB: float is intentional for forcing float division
+ w, h = (float(v) for v in mobj.groups())
+ if w > 0 and h > 0:
+ ratio = w / h
+ for f in formats:
+ if f.get('vcodec') != 'none':
+ f['stretched_ratio'] = ratio
+ break
- if not thumbnails:
- video_thumbnail = None
- # We try first to get a high quality image:
- m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
- video_webpage, re.DOTALL)
- if m_thumb is not None:
- video_thumbnail = m_thumb.group(1)
- thumbnail_url = try_get(video_info, lambda x: x['thumbnail_url'][0], compat_str)
- if thumbnail_url:
- video_thumbnail = compat_urllib_parse_unquote_plus(thumbnail_url)
- if video_thumbnail:
- thumbnails.append({'url': video_thumbnail})
-
- # upload date
- upload_date = self._html_search_meta(
- 'datePublished', video_webpage, 'upload date', default=None)
- if not upload_date:
- upload_date = self._search_regex(
- [r'(?s)id="eow-date.*?>(.*?)</span>',
- r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'],
- video_webpage, 'upload date', default=None)
- if not upload_date:
- upload_date = microformat.get('publishDate') or microformat.get('uploadDate')
- upload_date = unified_strdate(upload_date)
-
- video_license = self._html_search_regex(
- r'<h4[^>]+class="title"[^>]*>\s*License\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li',
- video_webpage, 'license', default=None)
-
- m_music = re.search(
- r'''(?x)
- <h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s*
- <ul[^>]*>\s*
- <li>(?P<title>.+?)
- by (?P<creator>.+?)
- (?:
- \(.+?\)|
- <a[^>]*
- (?:
- \bhref=["\']/red[^>]*>| # drop possible
- >\s*Listen ad-free with YouTube Red # YouTube Red ad
- )
- .*?
- )?</li
- ''',
- video_webpage)
- if m_music:
- video_alt_title = remove_quotes(unescapeHTML(m_music.group('title')))
- video_creator = clean_html(m_music.group('creator'))
+ thumbnails = []
+ for container in (video_details, microformat):
+ for thumbnail in (try_get(
+ container,
+ lambda x: x['thumbnail']['thumbnails'], list) or []):
+ thumbnail_url = thumbnail.get('url')
+ if not thumbnail_url:
+ continue
+ thumbnails.append({
+ 'height': int_or_none(thumbnail.get('height')),
+ 'url': thumbnail_url,
+ 'width': int_or_none(thumbnail.get('width')),
+ })
+ if thumbnails:
+ break
else:
- video_alt_title = video_creator = None
+ thumbnail = search_meta(['og:image', 'twitter:image'])
+ if thumbnail:
+ thumbnails = [{'url': thumbnail}]
+
+ category = microformat.get('category') or search_meta('genre')
+ channel_id = video_details.get('channelId') \
+ or microformat.get('externalChannelId') \
+ or search_meta('channelId')
+ duration = int_or_none(
+ video_details.get('lengthSeconds')
+ or microformat.get('lengthSeconds')) \
+ or parse_duration(search_meta('duration'))
+ is_live = video_details.get('isLive')
+ owner_profile_url = microformat.get('ownerProfileUrl')
+
+ info = {
+ 'id': video_id,
+ 'title': self._live_title(video_title) if is_live else video_title,
+ 'formats': formats,
+ 'thumbnails': thumbnails,
+ 'description': video_description,
+ 'upload_date': unified_strdate(
+ microformat.get('uploadDate')
+ or search_meta('uploadDate')),
+ 'uploader': video_details['author'],
+ 'uploader_id': self._search_regex(r'/(?:channel|user)/([^/?&#]+)', owner_profile_url, 'uploader id') if owner_profile_url else None,
+ 'uploader_url': owner_profile_url,
+ 'channel_id': channel_id,
+ 'channel_url': 'https://www.youtube.com/channel/' + channel_id if channel_id else None,
+ 'duration': duration,
+ 'view_count': int_or_none(
+ video_details.get('viewCount')
+ or microformat.get('viewCount')
+ or search_meta('interactionCount')),
+ 'average_rating': float_or_none(video_details.get('averageRating')),
+ 'age_limit': 18 if (
+ microformat.get('isFamilySafe') is False
+ or search_meta('isFamilyFriendly') == 'false'
+ or search_meta('og:restrictions:age') == '18+') else 0,
+ 'webpage_url': webpage_url,
+ 'categories': [category] if category else None,
+ 'tags': keywords,
+ 'is_live': is_live,
+ }
- def extract_meta(field):
- return self._html_search_regex(
- r'<h4[^>]+class="title"[^>]*>\s*%s\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li>\s*' % field,
- video_webpage, field, default=None)
+ pctr = try_get(
+ player_response,
+ lambda x: x['captions']['playerCaptionsTracklistRenderer'], dict)
+ if pctr:
+ def process_language(container, base_url, lang_code, query):
+ lang_subs = []
+ for fmt in self._SUBTITLE_FORMATS:
+ query.update({
+ 'fmt': fmt,
+ })
+ lang_subs.append({
+ 'ext': fmt,
+ 'url': update_url_query(base_url, query),
+ })
+ container[lang_code] = lang_subs
+
+ subtitles = {}
+ for caption_track in (pctr.get('captionTracks') or []):
+ base_url = caption_track.get('baseUrl')
+ if not base_url:
+ continue
+ if caption_track.get('kind') != 'asr':
+ lang_code = caption_track.get('languageCode')
+ if not lang_code:
+ continue
+ process_language(
+ subtitles, base_url, lang_code, {})
+ continue
+ automatic_captions = {}
+ for translation_language in (pctr.get('translationLanguages') or []):
+ translation_language_code = translation_language.get('languageCode')
+ if not translation_language_code:
+ continue
+ process_language(
+ automatic_captions, base_url, translation_language_code,
+ {'tlang': translation_language_code})
+ info['automatic_captions'] = automatic_captions
+ info['subtitles'] = subtitles
- track = extract_meta('Song')
- artist = extract_meta('Artist')
- album = extract_meta('Album')
+ parsed_url = compat_urllib_parse_urlparse(url)
+ for component in [parsed_url.fragment, parsed_url.query]:
+ query = compat_parse_qs(component)
+ for k, v in query.items():
+ for d_k, s_ks in [('start', ('start', 't')), ('end', ('end',))]:
+ d_k += '_time'
+ if d_k not in info and k in s_ks:
+ info[d_k] = parse_duration(query[k][0])
- # Youtube Music Auto-generated description
- release_date = release_year = None
if video_description:
mobj = re.search(r'(?s)(?P<track>[^·\n]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?.+\nAuto-generated by YouTube\.\s*$', video_description)
if mobj:
- if not track:
- track = mobj.group('track').strip()
- if not artist:
- artist = mobj.group('clean_artist') or ', '.join(a.strip() for a in mobj.group('artist').split('·'))
- if not album:
- album = mobj.group('album'.strip())
release_year = mobj.group('release_year')
release_date = mobj.group('release_date')
if release_date:
release_date = release_date.replace('-', '')
if not release_year:
- release_year = int(release_date[:4])
- if release_year:
- release_year = int(release_year)
+ release_year = release_date[:4]
+ info.update({
+ 'album': mobj.group('album'.strip()),
+ 'artist': mobj.group('clean_artist') or ', '.join(a.strip() for a in mobj.group('artist').split('·')),
+ 'track': mobj.group('track').strip(),
+ 'release_date': release_date,
+ 'release_year': int_or_none(release_year),
+ })
- yt_initial_data = self._extract_yt_initial_data(video_id, video_webpage)
- contents = try_get(yt_initial_data, lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents'], list) or []
- for content in contents:
- rows = try_get(content, lambda x: x['videoSecondaryInfoRenderer']['metadataRowContainer']['metadataRowContainerRenderer']['rows'], list) or []
- multiple_songs = False
- for row in rows:
- if try_get(row, lambda x: x['metadataRowRenderer']['hasDividerLine']) is True:
- multiple_songs = True
- break
- for row in rows:
- mrr = row.get('metadataRowRenderer') or {}
- mrr_title = try_get(
- mrr, lambda x: x['title']['simpleText'], compat_str)
- mrr_contents = try_get(
- mrr, lambda x: x['contents'][0], dict) or {}
- mrr_contents_text = try_get(mrr_contents, [lambda x: x['simpleText'], lambda x: x['runs'][0]['text']], compat_str)
- if not (mrr_title and mrr_contents_text):
- continue
- if mrr_title == 'License':
- video_license = mrr_contents_text
- elif not multiple_songs:
- if mrr_title == 'Album':
- album = mrr_contents_text
- elif mrr_title == 'Artist':
- artist = mrr_contents_text
- elif mrr_title == 'Song':
- track = mrr_contents_text
-
- m_episode = re.search(
- r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>',
- video_webpage)
- if m_episode:
- series = unescapeHTML(m_episode.group('series'))
- season_number = int(m_episode.group('season'))
- episode_number = int(m_episode.group('episode'))
- else:
- series = season_number = episode_number = None
-
- m_cat_container = self._search_regex(
- r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
- video_webpage, 'categories', default=None)
- category = None
- if m_cat_container:
- category = self._html_search_regex(
- r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
- default=None)
- if not category:
- category = try_get(
- microformat, lambda x: x['category'], compat_str)
- video_categories = None if category is None else [category]
-
- video_tags = [
- unescapeHTML(m.group('content'))
- for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)]
- if not video_tags:
- video_tags = try_get(video_details, lambda x: x['keywords'], list)
-
- def _extract_count(count_name):
- return str_to_int(self._search_regex(
- (r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>' % re.escape(count_name),
- r'["\']label["\']\s*:\s*["\']([\d,.]+)\s+%ss["\']' % re.escape(count_name)),
- video_webpage, count_name, default=None))
-
- like_count = _extract_count('like')
- dislike_count = _extract_count('dislike')
-
- if view_count is None:
- view_count = str_to_int(self._search_regex(
- r'<[^>]+class=["\']watch-view-count[^>]+>\s*([\d,\s]+)', video_webpage,
- 'view count', default=None))
-
- average_rating = (
- float_or_none(video_details.get('averageRating'))
- or try_get(video_info, lambda x: float_or_none(x['avg_rating'][0])))
-
- # subtitles
- video_subtitles = self.extract_subtitles(video_id, video_webpage)
- automatic_captions = self.extract_automatic_captions(video_id, player_response, ytplayer_config)
-
- video_duration = try_get(
- video_info, lambda x: int_or_none(x['length_seconds'][0]))
- if not video_duration:
- video_duration = int_or_none(video_details.get('lengthSeconds'))
- if not video_duration:
- video_duration = parse_duration(self._html_search_meta(
- 'duration', video_webpage, 'video duration'))
-
- # annotations
- video_annotations = None
- if self._downloader.params.get('writeannotations', False):
- xsrf_token = None
- ytcfg = self._extract_ytcfg(video_id, video_webpage)
- if ytcfg:
- xsrf_token = try_get(ytcfg, lambda x: x['XSRF_TOKEN'], compat_str)
- if not xsrf_token:
- xsrf_token = self._search_regex(
- r'([\'"])XSRF_TOKEN\1\s*:\s*([\'"])(?P<xsrf_token>(?:(?!\2).)+)\2',
- video_webpage, 'xsrf token', group='xsrf_token', fatal=False)
- invideo_url = try_get(
- player_response, lambda x: x['annotations'][0]['playerAnnotationsUrlsRenderer']['invideoUrl'], compat_str)
- if xsrf_token and invideo_url:
- xsrf_field_name = None
- if ytcfg:
- xsrf_field_name = try_get(ytcfg, lambda x: x['XSRF_FIELD_NAME'], compat_str)
- if not xsrf_field_name:
- xsrf_field_name = self._search_regex(
- r'([\'"])XSRF_FIELD_NAME\1\s*:\s*([\'"])(?P<xsrf_field_name>\w+)\2',
- video_webpage, 'xsrf field name',
- group='xsrf_field_name', default='session_token')
- video_annotations = self._download_webpage(
- self._proto_relative_url(invideo_url),
- video_id, note='Downloading annotations',
- errnote='Unable to download video annotations', fatal=False,
- data=urlencode_postdata({xsrf_field_name: xsrf_token}))
-
- chapters = self._extract_chapters(video_webpage, description_original, video_id, video_duration)
-
- # Look for the DASH manifest
- if self._downloader.params.get('youtube_include_dash_manifest', True):
- dash_mpd_fatal = True
- for mpd_url in dash_mpds:
- dash_formats = {}
- try:
- def decrypt_sig(mobj):
- s = mobj.group(1)
- dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
- return '/signature/%s' % dec_s
-
- mpd_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, mpd_url)
-
- for df in self._extract_mpd_formats(
- mpd_url, video_id, fatal=dash_mpd_fatal,
- formats_dict=self._formats):
- if not df.get('filesize'):
- df['filesize'] = _extract_filesize(df['url'])
- # Do not overwrite DASH format found in some previous DASH manifest
- if df['format_id'] not in dash_formats:
- dash_formats[df['format_id']] = df
- # Additional DASH manifests may end up in HTTP Error 403 therefore
- # allow them to fail without bug report message if we already have
- # some DASH manifest succeeded. This is temporary workaround to reduce
- # burst of bug reports until we figure out the reason and whether it
- # can be fixed at all.
- dash_mpd_fatal = False
- except (ExtractorError, KeyError) as e:
- self.report_warning(
- 'Skipping DASH manifest: %r' % e, video_id)
- if dash_formats:
- # Remove the formats we found through non-DASH, they
- # contain less info and it can be wrong, because we use
- # fixed values (for example the resolution). See
- # https://github.com/ytdl-org/youtube-dl/issues/5774 for an
- # example.
- formats = [f for f in formats if f['format_id'] not in dash_formats.keys()]
- formats.extend(dash_formats.values())
-
- # Check for malformed aspect ratio
- stretched_m = re.search(
- r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">',
- video_webpage)
- if stretched_m:
- w = float(stretched_m.group('w'))
- h = float(stretched_m.group('h'))
- # yt:stretch may hold invalid ratio data (e.g. for Q39EVAstoRM ratio is 17:0).
- # We will only process correct ratios.
- if w > 0 and h > 0:
- ratio = w / h
- for f in formats:
- if f.get('vcodec') != 'none':
- f['stretched_ratio'] = ratio
+ initial_data = None
+ if webpage:
+ initial_data = self._extract_yt_initial_variable(
+ webpage, self._YT_INITIAL_DATA_RE, video_id,
+ 'yt initial data')
+ if not initial_data:
+ initial_data = self._call_api(
+ 'next', {'videoId': video_id}, video_id, fatal=False)
+
+ if initial_data:
+ chapters = self._extract_chapters_from_json(
+ initial_data, video_id, duration)
+ if not chapters:
+ for engagment_pannel in (initial_data.get('engagementPanels') or []):
+ contents = try_get(
+ engagment_pannel, lambda x: x['engagementPanelSectionListRenderer']['content']['macroMarkersListRenderer']['contents'],
+ list)
+ if not contents:
+ continue
- if not formats:
- if 'reason' in video_info:
- if 'The uploader has not made this video available in your country.' in video_info['reason']:
- regions_allowed = self._html_search_meta(
- 'regionsAllowed', video_webpage, default=None)
- countries = regions_allowed.split(',') if regions_allowed else None
- self.raise_geo_restricted(
- msg=video_info['reason'][0], countries=countries)
- reason = video_info['reason'][0]
- if 'Invalid parameters' in reason:
- unavailable_message = extract_unavailable_message()
- if unavailable_message:
- reason = unavailable_message
- raise ExtractorError(
- 'YouTube said: %s' % reason,
- expected=True, video_id=video_id)
- if video_info.get('license_info') or try_get(player_response, lambda x: x['streamingData']['licenseInfos']):
- raise ExtractorError('This video is DRM protected.', expected=True)
+ def chapter_time(mmlir):
+ return parse_duration(
+ get_text(mmlir.get('timeDescription')))
+
+ chapters = []
+ for next_num, content in enumerate(contents, start=1):
+ mmlir = content.get('macroMarkersListItemRenderer') or {}
+ start_time = chapter_time(mmlir)
+ end_time = chapter_time(try_get(
+ contents, lambda x: x[next_num]['macroMarkersListItemRenderer'])) \
+ if next_num < len(contents) else duration
+ if start_time is None or end_time is None:
+ continue
+ chapters.append({
+ 'start_time': start_time,
+ 'end_time': end_time,
+ 'title': get_text(mmlir.get('title')),
+ })
+ if chapters:
+ break
+ if chapters:
+ info['chapters'] = chapters
+
+ contents = try_get(
+ initial_data,
+ lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents'],
+ list) or []
+ for content in contents:
+ vpir = content.get('videoPrimaryInfoRenderer')
+ if vpir:
+ stl = vpir.get('superTitleLink')
+ if stl:
+ stl = get_text(stl)
+ if try_get(
+ vpir,
+ lambda x: x['superTitleIcon']['iconType']) == 'LOCATION_PIN':
+ info['location'] = stl
+ else:
+ mobj = re.search(r'(.+?)\s*S(\d+)\s*•\s*E(\d+)', stl)
+ if mobj:
+ info.update({
+ 'series': mobj.group(1),
+ 'season_number': int(mobj.group(2)),
+ 'episode_number': int(mobj.group(3)),
+ })
+ for tlb in (try_get(
+ vpir,
+ lambda x: x['videoActions']['menuRenderer']['topLevelButtons'],
+ list) or []):
+ tbr = tlb.get('toggleButtonRenderer') or {}
+ for getter, regex in [(
+ lambda x: x['defaultText']['accessibility']['accessibilityData'],
+ r'(?P<count>[\d,]+)\s*(?P<type>(?:dis)?like)'), ([
+ lambda x: x['accessibility'],
+ lambda x: x['accessibilityData']['accessibilityData'],
+ ], r'(?P<type>(?:dis)?like) this video along with (?P<count>[\d,]+) other people')]:
+ label = (try_get(tbr, getter, dict) or {}).get('label')
+ if label:
+ mobj = re.match(regex, label)
+ if mobj:
+ info[mobj.group('type') + '_count'] = str_to_int(mobj.group('count'))
+ break
+ sbr_tooltip = try_get(
+ vpir, lambda x: x['sentimentBar']['sentimentBarRenderer']['tooltip'])
+ if sbr_tooltip:
+ like_count, dislike_count = sbr_tooltip.split(' / ')
+ info.update({
+ 'like_count': str_to_int(like_count),
+ 'dislike_count': str_to_int(dislike_count),
+ })
+ vsir = content.get('videoSecondaryInfoRenderer')
+ if vsir:
+ info['channel'] = get_text(try_get(
+ vsir,
+ lambda x: x['owner']['videoOwnerRenderer']['title'],
+ dict))
+ rows = try_get(
+ vsir,
+ lambda x: x['metadataRowContainer']['metadataRowContainerRenderer']['rows'],
+ list) or []
+ multiple_songs = False
+ for row in rows:
+ if try_get(row, lambda x: x['metadataRowRenderer']['hasDividerLine']) is True:
+ multiple_songs = True
+ break
+ for row in rows:
+ mrr = row.get('metadataRowRenderer') or {}
+ mrr_title = mrr.get('title')
+ if not mrr_title:
+ continue
+ mrr_title = get_text(mrr['title'])
+ mrr_contents_text = get_text(mrr['contents'][0])
+ if mrr_title == 'License':
+ info['license'] = mrr_contents_text
+ elif not multiple_songs:
+ if mrr_title == 'Album':
+ info['album'] = mrr_contents_text
+ elif mrr_title == 'Artist':
+ info['artist'] = mrr_contents_text
+ elif mrr_title == 'Song':
+ info['track'] = mrr_contents_text
- self._sort_formats(formats)
+ for s_k, d_k in [('artist', 'creator'), ('track', 'alt_title')]:
+ v = info.get(s_k)
+ if v:
+ info[d_k] = v
- self.mark_watched(video_id, video_info, player_response)
+ self.mark_watched(video_id, player_response)
- return {
- 'id': video_id,
- 'uploader': video_uploader,
- 'uploader_id': video_uploader_id,
- 'uploader_url': video_uploader_url,
- 'channel_id': channel_id,
- 'channel_url': channel_url,
- 'upload_date': upload_date,
- 'license': video_license,
- 'creator': video_creator or artist,
- 'title': video_title,
- 'alt_title': video_alt_title or track,
- 'thumbnails': thumbnails,
- 'description': video_description,
- 'categories': video_categories,
- 'tags': video_tags,
- 'subtitles': video_subtitles,
- 'automatic_captions': automatic_captions,
- 'duration': video_duration,
- 'age_limit': 18 if age_gate else 0,
- 'annotations': video_annotations,
- 'chapters': chapters,
- 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
- 'view_count': view_count,
- 'like_count': like_count,
- 'dislike_count': dislike_count,
- 'average_rating': average_rating,
- 'formats': formats,
- 'is_live': is_live,
- 'start_time': start_time,
- 'end_time': end_time,
- 'series': series,
- 'season_number': season_number,
- 'episode_number': episode_number,
- 'track': track,
- 'artist': artist,
- 'album': album,
- 'release_date': release_date,
- 'release_year': release_year,
- }
+ return info
class YoutubeTabIE(YoutubeBaseInfoExtractor):
@@ -2523,7 +2072,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
invidio\.us
)/
(?:
- (?:channel|c|user|feed)/|
+ (?:channel|c|user|feed|hashtag)/|
(?:playlist|watch)\?.*?\blist=|
(?!(?:watch|embed|v|e|results)\b)
)
@@ -2550,6 +2099,15 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
'description': 'md5:be97ee0f14ee314f1f002cf187166ee2',
},
}, {
+ # playlists, series
+ 'url': 'https://www.youtube.com/c/3blue1brown/playlists?view=50&sort=dd&shelf_id=3',
+ 'playlist_mincount': 5,
+ 'info_dict': {
+ 'id': 'UCYO_jab_esuFRV4b17AJtAw',
+ 'title': '3Blue1Brown - Playlists',
+ 'description': 'md5:e1384e8a133307dd10edee76e875d62f',
+ },
+ }, {
# playlists, singlepage
'url': 'https://www.youtube.com/user/ThirstForScience/playlists',
'playlist_mincount': 4,
@@ -2809,6 +2367,16 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
}, {
'url': 'https://www.youtube.com/TheYoungTurks/live',
'only_matching': True,
+ }, {
+ 'url': 'https://www.youtube.com/hashtag/cctv9',
+ 'info_dict': {
+ 'id': 'cctv9',
+ 'title': '#cctv9',
+ },
+ 'playlist_mincount': 350,
+ }, {
+ 'url': 'https://www.youtube.com/watch?list=PLW4dVinRY435CBE_JD3t-0SRXKfnZHS1P&feature=youtu.be&v=M9cJMXmQ_ZU',
+ 'only_matching': True,
}]
@classmethod
@@ -2831,40 +2399,13 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
@staticmethod
def _extract_grid_item_renderer(item):
- for item_kind in ('Playlist', 'Video', 'Channel'):
- renderer = item.get('grid%sRenderer' % item_kind)
- if renderer:
- return renderer
-
- def _extract_video(self, renderer):
- video_id = renderer.get('videoId')
- title = try_get(
- renderer,
- (lambda x: x['title']['runs'][0]['text'],
- lambda x: x['title']['simpleText']), compat_str)
- description = try_get(
- renderer, lambda x: x['descriptionSnippet']['runs'][0]['text'],
- compat_str)
- duration = parse_duration(try_get(
- renderer, lambda x: x['lengthText']['simpleText'], compat_str))
- view_count_text = try_get(
- renderer, lambda x: x['viewCountText']['simpleText'], compat_str) or ''
- view_count = str_to_int(self._search_regex(
- r'^([\d,]+)', re.sub(r'\s', '', view_count_text),
- 'view count', default=None))
- uploader = try_get(
- renderer, lambda x: x['ownerText']['runs'][0]['text'], compat_str)
- return {
- '_type': 'url_transparent',
- 'ie_key': YoutubeIE.ie_key(),
- 'id': video_id,
- 'url': video_id,
- 'title': title,
- 'description': description,
- 'duration': duration,
- 'view_count': view_count,
- 'uploader': uploader,
- }
+ assert isinstance(item, dict)
+ for key, renderer in item.items():
+ if not key.startswith('grid') or not key.endswith('Renderer'):
+ continue
+ if not isinstance(renderer, dict):
+ continue
+ return renderer
def _grid_entries(self, grid_renderer):
for item in grid_renderer['items']:
@@ -2874,7 +2415,8 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
if not isinstance(renderer, dict):
continue
title = try_get(
- renderer, lambda x: x['title']['runs'][0]['text'], compat_str)
+ renderer, (lambda x: x['title']['runs'][0]['text'],
+ lambda x: x['title']['simpleText']), compat_str)
# playlist
playlist_id = renderer.get('playlistId')
if playlist_id:
@@ -2882,10 +2424,12 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
'https://www.youtube.com/playlist?list=%s' % playlist_id,
ie=YoutubeTabIE.ie_key(), video_id=playlist_id,
video_title=title)
+ continue
# video
video_id = renderer.get('videoId')
if video_id:
yield self._extract_video(renderer)
+ continue
# channel
channel_id = renderer.get('channelId')
if channel_id:
@@ -2894,6 +2438,17 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
yield self.url_result(
'https://www.youtube.com/channel/%s' % channel_id,
ie=YoutubeTabIE.ie_key(), video_title=title)
+ continue
+ # generic endpoint URL support
+ ep_url = urljoin('https://www.youtube.com/', try_get(
+ renderer, lambda x: x['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'],
+ compat_str))
+ if ep_url:
+ for ie in (YoutubeTabIE, YoutubePlaylistIE, YoutubeIE):
+ if ie.suitable(ep_url):
+ yield self.url_result(
+ ep_url, ie=ie.ie_key(), video_id=ie._match_id(ep_url), video_title=title)
+ break
def _shelf_entries_from_content(self, shelf_renderer):
content = shelf_renderer.get('content')
@@ -2986,6 +2541,14 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
for entry in self._post_thread_entries(renderer):
yield entry
+ def _rich_grid_entries(self, contents):
+ for content in contents:
+ video_renderer = try_get(content, lambda x: x['richItemRenderer']['content']['videoRenderer'], dict)
+ if video_renderer:
+ entry = self._video_entry(video_renderer)
+ if entry:
+ yield entry
+
@staticmethod
def _build_continuation_query(continuation, ctp=None):
query = {
@@ -3013,9 +2576,9 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
next_continuation = cls._extract_next_continuation_data(renderer)
if next_continuation:
return next_continuation
- contents = renderer.get('contents')
- if not isinstance(contents, list):
- return
+ contents = []
+ for key in ('contents', 'items'):
+ contents.extend(try_get(renderer, lambda x: x[key], list) or [])
for content in contents:
if not isinstance(content, dict):
continue
@@ -3031,82 +2594,111 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
ctp = continuation_ep.get('clickTrackingParams')
return YoutubeTabIE._build_continuation_query(continuation, ctp)
- def _entries(self, tab, identity_token):
+ def _entries(self, tab, item_id, webpage):
tab_content = try_get(tab, lambda x: x['content'], dict)
if not tab_content:
return
slr_renderer = try_get(tab_content, lambda x: x['sectionListRenderer'], dict)
- if not slr_renderer:
- return
- is_channels_tab = tab.get('title') == 'Channels'
- continuation = None
- slr_contents = try_get(slr_renderer, lambda x: x['contents'], list) or []
- for slr_content in slr_contents:
- if not isinstance(slr_content, dict):
- continue
- is_renderer = try_get(slr_content, lambda x: x['itemSectionRenderer'], dict)
- if not is_renderer:
- continue
- isr_contents = try_get(is_renderer, lambda x: x['contents'], list) or []
- for isr_content in isr_contents:
- if not isinstance(isr_content, dict):
- continue
- renderer = isr_content.get('playlistVideoListRenderer')
- if renderer:
- for entry in self._playlist_entries(renderer):
- yield entry
- continuation = self._extract_continuation(renderer)
- continue
- renderer = isr_content.get('gridRenderer')
- if renderer:
- for entry in self._grid_entries(renderer):
- yield entry
- continuation = self._extract_continuation(renderer)
+ if slr_renderer:
+ is_channels_tab = tab.get('title') == 'Channels'
+ continuation = None
+ slr_contents = try_get(slr_renderer, lambda x: x['contents'], list) or []
+ for slr_content in slr_contents:
+ if not isinstance(slr_content, dict):
continue
- renderer = isr_content.get('shelfRenderer')
- if renderer:
- for entry in self._shelf_entries(renderer, not is_channels_tab):
- yield entry
+ is_renderer = try_get(slr_content, lambda x: x['itemSectionRenderer'], dict)
+ if not is_renderer:
continue
- renderer = isr_content.get('backstagePostThreadRenderer')
- if renderer:
- for entry in self._post_thread_entries(renderer):
- yield entry
- continuation = self._extract_continuation(renderer)
- continue
- renderer = isr_content.get('videoRenderer')
- if renderer:
- entry = self._video_entry(renderer)
- if entry:
- yield entry
-
+ isr_contents = try_get(is_renderer, lambda x: x['contents'], list) or []
+ for isr_content in isr_contents:
+ if not isinstance(isr_content, dict):
+ continue
+ renderer = isr_content.get('playlistVideoListRenderer')
+ if renderer:
+ for entry in self._playlist_entries(renderer):
+ yield entry
+ continuation = self._extract_continuation(renderer)
+ continue
+ renderer = isr_content.get('gridRenderer')
+ if renderer:
+ for entry in self._grid_entries(renderer):
+ yield entry
+ continuation = self._extract_continuation(renderer)
+ continue
+ renderer = isr_content.get('shelfRenderer')
+ if renderer:
+ for entry in self._shelf_entries(renderer, not is_channels_tab):
+ yield entry
+ continue
+ renderer = isr_content.get('backstagePostThreadRenderer')
+ if renderer:
+ for entry in self._post_thread_entries(renderer):
+ yield entry
+ continuation = self._extract_continuation(renderer)
+ continue
+ renderer = isr_content.get('videoRenderer')
+ if renderer:
+ entry = self._video_entry(renderer)
+ if entry:
+ yield entry
+
+ if not continuation:
+ continuation = self._extract_continuation(is_renderer)
if not continuation:
- continuation = self._extract_continuation(is_renderer)
+ continuation = self._extract_continuation(slr_renderer)
+ else:
+ rich_grid_renderer = tab_content.get('richGridRenderer')
+ if not rich_grid_renderer:
+ return
+ for entry in self._rich_grid_entries(rich_grid_renderer.get('contents') or []):
+ yield entry
+ continuation = self._extract_continuation(rich_grid_renderer)
- if not continuation:
- continuation = self._extract_continuation(slr_renderer)
+ ytcfg = self._extract_ytcfg(item_id, webpage)
+ client_version = try_get(
+ ytcfg, lambda x: x['INNERTUBE_CLIENT_VERSION'], compat_str) or '2.20210407.08.00'
headers = {
'x-youtube-client-name': '1',
- 'x-youtube-client-version': '2.20201112.04.01',
+ 'x-youtube-client-version': client_version,
+ 'content-type': 'application/json',
}
+
+ context = try_get(ytcfg, lambda x: x['INNERTUBE_CONTEXT'], dict) or {
+ 'client': {
+ 'clientName': 'WEB',
+ 'clientVersion': client_version,
+ }
+ }
+ visitor_data = try_get(context, lambda x: x['client']['visitorData'], compat_str)
+
+ identity_token = self._extract_identity_token(ytcfg, webpage)
if identity_token:
headers['x-youtube-identity-token'] = identity_token
+ data = {
+ 'context': context,
+ }
+
for page_num in itertools.count(1):
if not continuation:
break
+ if visitor_data:
+ headers['x-goog-visitor-id'] = visitor_data
+ data['continuation'] = continuation['continuation']
+ data['clickTracking'] = {
+ 'clickTrackingParams': continuation['itct']
+ }
count = 0
retries = 3
while count <= retries:
try:
# Downloading page may result in intermittent 5xx HTTP error
# that is usually worked around with a retry
- browse = self._download_json(
- 'https://www.youtube.com/browse_ajax', None,
- 'Downloading page %d%s'
- % (page_num, ' (retry #%d)' % count if count else ''),
- headers=headers, query=continuation)
+ response = self._download_json(
+ 'https://www.youtube.com/youtubei/v1/browse?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
+ None, 'Downloading page %d%s' % (page_num, ' (retry #%d)' % count if count else ''),
+ headers=headers, data=json.dumps(data).encode('utf8'))
break
except ExtractorError as e:
if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503):
@@ -3114,12 +2706,12 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
if count <= retries:
continue
raise
- if not browse:
- break
- response = try_get(browse, lambda x: x[1]['response'], dict)
if not response:
break
+ visitor_data = try_get(
+ response, lambda x: x['responseContext']['visitorData'], compat_str) or visitor_data
+
continuation_contents = try_get(
response, lambda x: x['continuationContents'], dict)
if continuation_contents:
@@ -3142,12 +2734,20 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
continuation = self._extract_continuation(continuation_renderer)
continue
+ on_response_received = dict_get(response, ('onResponseReceivedActions', 'onResponseReceivedEndpoints'))
continuation_items = try_get(
- response, lambda x: x['onResponseReceivedActions'][0]['appendContinuationItemsAction']['continuationItems'], list)
+ on_response_received, lambda x: x[0]['appendContinuationItemsAction']['continuationItems'], list)
if continuation_items:
continuation_item = continuation_items[0]
if not isinstance(continuation_item, dict):
continue
+ renderer = self._extract_grid_item_renderer(continuation_item)
+ if renderer:
+ grid_renderer = {'items': continuation_items}
+ for entry in self._grid_entries(grid_renderer):
+ yield entry
+ continuation = self._extract_continuation(grid_renderer)
+ continue
renderer = continuation_item.get('playlistVideoRenderer') or continuation_item.get('itemSectionRenderer')
if renderer:
video_list_renderer = {'contents': continuation_items}
@@ -3155,6 +2755,19 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
yield entry
continuation = self._extract_continuation(video_list_renderer)
continue
+ renderer = continuation_item.get('backstagePostThreadRenderer')
+ if renderer:
+ continuation_renderer = {'contents': continuation_items}
+ for entry in self._post_thread_continuation_entries(continuation_renderer):
+ yield entry
+ continuation = self._extract_continuation(continuation_renderer)
+ continue
+ renderer = continuation_item.get('richItemRenderer')
+ if renderer:
+ for entry in self._rich_grid_entries(continuation_items):
+ yield entry
+ continuation = self._extract_continuation({'contents': continuation_items})
+ continue
break
@@ -3207,11 +2820,12 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
alerts.append(text)
return '\n'.join(alerts)
- def _extract_from_tabs(self, item_id, webpage, data, tabs, identity_token):
+ def _extract_from_tabs(self, item_id, webpage, data, tabs):
selected_tab = self._extract_selected_tab(tabs)
renderer = try_get(
data, lambda x: x['metadata']['channelMetadataRenderer'], dict)
- playlist_id = title = description = None
+ playlist_id = item_id
+ title = description = None
if renderer:
channel_title = renderer.get('title') or item_id
tab_title = selected_tab.get('title')
@@ -3220,14 +2834,18 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
title += ' - %s' % tab_title
description = renderer.get('description')
playlist_id = renderer.get('externalId')
- renderer = try_get(
- data, lambda x: x['metadata']['playlistMetadataRenderer'], dict)
- if renderer:
- title = renderer.get('title')
- description = None
- playlist_id = item_id
+ else:
+ renderer = try_get(
+ data, lambda x: x['metadata']['playlistMetadataRenderer'], dict)
+ if renderer:
+ title = renderer.get('title')
+ else:
+ renderer = try_get(
+ data, lambda x: x['header']['hashtagHeaderRenderer'], dict)
+ if renderer:
+ title = try_get(renderer, lambda x: x['hashtag']['simpleText'])
playlist = self.playlist_result(
- self._entries(selected_tab, identity_token),
+ self._entries(selected_tab, item_id, webpage),
playlist_id=playlist_id, playlist_title=title,
playlist_description=description)
playlist.update(self._extract_uploader(data))
@@ -3251,8 +2869,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
self._playlist_entries(playlist), playlist_id=playlist_id,
playlist_title=title)
- def _extract_identity_token(self, webpage, item_id):
- ytcfg = self._extract_ytcfg(item_id, webpage)
+ def _extract_identity_token(self, ytcfg, webpage):
if ytcfg:
token = try_get(ytcfg, lambda x: x['ID_TOKEN'], compat_str)
if token:
@@ -3266,7 +2883,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
url = compat_urlparse.urlunparse(
compat_urlparse.urlparse(url)._replace(netloc='www.youtube.com'))
# Handle both video/playlist URLs
- qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
+ qs = parse_qs(url)
video_id = qs.get('v', [None])[0]
playlist_id = qs.get('list', [None])[0]
if video_id and playlist_id:
@@ -3275,12 +2892,11 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id)
self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
webpage = self._download_webpage(url, item_id)
- identity_token = self._extract_identity_token(webpage, item_id)
data = self._extract_yt_initial_data(item_id, webpage)
tabs = try_get(
data, lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'], list)
if tabs:
- return self._extract_from_tabs(item_id, webpage, data, tabs, identity_token)
+ return self._extract_from_tabs(item_id, webpage, data, tabs)
playlist = try_get(
data, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist'], dict)
if playlist:
@@ -3363,12 +2979,19 @@ class YoutubePlaylistIE(InfoExtractor):
@classmethod
def suitable(cls, url):
- return False if YoutubeTabIE.suitable(url) else super(
- YoutubePlaylistIE, cls).suitable(url)
+ if YoutubeTabIE.suitable(url):
+ return False
+ # Hack for lazy extractors until more generic solution is implemented
+ # (see #28780)
+ from .youtube import parse_qs
+ qs = parse_qs(url)
+ if qs.get('v', [None])[0]:
+ return False
+ return super(YoutubePlaylistIE, cls).suitable(url)
def _real_extract(self, url):
playlist_id = self._match_id(url)
- qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
+ qs = parse_qs(url)
if not qs:
qs = {'list': playlist_id}
return self.url_result(
diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py
index 5ed2946c2..4dd56f66d 100644
--- a/youtube_dl/extractor/zdf.py
+++ b/youtube_dl/extractor/zdf.py
@@ -7,7 +7,9 @@ from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
determine_ext,
+ float_or_none,
int_or_none,
+ merge_dicts,
NO_DEFAULT,
orderedSet,
parse_codecs,
@@ -21,49 +23,17 @@ from ..utils import (
class ZDFBaseIE(InfoExtractor):
- def _call_api(self, url, player, referrer, video_id, item):
- return self._download_json(
- url, video_id, 'Downloading JSON %s' % item,
- headers={
- 'Referer': referrer,
- 'Api-Auth': 'Bearer %s' % player['apiToken'],
- })
-
- def _extract_player(self, webpage, video_id, fatal=True):
- return self._parse_json(
- self._search_regex(
- r'(?s)data-zdfplayer-jsb=(["\'])(?P<json>{.+?})\1', webpage,
- 'player JSON', default='{}' if not fatal else NO_DEFAULT,
- group='json'),
- video_id)
-
-
-class ZDFIE(ZDFBaseIE):
- _VALID_URL = r'https?://www\.zdf\.de/(?:[^/]+/)*(?P<id>[^/?]+)\.html'
- _QUALITIES = ('auto', 'low', 'med', 'high', 'veryhigh', 'hd')
_GEO_COUNTRIES = ['DE']
+ _QUALITIES = ('auto', 'low', 'med', 'high', 'veryhigh', 'hd')
- _TESTS = [{
- 'url': 'https://www.zdf.de/dokumentation/terra-x/die-magie-der-farben-von-koenigspurpur-und-jeansblau-100.html',
- 'info_dict': {
- 'id': 'die-magie-der-farben-von-koenigspurpur-und-jeansblau-100',
- 'ext': 'mp4',
- 'title': 'Die Magie der Farben (2/2)',
- 'description': 'md5:a89da10c928c6235401066b60a6d5c1a',
- 'duration': 2615,
- 'timestamp': 1465021200,
- 'upload_date': '20160604',
- },
- }, {
- 'url': 'https://www.zdf.de/service-und-hilfe/die-neue-zdf-mediathek/zdfmediathek-trailer-100.html',
- 'only_matching': True,
- }, {
- 'url': 'https://www.zdf.de/filme/taunuskrimi/die-lebenden-und-die-toten-1---ein-taunuskrimi-100.html',
- 'only_matching': True,
- }, {
- 'url': 'https://www.zdf.de/dokumentation/planet-e/planet-e-uebersichtsseite-weitere-dokumentationen-von-planet-e-100.html',
- 'only_matching': True,
- }]
+ def _call_api(self, url, video_id, item, api_token=None, referrer=None):
+ headers = {}
+ if api_token:
+ headers['Api-Auth'] = 'Bearer %s' % api_token
+ if referrer:
+ headers['Referer'] = referrer
+ return self._download_json(
+ url, video_id, 'Downloading JSON %s' % item, headers=headers)
@staticmethod
def _extract_subtitles(src):
@@ -109,20 +79,11 @@ class ZDFIE(ZDFBaseIE):
})
formats.append(f)
- def _extract_entry(self, url, player, content, video_id):
- title = content.get('title') or content['teaserHeadline']
-
- t = content['mainVideoContent']['http://zdf.de/rels/target']
-
- ptmd_path = t.get('http://zdf.de/rels/streams/ptmd')
-
- if not ptmd_path:
- ptmd_path = t[
- 'http://zdf.de/rels/streams/ptmd-template'].replace(
- '{playerId}', 'ngplayer_2_4')
-
+ def _extract_ptmd(self, ptmd_url, video_id, api_token, referrer):
ptmd = self._call_api(
- urljoin(url, ptmd_path), player, url, video_id, 'metadata')
+ ptmd_url, video_id, 'metadata', api_token, referrer)
+
+ content_id = ptmd.get('basename') or ptmd_url.split('/')[-1]
formats = []
track_uris = set()
@@ -140,7 +101,7 @@ class ZDFIE(ZDFBaseIE):
continue
for track in tracks:
self._extract_format(
- video_id, formats, track_uris, {
+ content_id, formats, track_uris, {
'url': track.get('uri'),
'type': f.get('type'),
'mimeType': f.get('mimeType'),
@@ -149,6 +110,103 @@ class ZDFIE(ZDFBaseIE):
})
self._sort_formats(formats)
+ duration = float_or_none(try_get(
+ ptmd, lambda x: x['attributes']['duration']['value']), scale=1000)
+
+ return {
+ 'extractor_key': ZDFIE.ie_key(),
+ 'id': content_id,
+ 'duration': duration,
+ 'formats': formats,
+ 'subtitles': self._extract_subtitles(ptmd),
+ }
+
+ def _extract_player(self, webpage, video_id, fatal=True):
+ return self._parse_json(
+ self._search_regex(
+ r'(?s)data-zdfplayer-jsb=(["\'])(?P<json>{.+?})\1', webpage,
+ 'player JSON', default='{}' if not fatal else NO_DEFAULT,
+ group='json'),
+ video_id)
+
+
+class ZDFIE(ZDFBaseIE):
+ _VALID_URL = r'https?://www\.zdf\.de/(?:[^/]+/)*(?P<id>[^/?#&]+)\.html'
+ _TESTS = [{
+ # Same as https://www.phoenix.de/sendungen/ereignisse/corona-nachgehakt/wohin-fuehrt-der-protest-in-der-pandemie-a-2050630.html
+ 'url': 'https://www.zdf.de/politik/phoenix-sendungen/wohin-fuehrt-der-protest-in-der-pandemie-100.html',
+ 'md5': '34ec321e7eb34231fd88616c65c92db0',
+ 'info_dict': {
+ 'id': '210222_phx_nachgehakt_corona_protest',
+ 'ext': 'mp4',
+ 'title': 'Wohin führt der Protest in der Pandemie?',
+ 'description': 'md5:7d643fe7f565e53a24aac036b2122fbd',
+ 'duration': 1691,
+ 'timestamp': 1613948400,
+ 'upload_date': '20210221',
+ },
+ }, {
+ # Same as https://www.3sat.de/film/ab-18/10-wochen-sommer-108.html
+ 'url': 'https://www.zdf.de/dokumentation/ab-18/10-wochen-sommer-102.html',
+ 'md5': '0aff3e7bc72c8813f5e0fae333316a1d',
+ 'info_dict': {
+ 'id': '141007_ab18_10wochensommer_film',
+ 'ext': 'mp4',
+ 'title': 'Ab 18! - 10 Wochen Sommer',
+ 'description': 'md5:8253f41dc99ce2c3ff892dac2d65fe26',
+ 'duration': 2660,
+ 'timestamp': 1608604200,
+ 'upload_date': '20201222',
+ },
+ }, {
+ 'url': 'https://www.zdf.de/dokumentation/terra-x/die-magie-der-farben-von-koenigspurpur-und-jeansblau-100.html',
+ 'info_dict': {
+ 'id': '151025_magie_farben2_tex',
+ 'ext': 'mp4',
+ 'title': 'Die Magie der Farben (2/2)',
+ 'description': 'md5:a89da10c928c6235401066b60a6d5c1a',
+ 'duration': 2615,
+ 'timestamp': 1465021200,
+ 'upload_date': '20160604',
+ },
+ }, {
+ # Same as https://www.phoenix.de/sendungen/dokumentationen/gesten-der-maechtigen-i-a-89468.html?ref=suche
+ 'url': 'https://www.zdf.de/politik/phoenix-sendungen/die-gesten-der-maechtigen-100.html',
+ 'only_matching': True,
+ }, {
+ # Same as https://www.3sat.de/film/spielfilm/der-hauptmann-100.html
+ 'url': 'https://www.zdf.de/filme/filme-sonstige/der-hauptmann-112.html',
+ 'only_matching': True,
+ }, {
+ # Same as https://www.3sat.de/wissen/nano/nano-21-mai-2019-102.html, equal media ids
+ 'url': 'https://www.zdf.de/wissen/nano/nano-21-mai-2019-102.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.zdf.de/service-und-hilfe/die-neue-zdf-mediathek/zdfmediathek-trailer-100.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.zdf.de/filme/taunuskrimi/die-lebenden-und-die-toten-1---ein-taunuskrimi-100.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.zdf.de/dokumentation/planet-e/planet-e-uebersichtsseite-weitere-dokumentationen-von-planet-e-100.html',
+ 'only_matching': True,
+ }]
+
+ def _extract_entry(self, url, player, content, video_id):
+ title = content.get('title') or content['teaserHeadline']
+
+ t = content['mainVideoContent']['http://zdf.de/rels/target']
+
+ ptmd_path = t.get('http://zdf.de/rels/streams/ptmd')
+
+ if not ptmd_path:
+ ptmd_path = t[
+ 'http://zdf.de/rels/streams/ptmd-template'].replace(
+ '{playerId}', 'ngplayer_2_4')
+
+ info = self._extract_ptmd(
+ urljoin(url, ptmd_path), video_id, player['apiToken'], url)
+
thumbnails = []
layouts = try_get(
content, lambda x: x['teaserImageRef']['layouts'], dict)
@@ -169,33 +227,33 @@ class ZDFIE(ZDFBaseIE):
})
thumbnails.append(thumbnail)
- return {
- 'id': video_id,
+ return merge_dicts(info, {
'title': title,
'description': content.get('leadParagraph') or content.get('teasertext'),
'duration': int_or_none(t.get('duration')),
'timestamp': unified_timestamp(content.get('editorialDate')),
'thumbnails': thumbnails,
- 'subtitles': self._extract_subtitles(ptmd),
- 'formats': formats,
- }
+ })
def _extract_regular(self, url, player, video_id):
content = self._call_api(
- player['content'], player, url, video_id, 'content')
+ player['content'], video_id, 'content', player['apiToken'], url)
return self._extract_entry(player['content'], player, content, video_id)
def _extract_mobile(self, video_id):
- document = self._download_json(
+ video = self._download_json(
'https://zdf-cdn.live.cellular.de/mediathekV2/document/%s' % video_id,
- video_id)['document']
+ video_id)
+
+ document = video['document']
title = document['titel']
+ content_id = document['basename']
formats = []
format_urls = set()
for f in document['formitaeten']:
- self._extract_format(video_id, formats, format_urls, f)
+ self._extract_format(content_id, formats, format_urls, f)
self._sort_formats(formats)
thumbnails = []
@@ -213,12 +271,12 @@ class ZDFIE(ZDFBaseIE):
})
return {
- 'id': video_id,
+ 'id': content_id,
'title': title,
'description': document.get('beschreibung'),
'duration': int_or_none(document.get('length')),
- 'timestamp': unified_timestamp(try_get(
- document, lambda x: x['meta']['editorialDate'], compat_str)),
+ 'timestamp': unified_timestamp(document.get('date')) or unified_timestamp(
+ try_get(video, lambda x: x['meta']['editorialDate'], compat_str)),
'thumbnails': thumbnails,
'subtitles': self._extract_subtitles(document),
'formats': formats,
diff --git a/youtube_dl/extractor/zhihu.py b/youtube_dl/extractor/zhihu.py
new file mode 100644
index 000000000..d1ed55be3
--- /dev/null
+++ b/youtube_dl/extractor/zhihu.py
@@ -0,0 +1,69 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import float_or_none, int_or_none
+
+
+class ZhihuIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?zhihu\.com/zvideo/(?P<id>[0-9]+)'
+ _TEST = {
+ 'url': 'https://www.zhihu.com/zvideo/1342930761977176064',
+ 'md5': 'c8d4c9cd72dd58e6f9bc9c2c84266464',
+ 'info_dict': {
+ 'id': '1342930761977176064',
+ 'ext': 'mp4',
+ 'title': '写春联也太难了吧!',
+ 'thumbnail': r're:^https?://.*\.jpg',
+ 'uploader': '桥半舫',
+ 'timestamp': 1612959715,
+ 'upload_date': '20210210',
+ 'uploader_id': '244ecb13b0fd7daf92235288c8ca3365',
+ 'duration': 146.333,
+ 'view_count': int,
+ 'like_count': int,
+ 'comment_count': int,
+ }
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ zvideo = self._download_json(
+ 'https://www.zhihu.com/api/v4/zvideos/' + video_id, video_id)
+ title = zvideo['title']
+ video = zvideo.get('video') or {}
+
+ formats = []
+ for format_id, q in (video.get('playlist') or {}).items():
+ play_url = q.get('url') or q.get('play_url')
+ if not play_url:
+ continue
+ formats.append({
+ 'asr': int_or_none(q.get('sample_rate')),
+ 'filesize': int_or_none(q.get('size')),
+ 'format_id': format_id,
+ 'fps': int_or_none(q.get('fps')),
+ 'height': int_or_none(q.get('height')),
+ 'tbr': float_or_none(q.get('bitrate')),
+ 'url': play_url,
+ 'width': int_or_none(q.get('width')),
+ })
+ self._sort_formats(formats)
+
+ author = zvideo.get('author') or {}
+ url_token = author.get('url_token')
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'formats': formats,
+ 'thumbnail': video.get('thumbnail') or zvideo.get('image_url'),
+ 'uploader': author.get('name'),
+ 'timestamp': int_or_none(zvideo.get('published_at')),
+ 'uploader_id': author.get('id'),
+ 'uploader_url': 'https://www.zhihu.com/people/' + url_token if url_token else None,
+ 'duration': float_or_none(video.get('duration')),
+ 'view_count': int_or_none(zvideo.get('play_count')),
+ 'like_count': int_or_none(zvideo.get('liked_count')),
+ 'comment_count': int_or_none(zvideo.get('comment_count')),
+ }
diff --git a/youtube_dl/extractor/zingmp3.py b/youtube_dl/extractor/zingmp3.py
index adfdcaabf..207c04f5e 100644
--- a/youtube_dl/extractor/zingmp3.py
+++ b/youtube_dl/extractor/zingmp3.py
@@ -1,93 +1,94 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
from ..utils import (
ExtractorError,
int_or_none,
- update_url_query,
)
-class ZingMp3BaseInfoExtractor(InfoExtractor):
+class ZingMp3BaseIE(InfoExtractor):
+ _VALID_URL_TMPL = r'https?://(?:mp3\.zing|zingmp3)\.vn/(?:%s)/[^/]+/(?P<id>\w+)\.html'
+ _GEO_COUNTRIES = ['VN']
- def _extract_item(self, item, page_type, fatal=True):
- error_message = item.get('msg')
- if error_message:
- if not fatal:
- return
- raise ExtractorError(
- '%s returned error: %s' % (self.IE_NAME, error_message),
- expected=True)
+ def _extract_item(self, item, fatal):
+ item_id = item['id']
+ title = item.get('name') or item['title']
formats = []
- for quality, source_url in zip(item.get('qualities') or item.get('quality', []), item.get('source_list') or item.get('source', [])):
- if not source_url or source_url == 'require vip':
+ for k, v in (item.get('source') or {}).items():
+ if not v:
continue
- if not re.match(r'https?://', source_url):
- source_url = '//' + source_url
- source_url = self._proto_relative_url(source_url, 'http:')
- quality_num = int_or_none(quality)
- f = {
- 'format_id': quality,
- 'url': source_url,
- }
- if page_type == 'video':
- f.update({
- 'height': quality_num,
- 'ext': 'mp4',
- })
+ if k in ('mp4', 'hls'):
+ for res, video_url in v.items():
+ if not video_url:
+ continue
+ if k == 'hls':
+ formats.extend(self._extract_m3u8_formats(
+ video_url, item_id, 'mp4',
+ 'm3u8_native', m3u8_id=k, fatal=False))
+ elif k == 'mp4':
+ formats.append({
+ 'format_id': 'mp4-' + res,
+ 'url': video_url,
+ 'height': int_or_none(self._search_regex(
+ r'^(\d+)p', res, 'resolution', default=None)),
+ })
else:
- f.update({
- 'abr': quality_num,
+ formats.append({
'ext': 'mp3',
+ 'format_id': k,
+ 'tbr': int_or_none(k),
+ 'url': self._proto_relative_url(v),
+ 'vcodec': 'none',
})
- formats.append(f)
+ if not formats:
+ if not fatal:
+ return
+ msg = item['msg']
+ if msg == 'Sorry, this content is not available in your country.':
+ self.raise_geo_restricted(countries=self._GEO_COUNTRIES)
+ raise ExtractorError(msg, expected=True)
+ self._sort_formats(formats)
+
+ subtitles = None
+ lyric = item.get('lyric')
+ if lyric:
+ subtitles = {
+ 'origin': [{
+ 'url': lyric,
+ }],
+ }
- cover = item.get('cover')
+ album = item.get('album') or {}
return {
- 'title': (item.get('name') or item.get('title')).strip(),
+ 'id': item_id,
+ 'title': title,
'formats': formats,
- 'thumbnail': 'http:/' + cover if cover else None,
- 'artist': item.get('artist'),
+ 'thumbnail': item.get('thumbnail'),
+ 'subtitles': subtitles,
+ 'duration': int_or_none(item.get('duration')),
+ 'track': title,
+ 'artist': item.get('artists_names'),
+ 'album': album.get('name') or album.get('title'),
+ 'album_artist': album.get('artists_names'),
}
- def _extract_player_json(self, player_json_url, id, page_type, playlist_title=None):
- player_json = self._download_json(player_json_url, id, 'Downloading Player JSON')
- items = player_json['data']
- if 'item' in items:
- items = items['item']
-
- if len(items) == 1:
- # one single song
- data = self._extract_item(items[0], page_type)
- data['id'] = id
-
- return data
- else:
- # playlist of songs
- entries = []
-
- for i, item in enumerate(items, 1):
- entry = self._extract_item(item, page_type, fatal=False)
- if not entry:
- continue
- entry['id'] = '%s-%d' % (id, i)
- entries.append(entry)
-
- return {
- '_type': 'playlist',
- 'id': id,
- 'title': playlist_title,
- 'entries': entries,
- }
+ def _real_extract(self, url):
+ page_id = self._match_id(url)
+ webpage = self._download_webpage(
+ url.replace('://zingmp3.vn/', '://mp3.zing.vn/'),
+ page_id, query={'play_song': 1})
+ data_path = self._search_regex(
+ r'data-xml="([^"]+)', webpage, 'data path')
+ return self._process_data(self._download_json(
+ 'https://mp3.zing.vn/xhr' + data_path, page_id)['data'])
-class ZingMp3IE(ZingMp3BaseInfoExtractor):
- _VALID_URL = r'https?://mp3\.zing\.vn/(?:bai-hat|album|playlist|video-clip)/[^/]+/(?P<id>\w+)\.html'
+class ZingMp3IE(ZingMp3BaseIE):
+ _VALID_URL = ZingMp3BaseIE._VALID_URL_TMPL % 'bai-hat|video-clip'
_TESTS = [{
'url': 'http://mp3.zing.vn/bai-hat/Xa-Mai-Xa-Bao-Thy/ZWZB9WAB.html',
'md5': 'ead7ae13693b3205cbc89536a077daed',
@@ -95,49 +96,66 @@ class ZingMp3IE(ZingMp3BaseInfoExtractor):
'id': 'ZWZB9WAB',
'title': 'Xa Mãi Xa',
'ext': 'mp3',
- 'thumbnail': r're:^https?://.*\.jpg$',
+ 'thumbnail': r're:^https?://.+\.jpg',
+ 'subtitles': {
+ 'origin': [{
+ 'ext': 'lrc',
+ }]
+ },
+ 'duration': 255,
+ 'track': 'Xa Mãi Xa',
+ 'artist': 'Bảo Thy',
+ 'album': 'Special Album',
+ 'album_artist': 'Bảo Thy',
},
}, {
- 'url': 'http://mp3.zing.vn/video-clip/Let-It-Go-Frozen-OST-Sungha-Jung/ZW6BAEA0.html',
- 'md5': '870295a9cd8045c0e15663565902618d',
+ 'url': 'https://mp3.zing.vn/video-clip/Suong-Hoa-Dua-Loi-K-ICM-RYO/ZO8ZF7C7.html',
+ 'md5': 'e9c972b693aa88301ef981c8151c4343',
'info_dict': {
- 'id': 'ZW6BAEA0',
- 'title': 'Let It Go (Frozen OST)',
+ 'id': 'ZO8ZF7C7',
+ 'title': 'Sương Hoa Đưa Lối',
'ext': 'mp4',
+ 'thumbnail': r're:^https?://.+\.jpg',
+ 'duration': 207,
+ 'track': 'Sương Hoa Đưa Lối',
+ 'artist': 'K-ICM, RYO',
},
}, {
+ 'url': 'https://zingmp3.vn/bai-hat/Xa-Mai-Xa-Bao-Thy/ZWZB9WAB.html',
+ 'only_matching': True,
+ }]
+ IE_NAME = 'zingmp3'
+ IE_DESC = 'mp3.zing.vn'
+
+ def _process_data(self, data):
+ return self._extract_item(data, True)
+
+
+class ZingMp3AlbumIE(ZingMp3BaseIE):
+ _VALID_URL = ZingMp3BaseIE._VALID_URL_TMPL % 'album|playlist'
+ _TESTS = [{
'url': 'http://mp3.zing.vn/album/Lau-Dai-Tinh-Ai-Bang-Kieu-Minh-Tuyet/ZWZBWDAF.html',
'info_dict': {
'_type': 'playlist',
'id': 'ZWZBWDAF',
- 'title': 'Lâu Đài Tình Ái - Bằng Kiều,Minh Tuyết | Album 320 lossless',
+ 'title': 'Lâu Đài Tình Ái',
},
'playlist_count': 10,
- 'skip': 'removed at the request of the owner',
}, {
'url': 'http://mp3.zing.vn/playlist/Duong-Hong-Loan-apollobee/IWCAACCB.html',
'only_matching': True,
+ }, {
+ 'url': 'https://zingmp3.vn/album/Lau-Dai-Tinh-Ai-Bang-Kieu-Minh-Tuyet/ZWZBWDAF.html',
+ 'only_matching': True,
}]
- IE_NAME = 'zingmp3'
- IE_DESC = 'mp3.zing.vn'
-
- def _real_extract(self, url):
- page_id = self._match_id(url)
-
- webpage = self._download_webpage(url, page_id)
-
- player_json_url = self._search_regex([
- r'data-xml="([^"]+)',
- r'&amp;xmlURL=([^&]+)&'
- ], webpage, 'player xml url')
-
- playlist_title = None
- page_type = self._search_regex(r'/(?:html5)?xml/([^/-]+)', player_json_url, 'page type')
- if page_type == 'video':
- player_json_url = update_url_query(player_json_url, {'format': 'json'})
- else:
- player_json_url = player_json_url.replace('/xml/', '/html5xml/')
- if page_type == 'album':
- playlist_title = self._og_search_title(webpage)
-
- return self._extract_player_json(player_json_url, page_id, page_type, playlist_title)
+ IE_NAME = 'zingmp3:album'
+
+ def _process_data(self, data):
+ def entries():
+ for item in (data.get('items') or []):
+ entry = self._extract_item(item, False)
+ if entry:
+ yield entry
+ info = data.get('info') or {}
+ return self.playlist_result(
+ entries(), info.get('id'), info.get('name') or info.get('title'))
diff --git a/youtube_dl/extractor/zoom.py b/youtube_dl/extractor/zoom.py
new file mode 100644
index 000000000..db073d91d
--- /dev/null
+++ b/youtube_dl/extractor/zoom.py
@@ -0,0 +1,68 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ int_or_none,
+ js_to_json,
+ parse_filesize,
+ urlencode_postdata,
+)
+
+
+class ZoomIE(InfoExtractor):
+ IE_NAME = 'zoom'
+    _VALID_URL = r'(?P<base_url>https?://(?:[^.]+\.)?zoom\.us/)rec(?:ording)?/(?:play|share)/(?P<id>[A-Za-z0-9_.-]+)'
+ _TEST = {
+ 'url': 'https://economist.zoom.us/rec/play/dUk_CNBETmZ5VA2BwEl-jjakPpJ3M1pcfVYAPRsoIbEByGsLjUZtaa4yCATQuOL3der8BlTwxQePl_j0.EImBkXzTIaPvdZO5',
+ 'md5': 'ab445e8c911fddc4f9adc842c2c5d434',
+ 'info_dict': {
+ 'id': 'dUk_CNBETmZ5VA2BwEl-jjakPpJ3M1pcfVYAPRsoIbEByGsLjUZtaa4yCATQuOL3der8BlTwxQePl_j0.EImBkXzTIaPvdZO5',
+ 'ext': 'mp4',
+ 'title': 'China\'s "two sessions" and the new five-year plan',
+ }
+ }
+
+ def _real_extract(self, url):
+ base_url, play_id = re.match(self._VALID_URL, url).groups()
+ webpage = self._download_webpage(url, play_id)
+
+ try:
+ form = self._form_hidden_inputs('password_form', webpage)
+ except ExtractorError:
+ form = None
+ if form:
+ password = self._downloader.params.get('videopassword')
+ if not password:
+ raise ExtractorError(
+ 'This video is protected by a passcode, use the --video-password option', expected=True)
+ is_meeting = form.get('useWhichPasswd') == 'meeting'
+ validation = self._download_json(
+ base_url + 'rec/validate%s_passwd' % ('_meet' if is_meeting else ''),
+ play_id, 'Validating passcode', 'Wrong passcode', data=urlencode_postdata({
+ 'id': form[('meet' if is_meeting else 'file') + 'Id'],
+ 'passwd': password,
+ 'action': form.get('action'),
+ }))
+ if not validation.get('status'):
+ raise ExtractorError(validation['errorMessage'], expected=True)
+ webpage = self._download_webpage(url, play_id)
+
+ data = self._parse_json(self._search_regex(
+ r'(?s)window\.__data__\s*=\s*({.+?});',
+ webpage, 'data'), play_id, js_to_json)
+
+ return {
+ 'id': play_id,
+ 'title': data['topic'],
+ 'url': data['viewMp4Url'],
+ 'width': int_or_none(data.get('viewResolvtionsWidth')),
+ 'height': int_or_none(data.get('viewResolvtionsHeight')),
+ 'http_headers': {
+ 'Referer': base_url,
+ },
+ 'filesize_approx': parse_filesize(data.get('fileSize')),
+ }
diff --git a/youtube_dl/extractor/zype.py b/youtube_dl/extractor/zype.py
index 5288f40d8..f20f953cb 100644
--- a/youtube_dl/extractor/zype.py
+++ b/youtube_dl/extractor/zype.py
@@ -87,11 +87,16 @@ class ZypeIE(InfoExtractor):
r'(["\'])(?P<url>(?:(?!\1).)+\.m3u8(?:(?!\1).)*)\1',
body, 'm3u8 url', group='url', default=None)
if not m3u8_url:
- source = self._parse_json(self._search_regex(
- r'(?s)sources\s*:\s*\[\s*({.+?})\s*\]', body,
- 'source'), video_id, js_to_json)
- if source.get('integration') == 'verizon-media':
- m3u8_url = 'https://content.uplynk.com/%s.m3u8' % source['id']
+ source = self._search_regex(
+ r'(?s)sources\s*:\s*\[\s*({.+?})\s*\]', body, 'source')
+
+ def get_attr(key):
+ return self._search_regex(
+ r'\b%s\s*:\s*([\'"])(?P<val>(?:(?!\1).)+)\1' % key,
+ source, key, group='val')
+
+ if get_attr('integration') == 'verizon-media':
+ m3u8_url = 'https://content.uplynk.com/%s.m3u8' % get_attr('id')
formats = self._extract_m3u8_formats(
m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls')
text_tracks = self._search_regex(
diff --git a/youtube_dl/options.py b/youtube_dl/options.py
index 3000ba41e..0a0641bd4 100644
--- a/youtube_dl/options.py
+++ b/youtube_dl/options.py
@@ -690,6 +690,10 @@ def parseOpts(overrideArguments=None):
dest='outtmpl', metavar='TEMPLATE',
help=('Output filename template, see the "OUTPUT TEMPLATE" for all the info'))
filesystem.add_option(
+ '--output-na-placeholder',
+ dest='outtmpl_na_placeholder', metavar='PLACEHOLDER', default='NA',
+ help=('Placeholder value for unavailable meta fields in output filename template (default is "%default")'))
+ filesystem.add_option(
'--autonumber-size',
dest='autonumber_size', metavar='NUMBER', type=int,
help=optparse.SUPPRESS_HELP)
@@ -764,7 +768,7 @@ def parseOpts(overrideArguments=None):
action='store_true', dest='rm_cachedir',
help='Delete all filesystem cache files')
- thumbnail = optparse.OptionGroup(parser, 'Thumbnail images')
+ thumbnail = optparse.OptionGroup(parser, 'Thumbnail Options')
thumbnail.add_option(
'--write-thumbnail',
action='store_true', dest='writethumbnail', default=False,
@@ -782,7 +786,7 @@ def parseOpts(overrideArguments=None):
postproc.add_option(
'-x', '--extract-audio',
action='store_true', dest='extractaudio', default=False,
- help='Convert video files to audio-only files (requires ffmpeg or avconv and ffprobe or avprobe)')
+ help='Convert video files to audio-only files (requires ffmpeg/avconv and ffprobe/avprobe)')
postproc.add_option(
'--audio-format', metavar='FORMAT', dest='audioformat', default='best',
help='Specify audio format: "best", "aac", "flac", "mp3", "m4a", "opus", "vorbis", or "wav"; "%default" by default; No effect without -x')
diff --git a/youtube_dl/postprocessor/embedthumbnail.py b/youtube_dl/postprocessor/embedthumbnail.py
index 5a3359588..3990908b6 100644
--- a/youtube_dl/postprocessor/embedthumbnail.py
+++ b/youtube_dl/postprocessor/embedthumbnail.py
@@ -89,10 +89,14 @@ class EmbedThumbnailPP(FFmpegPostProcessor):
os.rename(encodeFilename(temp_filename), encodeFilename(filename))
elif info['ext'] in ['m4a', 'mp4']:
- if not check_executable('AtomicParsley', ['-v']):
+ atomicparsley = next((x
+ for x in ['AtomicParsley', 'atomicparsley']
+ if check_executable(x, ['-v'])), None)
+
+ if atomicparsley is None:
raise EmbedThumbnailPPError('AtomicParsley was not found. Please install.')
- cmd = [encodeFilename('AtomicParsley', True),
+ cmd = [encodeFilename(atomicparsley, True),
encodeFilename(filename, True),
encodeArgument('--artwork'),
encodeFilename(thumbnail_filename, True),
diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py
index 5f7298345..9f76c9d4e 100644
--- a/youtube_dl/postprocessor/ffmpeg.py
+++ b/youtube_dl/postprocessor/ffmpeg.py
@@ -231,7 +231,10 @@ class FFmpegPostProcessor(PostProcessor):
stdout, stderr = p.communicate()
if p.returncode != 0:
stderr = stderr.decode('utf-8', 'replace')
- msg = stderr.strip().split('\n')[-1]
+ msgs = stderr.strip().split('\n')
+ msg = msgs[-1]
+ if self._downloader.params.get('verbose', False):
+ self._downloader.to_screen('[debug] ' + '\n'.join(msgs[:-1]))
raise FFmpegPostProcessorError(msg)
self.try_utime(out_path, oldest_mtime, oldest_mtime)
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index 8e4d144c9..e722eed58 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -39,6 +39,7 @@ import zlib
from .compat import (
compat_HTMLParseError,
compat_HTMLParser,
+ compat_HTTPError,
compat_basestring,
compat_chr,
compat_cookiejar,
@@ -2879,12 +2880,60 @@ class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
- if sys.version_info[0] < 3:
- def redirect_request(self, req, fp, code, msg, headers, newurl):
- # On python 2 urlh.geturl() may sometimes return redirect URL
- # as byte string instead of unicode. This workaround allows
- # to force it always return unicode.
- return compat_urllib_request.HTTPRedirectHandler.redirect_request(self, req, fp, code, msg, headers, compat_str(newurl))
+ """YoutubeDL redirect handler
+
+ The code is based on HTTPRedirectHandler implementation from CPython [1].
+
+ This redirect handler solves two issues:
+ - ensures redirect URL is always unicode under python 2
+ - introduces support for experimental HTTP response status code
+ 308 Permanent Redirect [2] used by some sites [3]
+
+ 1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
+ 2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
+ 3. https://github.com/ytdl-org/youtube-dl/issues/28768
+ """
+
+ http_error_301 = http_error_303 = http_error_307 = http_error_308 = compat_urllib_request.HTTPRedirectHandler.http_error_302
+
+ def redirect_request(self, req, fp, code, msg, headers, newurl):
+ """Return a Request or None in response to a redirect.
+
+ This is called by the http_error_30x methods when a
+ redirection response is received. If a redirection should
+ take place, return a new Request to allow http_error_30x to
+ perform the redirect. Otherwise, raise HTTPError if no-one
+ else should try to handle this url. Return None if you can't
+ but another Handler might.
+ """
+ m = req.get_method()
+ if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
+ or code in (301, 302, 303) and m == "POST")):
+ raise compat_HTTPError(req.full_url, code, msg, headers, fp)
+ # Strictly (according to RFC 2616), 301 or 302 in response to
+ # a POST MUST NOT cause a redirection without confirmation
+ # from the user (of urllib.request, in this case). In practice,
+ # essentially all clients do redirect in this case, so we do
+ # the same.
+
+ # On python 2 urlh.geturl() may sometimes return redirect URL
+ # as byte string instead of unicode. This workaround allows
+ # to force it always return unicode.
+ if sys.version_info[0] < 3:
+ newurl = compat_str(newurl)
+
+ # Be conciliant with URIs containing a space. This is mainly
+ # redundant with the more complete encoding done in http_error_302(),
+ # but it is kept for compatibility with other callers.
+ newurl = newurl.replace(' ', '%20')
+
+ CONTENT_HEADERS = ("content-length", "content-type")
+ # NB: don't use dict comprehension for python 2.6 compatibility
+ newheaders = dict((k, v) for k, v in req.headers.items()
+ if k.lower() not in CONTENT_HEADERS)
+ return compat_urllib_request.Request(
+ newurl, headers=newheaders, origin_req_host=req.origin_req_host,
+ unverifiable=True)
def extract_timezone(date_str):
diff --git a/youtube_dl/version.py b/youtube_dl/version.py
index 0d9659b2b..b82fbc702 100644
--- a/youtube_dl/version.py
+++ b/youtube_dl/version.py
@@ -1,3 +1,3 @@
from __future__ import unicode_literals
-__version__ = '2021.01.08'
+__version__ = '2021.12.17'