aboutsummaryrefslogtreecommitdiff
path: root/youtube_dl/extractor/common.py
diff options
context:
space:
mode:
Diffstat (limited to 'youtube_dl/extractor/common.py')
-rw-r--r--youtube_dl/extractor/common.py1943
1 files changed, 1431 insertions, 512 deletions
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index dcc9d628a..a64fcfccc 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -2,7 +2,9 @@
from __future__ import unicode_literals
import base64
+import collections
import datetime
+import functools
import hashlib
import json
import netrc
@@ -10,16 +12,22 @@ import os
import random
import re
import socket
+import ssl
import sys
import time
import math
from ..compat import (
- compat_cookiejar,
- compat_cookies,
+ compat_cookiejar_Cookie,
+ compat_cookies_SimpleCookie,
+ compat_etree_Element,
compat_etree_fromstring,
compat_getpass,
+ compat_integer_types,
compat_http_client,
+ compat_kwargs,
+ compat_map as map,
+ compat_open as open,
compat_os_name,
compat_str,
compat_urllib_error,
@@ -27,8 +35,13 @@ from ..compat import (
compat_urllib_parse_urlencode,
compat_urllib_request,
compat_urlparse,
+ compat_xml_parse_error,
+ compat_zip as zip,
+)
+from ..downloader.f4m import (
+ get_base_url,
+ remove_encrypted_media,
)
-from ..downloader.f4m import remove_encrypted_media
from ..utils import (
NO_DEFAULT,
age_restricted,
@@ -38,6 +51,7 @@ from ..utils import (
compiled_regex_type,
determine_ext,
determine_protocol,
+ dict_get,
error_to_compat_str,
ExtractorError,
extract_attributes,
@@ -46,16 +60,26 @@ from ..utils import (
GeoRestrictedError,
GeoUtils,
int_or_none,
+ join_nonempty,
js_to_json,
+ JSON_LD_RE,
mimetype2ext,
orderedSet,
+ parse_bitrate,
parse_codecs,
parse_duration,
parse_iso8601,
parse_m3u8_attributes,
+ parse_resolution,
RegexNotFoundError,
sanitized_Request,
sanitize_filename,
+ str_or_none,
+ str_to_int,
+ strip_or_none,
+ T,
+ traverse_obj,
+ try_get,
unescapeHTML,
unified_strdate,
unified_timestamp,
@@ -63,6 +87,8 @@ from ..utils import (
update_url_query,
urljoin,
url_basename,
+ url_or_none,
+ variadic,
xpath_element,
xpath_text,
xpath_with_ns,
@@ -95,10 +121,26 @@ class InfoExtractor(object):
from worst to best quality.
Potential fields:
- * url Mandatory. The URL of the video file
+ * url The mandatory URL representing the media:
+ for plain file media - HTTP URL of this file,
+ for RTMP - RTMP URL,
+ for HLS - URL of the M3U8 media playlist,
+ for HDS - URL of the F4M manifest,
+ for DASH
+ - HTTP URL to plain file media (in case of
+ unfragmented media)
+ - URL of the MPD manifest or base URL
+ representing the media if MPD manifest
+ is parsed from a string (in case of
+ fragmented media)
+ for MSS - URL of the ISM manifest.
* manifest_url
The URL of the manifest file in case of
- fragmented media (DASH, hls, hds)
+ fragmented media:
+ for HLS - URL of the M3U8 master playlist,
+ for HDS - URL of the F4M manifest,
+ for DASH - URL of the MPD manifest,
+ for MSS - URL of the ISM manifest.
* ext Will be calculated from URL if missing
* format A human-readable description of the format
("mp4 container with h264/opus").
@@ -142,6 +184,8 @@ class InfoExtractor(object):
fragment_base_url
* "duration" (optional, int or float)
* "filesize" (optional, int)
+ * "range" (optional, str of the form "start-end"
+ to use in HTTP Range header)
* preference Order number of this format. If this field is
present and not None, the formats get sorted
by this field, regardless of all other values.
@@ -170,6 +214,8 @@ class InfoExtractor(object):
width : height ratio as float.
* no_resume The server does not support resuming the
(HTTP or RTMP) download. Boolean.
+ * downloader_options A dictionary of downloader options as
+ described in FileDownloader
url: Final video URL.
ext: Video filename extension.
@@ -189,7 +235,7 @@ class InfoExtractor(object):
* "preference" (optional, int) - quality of the image
* "width" (optional, int)
* "height" (optional, int)
- * "resolution" (optional, string "{width}x{height"},
+ * "resolution" (optional, string "{width}x{height}",
deprecated)
* "filesize" (optional, int)
thumbnail: Full URL to a video thumbnail image.
@@ -197,12 +243,19 @@ class InfoExtractor(object):
uploader: Full name of the video uploader.
license: License name the video is licensed under.
creator: The creator of the video.
+ release_timestamp: UNIX timestamp of the moment the video was released.
release_date: The date (YYYYMMDD) when the video was released.
- timestamp: UNIX timestamp of the moment the video became available.
+ timestamp: UNIX timestamp of the moment the video became available
+ (uploaded).
upload_date: Video upload date (YYYYMMDD).
If not explicitly set, calculated from timestamp.
uploader_id: Nickname or id of the video uploader.
uploader_url: Full URL to a personal webpage of the video uploader.
+ channel: Full name of the channel the video is uploaded on.
+ Note that channel fields may or may not repeat uploader
+ fields. This depends on a particular extractor.
+ channel_id: Id of the channel.
+ channel_url: Full URL to a channel webpage.
location: Physical location where the video was filmed.
subtitles: The available subtitles as a dictionary in the format
{tag: subformats}. "tag" is usually a language code, and
@@ -245,6 +298,10 @@ class InfoExtractor(object):
specified in the URL.
end_time: Time in seconds where the reproduction should end, as
specified in the URL.
+ chapters: A list of dictionaries, with the following entries:
+ * "start_time" - The start time of the chapter in seconds
+ * "end_time" - The end time of the chapter in seconds
+ * "title" (optional, string)
The following fields should only be used when the video belongs to some logical
chapter or section:
@@ -293,8 +350,9 @@ class InfoExtractor(object):
There must be a key "entries", which is a list, an iterable, or a PagedList
object, each element of which is a valid dictionary by this specification.
- Additionally, playlists can have "title", "description" and "id" attributes
- with the same semantics as videos (see above).
+ Additionally, playlists can have "id", "title", "description", "uploader",
+ "uploader_id", "uploader_url", "duration" attributes with the same semantics
+ as videos (see above).
_type "multi_video" indicates that there are multiple videos that
@@ -321,22 +379,37 @@ class InfoExtractor(object):
title, description etc.
- Subclasses of this one should re-define the _real_initialize() and
- _real_extract() methods and define a _VALID_URL regexp.
- Probably, they should also be added to the list of extractors.
+ A subclass of InfoExtractor must be defined to handle each specific site (or
+ several sites). Such a concrete subclass should be added to the list of
+ extractors. It should also:
+ * define its _VALID_URL attribute as a regexp, or a Sequence of alternative
+ regexps (but see below)
+ * re-define the _real_extract() method
+ * optionally re-define the _real_initialize() method.
+
+ An extractor subclass may also override suitable() if necessary, but the
+ function signature must be preserved and the function must import everything
+ it needs (except other extractors), so that lazy_extractors works correctly.
+ If the subclass's suitable() and _real_extract() functions avoid using
+ _VALID_URL, the subclass need not set that class attribute.
+
+ An abstract subclass of InfoExtractor may be used to simplify implementation
+ within an extractor module; it should not be added to the list of extractors.
_GEO_BYPASS attribute may be set to False in order to disable
geo restriction bypass mechanisms for a particular extractor.
Though it won't disable explicit geo restriction bypass based on
- country code provided with geo_bypass_country. (experimental)
+ country code provided with geo_bypass_country.
_GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
countries for this extractor. One of these countries will be used by
geo restriction bypass mechanism right away in order to bypass
- geo restriction, of course, if the mechanism is not disabled. (experimental)
+ geo restriction, of course, if the mechanism is not disabled.
- NB: both these geo attributes are experimental and may change in future
- or be completely removed.
+ _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
+ IP blocks in CIDR notation for this extractor. One of these IP blocks
+ will be used by geo restriction bypass mechanism similarly
+ to _GEO_COUNTRIES.
Finally, the _WORKING attribute should be set to False for broken IEs
in order to warn the users and skip the tests.
@@ -347,7 +420,10 @@ class InfoExtractor(object):
_x_forwarded_for_ip = None
_GEO_BYPASS = True
_GEO_COUNTRIES = None
+ _GEO_IP_BLOCKS = None
_WORKING = True
+ # supply this in public subclasses: used in supported sites list, etc
+ # IE_DESC = 'short description of IE'
def __init__(self, downloader=None):
"""Constructor. Receives an optional downloader."""
@@ -356,23 +432,34 @@ class InfoExtractor(object):
self.set_downloader(downloader)
@classmethod
- def suitable(cls, url):
- """Receives a URL and returns True if suitable for this IE."""
-
+ def __match_valid_url(cls, url):
# This does not use has/getattr intentionally - we want to know whether
- # we have cached the regexp for *this* class, whereas getattr would also
- # match the superclass
+ # we have cached the regexp for cls, whereas getattr would also
+ # match its superclass
if '_VALID_URL_RE' not in cls.__dict__:
- cls._VALID_URL_RE = re.compile(cls._VALID_URL)
- return cls._VALID_URL_RE.match(url) is not None
+ # _VALID_URL can now be a list/tuple of patterns
+ cls._VALID_URL_RE = tuple(map(re.compile, variadic(cls._VALID_URL)))
+ # 20% faster than next(filter(None, (p.match(url) for p in cls._VALID_URL_RE)), None) in 2.7
+ for p in cls._VALID_URL_RE:
+ p = p.match(url)
+ if p:
+ return p
+
+ # The public alias can safely be overridden, as in some back-ports
+ _match_valid_url = __match_valid_url
+
+ @classmethod
+ def suitable(cls, url):
+ """Receives a URL and returns True if suitable for this IE."""
+ # This function must import everything it needs (except other extractors),
+ # so that lazy_extractors works correctly
+ return cls.__match_valid_url(url) is not None
@classmethod
def _match_id(cls, url):
- if '_VALID_URL_RE' not in cls.__dict__:
- cls._VALID_URL_RE = re.compile(cls._VALID_URL)
- m = cls._VALID_URL_RE.match(url)
+ m = cls.__match_valid_url(url)
assert m
- return m.group('id')
+ return compat_str(m.group('id'))
@classmethod
def working(cls):
@@ -381,12 +468,15 @@ class InfoExtractor(object):
def initialize(self):
"""Initializes an instance (authentication, etc)."""
- self._initialize_geo_bypass(self._GEO_COUNTRIES)
+ self._initialize_geo_bypass({
+ 'countries': self._GEO_COUNTRIES,
+ 'ip_blocks': self._GEO_IP_BLOCKS,
+ })
if not self._ready:
self._real_initialize()
self._ready = True
- def _initialize_geo_bypass(self, countries):
+ def _initialize_geo_bypass(self, geo_bypass_context):
"""
Initialize geo restriction bypass mechanism.
@@ -397,28 +487,82 @@ class InfoExtractor(object):
HTTP requests.
This method will be used for initial geo bypass mechanism initialization
- during the instance initialization with _GEO_COUNTRIES.
+ during the instance initialization with _GEO_COUNTRIES and
+ _GEO_IP_BLOCKS.
- You may also manually call it from extractor's code if geo countries
+ You may also manually call it from extractor's code if geo bypass
information is not available beforehand (e.g. obtained during
- extraction) or due to some another reason.
+ extraction) or due to some other reason. In this case you should pass
+ this information in geo bypass context passed as first argument. It may
+ contain following fields:
+
+ countries: List of geo unrestricted countries (similar
+ to _GEO_COUNTRIES)
+ ip_blocks: List of geo unrestricted IP blocks in CIDR notation
+ (similar to _GEO_IP_BLOCKS)
+
"""
if not self._x_forwarded_for_ip:
- country_code = self._downloader.params.get('geo_bypass_country', None)
- # If there is no explicit country for geo bypass specified and
- # the extractor is known to be geo restricted let's fake IP
- # as X-Forwarded-For right away.
- if (not country_code and
- self._GEO_BYPASS and
- self._downloader.params.get('geo_bypass', True) and
- countries):
- country_code = random.choice(countries)
- if country_code:
- self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
- if self._downloader.params.get('verbose', False):
- self._downloader.to_stdout(
+
+ # Geo bypass mechanism is explicitly disabled by user
+ if not self.get_param('geo_bypass', True):
+ return
+
+ if not geo_bypass_context:
+ geo_bypass_context = {}
+
+ # Backward compatibility: previously _initialize_geo_bypass
+ # expected a list of countries, some 3rd party code may still use
+ # it this way
+ if isinstance(geo_bypass_context, (list, tuple)):
+ geo_bypass_context = {
+ 'countries': geo_bypass_context,
+ }
+
+ # The whole point of geo bypass mechanism is to fake IP
+ # as X-Forwarded-For HTTP header based on some IP block or
+ # country code.
+
+ # Path 1: bypassing based on IP block in CIDR notation
+
+ # Explicit IP block specified by user, use it right away
+ # regardless of whether extractor is geo bypassable or not
+ ip_block = self.get_param('geo_bypass_ip_block', None)
+
+ # Otherwise use random IP block from geo bypass context but only
+ # if extractor is known as geo bypassable
+ if not ip_block:
+ ip_blocks = geo_bypass_context.get('ip_blocks')
+ if self._GEO_BYPASS and ip_blocks:
+ ip_block = random.choice(ip_blocks)
+
+ if ip_block:
+ self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
+ if self.get_param('verbose', False):
+ self.to_screen(
+ '[debug] Using fake IP %s as X-Forwarded-For.'
+ % self._x_forwarded_for_ip)
+ return
+
+ # Path 2: bypassing based on country code
+
+ # Explicit country code specified by user, use it right away
+ # regardless of whether extractor is geo bypassable or not
+ country = self.get_param('geo_bypass_country', None)
+
+ # Otherwise use random country code from geo bypass context but
+ # only if extractor is known as geo bypassable
+ if not country:
+ countries = geo_bypass_context.get('countries')
+ if self._GEO_BYPASS and countries:
+ country = random.choice(countries)
+
+ if country:
+ self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
+ if self.get_param('verbose', False):
+ self.to_screen(
'[debug] Using fake IP %s (%s) as X-Forwarded-For.'
- % (self._x_forwarded_for_ip, country_code.upper()))
+ % (self._x_forwarded_for_ip, country.upper()))
def extract(self, url):
"""Extracts URL information and returns it in list of dicts."""
@@ -442,11 +586,11 @@ class InfoExtractor(object):
raise ExtractorError('An extractor error has occurred.', cause=e)
def __maybe_fake_ip_and_retry(self, countries):
- if (not self._downloader.params.get('geo_bypass_country', None) and
- self._GEO_BYPASS and
- self._downloader.params.get('geo_bypass', True) and
- not self._x_forwarded_for_ip and
- countries):
+ if (not self.get_param('geo_bypass_country', None)
+ and self._GEO_BYPASS
+ and self.get_param('geo_bypass', True)
+ and not self._x_forwarded_for_ip
+ and countries):
country_code = random.choice(countries)
self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
if self._x_forwarded_for_ip:
@@ -460,6 +604,14 @@ class InfoExtractor(object):
"""Sets the downloader for this IE."""
self._downloader = downloader
+ @property
+ def cache(self):
+ return self._downloader.cache
+
+ @property
+ def cookiejar(self):
+ return self._downloader.cookiejar
+
def _real_initialize(self):
"""Real initialization process. Redefine in subclasses."""
pass
@@ -477,8 +629,26 @@ class InfoExtractor(object):
def IE_NAME(self):
return compat_str(type(self).__name__[:-2])
- def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
- """ Returns the response handle """
+ @staticmethod
+ def __can_accept_status_code(err, expected_status):
+ assert isinstance(err, compat_urllib_error.HTTPError)
+ if expected_status is None:
+ return False
+ if isinstance(expected_status, compat_integer_types):
+ return err.code == expected_status
+ elif isinstance(expected_status, (list, tuple)):
+ return err.code in expected_status
+ elif callable(expected_status):
+ return expected_status(err.code) is True
+ else:
+ assert False
+
+ def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
+ """
+ Return the response handle.
+
+ See _download_webpage docstring for arguments specification.
+ """
if note is None:
self.report_download_webpage(video_id)
elif note is not False:
@@ -486,6 +656,16 @@ class InfoExtractor(object):
self.to_screen('%s' % (note,))
else:
self.to_screen('%s: %s' % (video_id, note))
+
+ # Some sites check X-Forwarded-For HTTP header in order to figure out
+ # the origin of the client behind proxy. This allows bypassing geo
+ # restriction by faking this header's value to IP that belongs to some
+ # geo unrestricted country. We will do so once we encounter any
+ # geo restriction error.
+ if self._x_forwarded_for_ip:
+ if 'X-Forwarded-For' not in headers:
+ headers['X-Forwarded-For'] = self._x_forwarded_for_ip
+
if isinstance(url_or_request, compat_urllib_request.Request):
url_or_request = update_Request(
url_or_request, data=data, headers=headers, query=query)
@@ -494,9 +674,21 @@ class InfoExtractor(object):
url_or_request = update_url_query(url_or_request, query)
if data is not None or headers:
url_or_request = sanitized_Request(url_or_request, data, headers)
+ exceptions = [compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error]
+ if hasattr(ssl, 'CertificateError'):
+ exceptions.append(ssl.CertificateError)
try:
return self._downloader.urlopen(url_or_request)
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
+ except tuple(exceptions) as err:
+ if isinstance(err, compat_urllib_error.HTTPError):
+ if self.__can_accept_status_code(err, expected_status):
+ # Retain reference to error to prevent file object from
+ # being closed before it can be read. Works around the
+ # effects of <https://bugs.python.org/issue15002>
+ # introduced in Python 3.4.1.
+ err.fp._error = err
+ return err.fp
+
if errnote is False:
return False
if errnote is None:
@@ -506,25 +698,20 @@ class InfoExtractor(object):
if fatal:
raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
else:
- self._downloader.report_warning(errmsg)
+ self.report_warning(errmsg)
return False
- def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}):
- """ Returns a tuple (page content as string, URL handle) """
+ def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
+ """
+ Return a tuple (page content as string, URL handle).
+
+ See _download_webpage docstring for arguments specification.
+ """
# Strip hashes from the URL (#1038)
if isinstance(url_or_request, (compat_str, str)):
url_or_request = url_or_request.partition('#')[0]
- # Some sites check X-Forwarded-For HTTP header in order to figure out
- # the origin of the client behind proxy. This allows bypassing geo
- # restriction by faking this header's value to IP that belongs to some
- # geo unrestricted country. We will do so once we encounter any
- # geo restriction error.
- if self._x_forwarded_for_ip:
- if 'X-Forwarded-For' not in headers:
- headers['X-Forwarded-For'] = self._x_forwarded_for_ip
-
- urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query)
+ urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
if urlh is False:
assert not fatal
return False
@@ -550,8 +737,8 @@ class InfoExtractor(object):
def __check_blocked(self, content):
first_block = content[:512]
- if ('<title>Access to this site is blocked</title>' in content and
- 'Websense' in first_block):
+ if ('<title>Access to this site is blocked</title>' in content
+ and 'Websense' in first_block):
msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
blocked_iframe = self._html_search_regex(
r'<iframe src="([^"]+)"', content,
@@ -569,8 +756,8 @@ class InfoExtractor(object):
if block_msg:
msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
raise ExtractorError(msg, expected=True)
- if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content and
- 'blocklist.rkn.gov.ru' in content):
+ if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
+ and 'blocklist.rkn.gov.ru' in content):
raise ExtractorError(
'Access to this webpage has been blocked by decision of the Russian government. '
'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
@@ -583,20 +770,12 @@ class InfoExtractor(object):
webpage_bytes = prefix + webpage_bytes
if not encoding:
encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
- if self._downloader.params.get('dump_intermediate_pages', False):
- try:
- url = url_or_request.get_full_url()
- except AttributeError:
- url = url_or_request
- self.to_screen('Dumping request to ' + url)
+ if self.get_param('dump_intermediate_pages', False):
+ self.to_screen('Dumping request to ' + urlh.geturl())
dump = base64.b64encode(webpage_bytes).decode('ascii')
- self._downloader.to_screen(dump)
- if self._downloader.params.get('write_pages', False):
- try:
- url = url_or_request.get_full_url()
- except AttributeError:
- url = url_or_request
- basen = '%s_%s' % (video_id, url)
+ self.to_screen(dump)
+ if self.get_param('write_pages', False):
+ basen = '%s_%s' % (video_id, urlh.geturl())
if len(basen) > 240:
h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
basen = basen[:240 - len(h)] + h
@@ -621,13 +800,52 @@ class InfoExtractor(object):
return content
- def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers={}, query={}):
- """ Returns the data of the page as a string """
+ def _download_webpage(
+ self, url_or_request, video_id, note=None, errnote=None,
+ fatal=True, tries=1, timeout=5, encoding=None, data=None,
+ headers={}, query={}, expected_status=None):
+ """
+ Return the data of the page as a string.
+
+ Arguments:
+ url_or_request -- plain text URL as a string or
+ a compat_urllib_request.Requestobject
+ video_id -- Video/playlist/item identifier (string)
+
+ Keyword arguments:
+ note -- note printed before downloading (string)
+ errnote -- note printed in case of an error (string)
+ fatal -- flag denoting whether error should be considered fatal,
+ i.e. whether it should cause ExtractionError to be raised,
+ otherwise a warning will be reported and extraction continued
+ tries -- number of tries
+ timeout -- sleep interval between tries
+ encoding -- encoding for a page content decoding, guessed automatically
+ when not explicitly specified
+ data -- POST data (bytes)
+ headers -- HTTP headers (dict)
+ query -- URL query (dict)
+ expected_status -- allows to accept failed HTTP requests (non 2xx
+ status code) by explicitly specifying a set of accepted status
+ codes. Can be any of the following entities:
+ - an integer type specifying an exact failed status code to
+ accept
+ - a list or a tuple of integer types specifying a list of
+ failed status codes to accept
+ - a callable accepting an actual failed status code and
+ returning True if it should be accepted
+ Note that this argument does not affect success status codes (2xx)
+ which are always accepted.
+ """
+
success = False
try_count = 0
while success is False:
try:
- res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data, headers=headers, query=query)
+ res = self._download_webpage_handle(
+ url_or_request, video_id, note, errnote, fatal,
+ encoding=encoding, data=data, headers=headers, query=query,
+ expected_status=expected_status)
success = True
except compat_http_client.IncompleteRead as e:
try_count += 1
@@ -640,30 +858,93 @@ class InfoExtractor(object):
content, _ = res
return content
- def _download_xml(self, url_or_request, video_id,
- note='Downloading XML', errnote='Unable to download XML',
- transform_source=None, fatal=True, encoding=None, data=None, headers={}, query={}):
- """Return the xml as an xml.etree.ElementTree.Element"""
- xml_string = self._download_webpage(
- url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding, data=data, headers=headers, query=query)
- if xml_string is False:
- return xml_string
+ def _download_xml_handle(
+ self, url_or_request, video_id, note='Downloading XML',
+ errnote='Unable to download XML', transform_source=None,
+ fatal=True, encoding=None, data=None, headers={}, query={},
+ expected_status=None):
+ """
+ Return a tuple (xml as an compat_etree_Element, URL handle).
+
+ See _download_webpage docstring for arguments specification.
+ """
+ res = self._download_webpage_handle(
+ url_or_request, video_id, note, errnote, fatal=fatal,
+ encoding=encoding, data=data, headers=headers, query=query,
+ expected_status=expected_status)
+ if res is False:
+ return res
+ xml_string, urlh = res
+ return self._parse_xml(
+ xml_string, video_id, transform_source=transform_source,
+ fatal=fatal), urlh
+
+ def _download_xml(
+ self, url_or_request, video_id,
+ note='Downloading XML', errnote='Unable to download XML',
+ transform_source=None, fatal=True, encoding=None,
+ data=None, headers={}, query={}, expected_status=None):
+ """
+ Return the xml as an compat_etree_Element.
+
+ See _download_webpage docstring for arguments specification.
+ """
+ res = self._download_xml_handle(
+ url_or_request, video_id, note=note, errnote=errnote,
+ transform_source=transform_source, fatal=fatal, encoding=encoding,
+ data=data, headers=headers, query=query,
+ expected_status=expected_status)
+ return res if res is False else res[0]
+
+ def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
if transform_source:
xml_string = transform_source(xml_string)
- return compat_etree_fromstring(xml_string.encode('utf-8'))
-
- def _download_json(self, url_or_request, video_id,
- note='Downloading JSON metadata',
- errnote='Unable to download JSON metadata',
- transform_source=None,
- fatal=True, encoding=None, data=None, headers={}, query={}):
- json_string = self._download_webpage(
+ try:
+ return compat_etree_fromstring(xml_string.encode('utf-8'))
+ except compat_xml_parse_error as ve:
+ errmsg = '%s: Failed to parse XML ' % video_id
+ if fatal:
+ raise ExtractorError(errmsg, cause=ve)
+ else:
+ self.report_warning(errmsg + str(ve))
+
+ def _download_json_handle(
+ self, url_or_request, video_id, note='Downloading JSON metadata',
+ errnote='Unable to download JSON metadata', transform_source=None,
+ fatal=True, encoding=None, data=None, headers={}, query={},
+ expected_status=None):
+ """
+ Return a tuple (JSON object, URL handle).
+
+ See _download_webpage docstring for arguments specification.
+ """
+ res = self._download_webpage_handle(
url_or_request, video_id, note, errnote, fatal=fatal,
- encoding=encoding, data=data, headers=headers, query=query)
- if (not fatal) and json_string is False:
- return None
+ encoding=encoding, data=data, headers=headers, query=query,
+ expected_status=expected_status)
+ if res is False:
+ return res
+ json_string, urlh = res
return self._parse_json(
- json_string, video_id, transform_source=transform_source, fatal=fatal)
+ json_string, video_id, transform_source=transform_source,
+ fatal=fatal), urlh
+
+ def _download_json(
+ self, url_or_request, video_id, note='Downloading JSON metadata',
+ errnote='Unable to download JSON metadata', transform_source=None,
+ fatal=True, encoding=None, data=None, headers={}, query={},
+ expected_status=None):
+ """
+ Return the JSON object as a dict.
+
+ See _download_webpage docstring for arguments specification.
+ """
+ res = self._download_json_handle(
+ url_or_request, video_id, note=note, errnote=errnote,
+ transform_source=transform_source, fatal=fatal, encoding=encoding,
+ data=data, headers=headers, query=query,
+ expected_status=expected_status)
+ return res if res is False else res[0]
def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
if transform_source:
@@ -677,14 +958,37 @@ class InfoExtractor(object):
else:
self.report_warning(errmsg + str(ve))
- def report_warning(self, msg, video_id=None):
+ def __ie_msg(self, *msg):
+ return '[{0}] {1}'.format(self.IE_NAME, ''.join(msg))
+
+ # msg, video_id=None, *args, only_once=False, **kwargs
+ def report_warning(self, msg, *args, **kwargs):
+ if len(args) > 0:
+ video_id = args[0]
+ args = args[1:]
+ else:
+ video_id = kwargs.pop('video_id', None)
idstr = '' if video_id is None else '%s: ' % video_id
self._downloader.report_warning(
- '[%s] %s%s' % (self.IE_NAME, idstr, msg))
+ self.__ie_msg(idstr, msg), *args, **kwargs)
def to_screen(self, msg):
"""Print msg to screen, prefixing it with '[ie_name]'"""
- self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
+ self._downloader.to_screen(self.__ie_msg(msg))
+
+ def write_debug(self, msg, only_once=False):
+ '''Log debug message or Print message to stderr'''
+ self._downloader.write_debug(self.__ie_msg(msg), only_once=only_once)
+
+ # name, default=None, *args, **kwargs
+ def get_param(self, name, *args, **kwargs):
+ default, args = (args[0], args[1:]) if len(args) > 0 else (kwargs.pop('default', None), args)
+ if self._downloader:
+ return self._downloader.params.get(name, default, *args, **kwargs)
+ return default
+
+ def report_drm(self, video_id):
+ self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id)
def report_extraction(self, id_or_name):
"""Report information extraction."""
@@ -712,6 +1016,15 @@ class InfoExtractor(object):
def raise_geo_restricted(msg='This video is not available from your location due to geo restriction', countries=None):
raise GeoRestrictedError(msg, countries=countries)
+ def raise_no_formats(self, msg, expected=False, video_id=None):
+ if expected and (
+ self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
+ self.report_warning(msg, video_id)
+ elif isinstance(msg, ExtractorError):
+ raise msg
+ else:
+ raise ExtractorError(msg, expected=expected, video_id=video_id)
+
# Methods for following #608
@staticmethod
def url_result(url, ie=None, video_id=None, video_title=None):
@@ -726,12 +1039,12 @@ class InfoExtractor(object):
video_info['title'] = video_title
return video_info
- def playlist_from_matches(self, matches, video_id, video_title, getter=None, ie=None):
- urlrs = orderedSet(
+ def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
+ urls = orderedSet(
self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
for m in matches)
return self.playlist_result(
- urlrs, playlist_id=video_id, playlist_title=video_title)
+ urls, playlist_id=playlist_id, playlist_title=playlist_title)
@staticmethod
def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
@@ -761,7 +1074,7 @@ class InfoExtractor(object):
if mobj:
break
- if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
+ if not self.get_param('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
_name = '\033[0;34m%s\033[0m' % name
else:
_name = name
@@ -770,6 +1083,8 @@ class InfoExtractor(object):
if group is None:
# return the first matching group
return next(g for g in mobj.groups() if g is not None)
+ elif isinstance(group, (list, tuple)):
+ return tuple(mobj.group(g) for g in group)
else:
return mobj.group(group)
elif default is not NO_DEFAULT:
@@ -777,26 +1092,79 @@ class InfoExtractor(object):
elif fatal:
raise RegexNotFoundError('Unable to extract %s' % _name)
else:
- self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
+ self.report_warning('unable to extract %s' % _name + bug_reports_message())
return None
+ def _search_json(self, start_pattern, string, name, video_id, **kwargs):
+ """Searches string for the JSON object specified by start_pattern"""
+
+ # self, start_pattern, string, name, video_id, *, end_pattern='',
+ # contains_pattern=r'{(?s:.+)}', fatal=True, default=NO_DEFAULT
+ # NB: end_pattern is only used to reduce the size of the initial match
+ end_pattern = kwargs.pop('end_pattern', '')
+ # (?:[\s\S]) simulates (?(s):.) (eg)
+ contains_pattern = kwargs.pop('contains_pattern', r'{[\s\S]+}')
+ fatal = kwargs.pop('fatal', True)
+ default = kwargs.pop('default', NO_DEFAULT)
+
+ if default is NO_DEFAULT:
+ default, has_default = {}, False
+ else:
+ fatal, has_default = False, True
+
+ json_string = self._search_regex(
+ r'(?:{0})\s*(?P<json>{1})\s*(?:{2})'.format(
+ start_pattern, contains_pattern, end_pattern),
+ string, name, group='json', fatal=fatal, default=None if has_default else NO_DEFAULT)
+ if not json_string:
+ return default
+
+ # yt-dlp has a special JSON parser that allows trailing text.
+ # Until that arrives here, the diagnostic from the exception
+ # raised by json.loads() is used to extract the wanted text.
+ # Either way, it's a problem if a transform_source() can't
+ # handle the trailing text.
+
+ # force an exception
+ kwargs['fatal'] = True
+
+ # self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
+ for _ in range(2):
+ try:
+ # return self._parse_json(json_string, video_id, ignore_extra=True, **kwargs)
+ transform_source = kwargs.pop('transform_source', None)
+ if transform_source:
+ json_string = transform_source(json_string)
+ return self._parse_json(json_string, video_id, **compat_kwargs(kwargs))
+ except ExtractorError as e:
+ end = int_or_none(self._search_regex(r'\(char\s+(\d+)', error_to_compat_str(e), 'end', default=None))
+ if end is not None:
+ json_string = json_string[:end]
+ continue
+ msg = 'Unable to extract {0} - Failed to parse JSON'.format(name)
+ if fatal:
+ raise ExtractorError(msg, cause=e.cause, video_id=video_id)
+ elif not has_default:
+ self.report_warning(
+ '{0}: {1}'.format(msg, error_to_compat_str(e)), video_id=video_id)
+ return default
+
def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
"""
Like _search_regex, but strips HTML tags and unescapes entities.
"""
res = self._search_regex(pattern, string, name, default, fatal, flags, group)
- if res:
- return clean_html(res).strip()
- else:
- return res
+ if isinstance(res, tuple):
+ return tuple(map(clean_html, res))
+ return clean_html(res)
def _get_netrc_login_info(self, netrc_machine=None):
username = None
password = None
- netrc_machine = netrc_machine or self._NETRC_MACHINE
- if self._downloader.params.get('usenetrc', False):
+ if self.get_param('usenetrc', False):
try:
+ netrc_machine = netrc_machine or self._NETRC_MACHINE
info = netrc.netrc().authenticators(netrc_machine)
if info is not None:
username = info[0]
@@ -804,8 +1172,8 @@ class InfoExtractor(object):
else:
raise netrc.NetrcParseError(
'No authenticators for %s' % netrc_machine)
- except (IOError, netrc.NetrcParseError) as err:
- self._downloader.report_warning(
+ except (AttributeError, IOError, netrc.NetrcParseError) as err:
+ self.report_warning(
'parsing .netrc: %s' % error_to_compat_str(err))
return username, password
@@ -842,18 +1210,18 @@ class InfoExtractor(object):
"""
if self._downloader is None:
return None
- downloader_params = self._downloader.params
- if downloader_params.get('twofactor') is not None:
- return downloader_params['twofactor']
+ twofactor = self.get_param('twofactor')
+ if twofactor is not None:
+ return twofactor
return compat_getpass('Type %s and press [Return]: ' % note)
# Helper functions for extracting OpenGraph info
@staticmethod
def _og_regexes(prop):
- content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
- property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
+ content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?)(?=\s|/?>))'
+ property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)'
% {'prop': re.escape(prop)})
template = r'<meta[^>]+?%s[^>]+?%s'
return [
@@ -936,7 +1304,8 @@ class InfoExtractor(object):
def _family_friendly_search(self, html):
# See http://schema.org/VideoObject
- family_friendly = self._html_search_meta('isFamilyFriendly', html)
+ family_friendly = self._html_search_meta(
+ 'isFamilyFriendly', html, default=None)
if not family_friendly:
return None
@@ -954,17 +1323,33 @@ class InfoExtractor(object):
'twitter card player')
def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
- json_ld = self._search_regex(
- r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
- html, 'JSON-LD', group='json_ld', **kwargs)
+ json_ld_list = list(re.finditer(JSON_LD_RE, html))
default = kwargs.get('default', NO_DEFAULT)
- if not json_ld:
- return default if default is not NO_DEFAULT else {}
# JSON-LD may be malformed and thus `fatal` should be respected.
# At the same time `default` may be passed that assumes `fatal=False`
# for _search_regex. Let's simulate the same behavior here as well.
fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
- return self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
+ json_ld = []
+ for mobj in json_ld_list:
+ json_ld_item = self._parse_json(
+ mobj.group('json_ld'), video_id, fatal=fatal)
+ if not json_ld_item:
+ continue
+ if isinstance(json_ld_item, dict):
+ json_ld.append(json_ld_item)
+ elif isinstance(json_ld_item, (list, tuple)):
+ json_ld.extend(json_ld_item)
+ if json_ld:
+ json_ld = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
+ if json_ld:
+ return json_ld
+ if default is not NO_DEFAULT:
+ return default
+ elif fatal:
+ raise RegexNotFoundError('Unable to extract JSON-LD')
+ else:
+ self.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
+ return {}
def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
if isinstance(json_ld, compat_str):
@@ -976,45 +1361,168 @@ class InfoExtractor(object):
return info
if isinstance(json_ld, dict):
json_ld = [json_ld]
+
+ INTERACTION_TYPE_MAP = {
+ 'CommentAction': 'comment',
+ 'AgreeAction': 'like',
+ 'DisagreeAction': 'dislike',
+ 'LikeAction': 'like',
+ 'DislikeAction': 'dislike',
+ 'ListenAction': 'view',
+ 'WatchAction': 'view',
+ 'ViewAction': 'view',
+ }
+
+ def extract_interaction_type(e):
+ interaction_type = e.get('interactionType')
+ if isinstance(interaction_type, dict):
+ interaction_type = interaction_type.get('@type')
+ return str_or_none(interaction_type)
+
+ def extract_interaction_statistic(e):
+ interaction_statistic = e.get('interactionStatistic')
+ if isinstance(interaction_statistic, dict):
+ interaction_statistic = [interaction_statistic]
+ if not isinstance(interaction_statistic, list):
+ return
+ for is_e in interaction_statistic:
+ if not isinstance(is_e, dict):
+ continue
+ if is_e.get('@type') != 'InteractionCounter':
+ continue
+ interaction_type = extract_interaction_type(is_e)
+ if not interaction_type:
+ continue
+ # For interaction count some sites provide string instead of
+ # an integer (as per spec) with non digit characters (e.g. ",")
+ # so extracting count with more relaxed str_to_int
+ interaction_count = str_to_int(is_e.get('userInteractionCount'))
+ if interaction_count is None:
+ continue
+ count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
+ if not count_kind:
+ continue
+ count_key = '%s_count' % count_kind
+ if info.get(count_key) is not None:
+ continue
+ info[count_key] = interaction_count
+
+ def extract_video_object(e):
+ assert e['@type'] == 'VideoObject'
+ author = e.get('author')
+ info.update({
+ 'url': url_or_none(e.get('contentUrl')),
+ 'title': unescapeHTML(e.get('name')),
+ 'description': unescapeHTML(e.get('description')),
+ 'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')),
+ 'duration': parse_duration(e.get('duration')),
+ 'timestamp': unified_timestamp(e.get('uploadDate')),
+ # author can be an instance of 'Organization' or 'Person' types.
+ # both types can have 'name' property(inherited from 'Thing' type). [1]
+ # however some websites are using 'Text' type instead.
+ # 1. https://schema.org/VideoObject
+ 'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, compat_str) else None,
+ 'filesize': float_or_none(e.get('contentSize')),
+ 'tbr': int_or_none(e.get('bitrate')),
+ 'width': int_or_none(e.get('width')),
+ 'height': int_or_none(e.get('height')),
+ 'view_count': int_or_none(e.get('interactionCount')),
+ })
+ extract_interaction_statistic(e)
+
for e in json_ld:
- if e.get('@context') == 'http://schema.org':
+ if '@context' in e:
item_type = e.get('@type')
if expected_type is not None and expected_type != item_type:
- return info
- if item_type == 'TVEpisode':
+ continue
+ if item_type in ('TVEpisode', 'Episode'):
+ episode_name = unescapeHTML(e.get('name'))
info.update({
- 'episode': unescapeHTML(e.get('name')),
+ 'episode': episode_name,
'episode_number': int_or_none(e.get('episodeNumber')),
'description': unescapeHTML(e.get('description')),
})
+ if not info.get('title') and episode_name:
+ info['title'] = episode_name
part_of_season = e.get('partOfSeason')
- if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason':
- info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
+ if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
+ info.update({
+ 'season': unescapeHTML(part_of_season.get('name')),
+ 'season_number': int_or_none(part_of_season.get('seasonNumber')),
+ })
part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
- if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries':
+ if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
info['series'] = unescapeHTML(part_of_series.get('name'))
- elif item_type == 'Article':
+ elif item_type == 'Movie':
+ info.update({
+ 'title': unescapeHTML(e.get('name')),
+ 'description': unescapeHTML(e.get('description')),
+ 'duration': parse_duration(e.get('duration')),
+ 'timestamp': unified_timestamp(e.get('dateCreated')),
+ })
+ elif item_type in ('Article', 'NewsArticle'):
info.update({
'timestamp': parse_iso8601(e.get('datePublished')),
'title': unescapeHTML(e.get('headline')),
'description': unescapeHTML(e.get('articleBody')),
})
elif item_type == 'VideoObject':
- info.update({
- 'url': e.get('contentUrl'),
- 'title': unescapeHTML(e.get('name')),
- 'description': unescapeHTML(e.get('description')),
- 'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'),
- 'duration': parse_duration(e.get('duration')),
- 'timestamp': unified_timestamp(e.get('uploadDate')),
- 'filesize': float_or_none(e.get('contentSize')),
- 'tbr': int_or_none(e.get('bitrate')),
- 'width': int_or_none(e.get('width')),
- 'height': int_or_none(e.get('height')),
- })
- break
+ extract_video_object(e)
+ if expected_type is None:
+ continue
+ else:
+ break
+ video = e.get('video')
+ if isinstance(video, dict) and video.get('@type') == 'VideoObject':
+ extract_video_object(video)
+ if expected_type is None:
+ continue
+ else:
+ break
return dict((k, v) for k, v in info.items() if v is not None)
+ def _search_nextjs_data(self, webpage, video_id, **kw):
+ # ..., *, transform_source=None, fatal=True, default=NO_DEFAULT
+
+ # TODO: remove this backward compat
+ default = kw.get('default', NO_DEFAULT)
+ if default == '{}':
+ kw['default'] = {}
+ kw = compat_kwargs(kw)
+
+ return self._search_json(
+ r'''<script\s[^>]*?\bid\s*=\s*('|")__NEXT_DATA__\1[^>]*>''',
+ webpage, 'next.js data', video_id, end_pattern='</script>',
+ **kw)
+
+ def _search_nuxt_data(self, webpage, video_id, *args, **kwargs):
+ """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function"""
+
+ # self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)
+ context_name = args[0] if len(args) > 0 else kwargs.get('context_name', '__NUXT__')
+ fatal = kwargs.get('fatal', True)
+ traverse = kwargs.get('traverse', ('data', 0))
+
+ re_ctx = re.escape(context_name)
+
+ FUNCTION_RE = (r'\(\s*function\s*\((?P<arg_keys>[\s\S]*?)\)\s*\{\s*'
+ r'return\s+(?P<js>\{[\s\S]*?})\s*;?\s*}\s*\((?P<arg_vals>[\s\S]*?)\)')
+
+ js, arg_keys, arg_vals = self._search_regex(
+ (p.format(re_ctx, FUNCTION_RE) for p in
+ (r'<script>\s*window\s*\.\s*{0}\s*=\s*{1}\s*\)\s*;?\s*</script>',
+ r'{0}\s*\([\s\S]*?{1}')),
+ webpage, context_name, group=('js', 'arg_keys', 'arg_vals'),
+ default=NO_DEFAULT if fatal else (None, None, None))
+ if js is None:
+ return {}
+
+ args = dict(zip(arg_keys.split(','), map(json.dumps, self._parse_json(
+ '[{0}]'.format(arg_vals), video_id, transform_source=js_to_json, fatal=fatal) or ())))
+
+ ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal)
+ return traverse_obj(ret, traverse) or {}
+
@staticmethod
def _hidden_inputs(html):
html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
@@ -1071,7 +1579,7 @@ class InfoExtractor(object):
if f.get('vcodec') == 'none': # audio only
preference -= 50
- if self._downloader.params.get('prefer_free_formats'):
+ if self.get_param('prefer_free_formats'):
ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
else:
ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
@@ -1083,7 +1591,7 @@ class InfoExtractor(object):
else:
if f.get('acodec') == 'none': # video only
preference -= 40
- if self._downloader.params.get('prefer_free_formats'):
+ if self.get_param('prefer_free_formats'):
ORDER = ['flv', 'mp4', 'webm']
else:
ORDER = ['webm', 'flv', 'mp4']
@@ -1140,17 +1648,16 @@ class InfoExtractor(object):
self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
return True
except ExtractorError as e:
- if isinstance(e.cause, compat_urllib_error.URLError):
- self.to_screen(
- '%s: %s URL is invalid, skipping' % (video_id, item))
- return False
- raise
+ self.to_screen(
+ '%s: %s URL is invalid, skipping: %s'
+ % (video_id, item, error_to_compat_str(e.cause)))
+ return False
def http_scheme(self):
""" Either "http:" or "https:", depending on the user's preferences """
return (
'http:'
- if self._downloader.params.get('prefer_insecure', False)
+ if self.get_param('prefer_insecure', False)
else 'https:')
def _proto_relative_url(self, url, scheme=None):
@@ -1172,14 +1679,14 @@ class InfoExtractor(object):
def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
transform_source=lambda s: fix_xml_ampersands(s).strip(),
- fatal=True, m3u8_id=None):
+ fatal=True, m3u8_id=None, data=None, headers={}, query={}):
manifest = self._download_xml(
manifest_url, video_id, 'Downloading f4m manifest',
'Unable to download f4m manifest',
# Some manifests may be malformed, e.g. prosiebensat1 generated manifests
- # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
+ # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
transform_source=transform_source,
- fatal=fatal)
+ fatal=fatal, data=data, headers=headers, query=query)
if manifest is False:
return []
@@ -1191,6 +1698,9 @@ class InfoExtractor(object):
def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
transform_source=lambda s: fix_xml_ampersands(s).strip(),
fatal=True, m3u8_id=None):
+ if not isinstance(manifest, compat_etree_Element) and not fatal:
+ return []
+
# currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
if akamai_pv is not None and ';' in akamai_pv.text:
@@ -1205,15 +1715,12 @@ class InfoExtractor(object):
manifest_version = '2.0'
media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
# Remove unsupported DRM protected media from final formats
- # rendition (see https://github.com/rg3/youtube-dl/issues/8573).
+ # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
media_nodes = remove_encrypted_media(media_nodes)
if not media_nodes:
return formats
- base_url = xpath_text(
- manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
- 'base URL', default=None)
- if base_url:
- base_url = base_url.strip()
+
+ manifest_base_url = get_base_url(manifest)
bootstrap_info = xpath_element(
manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
@@ -1245,7 +1752,7 @@ class InfoExtractor(object):
continue
manifest_url = (
media_url if media_url.startswith('http://') or media_url.startswith('https://')
- else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
+ else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
# If media_url is itself a f4m manifest do the recursive extraction
# since bitrates in parent manifest (this one) and media_url manifest
# may differ leading to inability to resolve the format by requested
@@ -1280,6 +1787,7 @@ class InfoExtractor(object):
'url': manifest_url,
'manifest_url': manifest_url,
'ext': 'flv' if bootstrap_info is not None else None,
+ 'protocol': 'f4m',
'tbr': tbr,
'width': width,
'height': height,
@@ -1299,44 +1807,65 @@ class InfoExtractor(object):
'format_note': 'Quality selection URL',
}
+ def _report_ignoring_subs(self, name):
+ self.report_warning(bug_reports_message(
+ 'Ignoring subtitle tracks found in the {0} manifest; '
+ 'if any subtitle tracks are missing,'.format(name)
+ ), only_once=True)
+
def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
entry_protocol='m3u8', preference=None,
m3u8_id=None, note=None, errnote=None,
- fatal=True, live=False):
-
+ fatal=True, live=False, data=None, headers={},
+ query={}):
res = self._download_webpage_handle(
m3u8_url, video_id,
note=note or 'Downloading m3u8 information',
errnote=errnote or 'Failed to download m3u8 information',
- fatal=fatal)
+ fatal=fatal, data=data, headers=headers, query=query)
+
if res is False:
return []
+
m3u8_doc, urlh = res
m3u8_url = urlh.geturl()
+ return self._parse_m3u8_formats(
+ m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
+ preference=preference, m3u8_id=m3u8_id, live=live)
+
+ def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None,
+ entry_protocol='m3u8', preference=None,
+ m3u8_id=None, live=False):
if '#EXT-X-FAXS-CM:' in m3u8_doc: # Adobe Flash Access
return []
- formats = [self._m3u8_meta_format(m3u8_url, ext, preference, m3u8_id)]
+ if re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc): # Apple FairPlay
+ return []
+
+ formats = []
format_url = lambda u: (
u
if re.match(r'^https?://', u)
else compat_urlparse.urljoin(m3u8_url, u))
- # We should try extracting formats only from master playlists [1], i.e.
- # playlists that describe available qualities. On the other hand media
- # playlists [2] should be returned as is since they contain just the media
- # without qualities renditions.
+ # References:
+ # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
+ # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
+ # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
+
+ # We should try extracting formats only from master playlists [1, 4.3.4],
+ # i.e. playlists that describe available qualities. On the other hand
+ # media playlists [1, 4.3.3] should be returned as is since they contain
+ # just the media without qualities renditions.
# Fortunately, master playlist can be easily distinguished from media
- # playlist based on particular tags availability. As of [1, 2] master
- # playlist tags MUST NOT appear in a media playist and vice versa.
- # As of [3] #EXT-X-TARGETDURATION tag is REQUIRED for every media playlist
- # and MUST NOT appear in master playlist thus we can clearly detect media
- # playlist with this criterion.
- # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.4
- # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3
- # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1
+ # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
+ # master playlist tags MUST NOT appear in a media playlist and vice versa.
+ # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
+ # media playlist and MUST NOT appear in master playlist thus we can
+ # clearly detect media playlist with this criterion.
+
if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is
return [{
'url': m3u8_url,
@@ -1345,52 +1874,77 @@ class InfoExtractor(object):
'protocol': entry_protocol,
'preference': preference,
}]
- audio_in_video_stream = {}
- last_info = {}
- last_media = {}
+
+ groups = {}
+ last_stream_inf = {}
+
+ def extract_media(x_media_line):
+ media = parse_m3u8_attributes(x_media_line)
+ # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
+ media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
+ if not (media_type and group_id and name):
+ return
+ groups.setdefault(group_id, []).append(media)
+ if media_type not in ('VIDEO', 'AUDIO'):
+ return
+ media_url = media.get('URI')
+ if media_url:
+ format_id = []
+ for v in (m3u8_id, group_id, name):
+ if v:
+ format_id.append(v)
+ f = {
+ 'format_id': '-'.join(format_id),
+ 'url': format_url(media_url),
+ 'manifest_url': m3u8_url,
+ 'language': media.get('LANGUAGE'),
+ 'ext': ext,
+ 'protocol': entry_protocol,
+ 'preference': preference,
+ }
+ if media_type == 'AUDIO':
+ f['vcodec'] = 'none'
+ formats.append(f)
+
+ def build_stream_name():
+ # Despite specification does not mention NAME attribute for
+ # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
+ # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
+ # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
+ stream_name = last_stream_inf.get('NAME')
+ if stream_name:
+ return stream_name
+ # If there is no NAME in EXT-X-STREAM-INF it will be obtained
+ # from corresponding rendition group
+ stream_group_id = last_stream_inf.get('VIDEO')
+ if not stream_group_id:
+ return
+ stream_group = groups.get(stream_group_id)
+ if not stream_group:
+ return stream_group_id
+ rendition = stream_group[0]
+ return rendition.get('NAME') or stream_group_id
+
+ # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
+ # chance to detect video only formats when EXT-X-STREAM-INF tags
+ # precede EXT-X-MEDIA tags in HLS manifest such as [3].
+ for line in m3u8_doc.splitlines():
+ if line.startswith('#EXT-X-MEDIA:'):
+ extract_media(line)
+
for line in m3u8_doc.splitlines():
if line.startswith('#EXT-X-STREAM-INF:'):
- last_info = parse_m3u8_attributes(line)
- elif line.startswith('#EXT-X-MEDIA:'):
- media = parse_m3u8_attributes(line)
- media_type = media.get('TYPE')
- if media_type in ('VIDEO', 'AUDIO'):
- group_id = media.get('GROUP-ID')
- media_url = media.get('URI')
- if media_url:
- format_id = []
- for v in (group_id, media.get('NAME')):
- if v:
- format_id.append(v)
- f = {
- 'format_id': '-'.join(format_id),
- 'url': format_url(media_url),
- 'language': media.get('LANGUAGE'),
- 'ext': ext,
- 'protocol': entry_protocol,
- 'preference': preference,
- }
- if media_type == 'AUDIO':
- f['vcodec'] = 'none'
- if group_id and not audio_in_video_stream.get(group_id):
- audio_in_video_stream[group_id] = False
- formats.append(f)
- else:
- # When there is no URI in EXT-X-MEDIA let this tag's
- # data be used by regular URI lines below
- last_media = media
- if media_type == 'AUDIO' and group_id:
- audio_in_video_stream[group_id] = True
+ last_stream_inf = parse_m3u8_attributes(line)
elif line.startswith('#') or not line.strip():
continue
else:
- tbr = int_or_none(last_info.get('AVERAGE-BANDWIDTH') or last_info.get('BANDWIDTH'), scale=1000)
+ tbr = float_or_none(
+ last_stream_inf.get('AVERAGE-BANDWIDTH')
+ or last_stream_inf.get('BANDWIDTH'), scale=1000)
format_id = []
if m3u8_id:
format_id.append(m3u8_id)
- # Despite specification does not mention NAME attribute for
- # EXT-X-STREAM-INF it still sometimes may be present
- stream_name = last_info.get('NAME') or last_media.get('NAME')
+ stream_name = build_stream_name()
# Bandwidth of live streams may differ over time thus making
# format_id unpredictable. So it's better to keep provided
# format_id intact.
@@ -1400,14 +1954,14 @@ class InfoExtractor(object):
f = {
'format_id': '-'.join(format_id),
'url': manifest_url,
- 'manifest_url': manifest_url,
+ 'manifest_url': m3u8_url,
'tbr': tbr,
'ext': ext,
- 'fps': float_or_none(last_info.get('FRAME-RATE')),
+ 'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
'protocol': entry_protocol,
'preference': preference,
}
- resolution = last_info.get('RESOLUTION')
+ resolution = last_stream_inf.get('RESOLUTION')
if resolution:
mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
if mobj:
@@ -1423,13 +1977,39 @@ class InfoExtractor(object):
'vbr': vbr,
'abr': abr,
})
- f.update(parse_codecs(last_info.get('CODECS')))
- if audio_in_video_stream.get(last_info.get('AUDIO')) is False and f['vcodec'] != 'none':
- # TODO: update acodec for audio only formats with the same GROUP-ID
- f['acodec'] = 'none'
+ codecs = parse_codecs(last_stream_inf.get('CODECS'))
+ f.update(codecs)
+ audio_group_id = last_stream_inf.get('AUDIO')
+ # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
+ # references a rendition group MUST have a CODECS attribute.
+ # However, this is not always respected, for example, [2]
+ # contains EXT-X-STREAM-INF tag which references AUDIO
+ # rendition group but does not have CODECS and despite
+ # referencing an audio group it represents a complete
+ # (with audio and video) format. So, for such cases we will
+ # ignore references to rendition groups and treat them
+ # as complete formats.
+ if audio_group_id and codecs and f.get('vcodec') != 'none':
+ audio_group = groups.get(audio_group_id)
+ if audio_group and audio_group[0].get('URI'):
+ # TODO: update acodec for audio only formats with
+ # the same GROUP-ID
+ f['acodec'] = 'none'
formats.append(f)
- last_info = {}
- last_media = {}
+
+ # for DailyMotion
+ progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
+ if progressive_uri:
+ http_f = f.copy()
+ del http_f['manifest_url']
+ http_f.update({
+ 'format_id': f['format_id'].replace('hls-', 'http-'),
+ 'protocol': 'http',
+ 'url': progressive_uri,
+ })
+ formats.append(http_f)
+
+ last_stream_inf = {}
return formats
@staticmethod
@@ -1576,9 +2156,7 @@ class InfoExtractor(object):
'height': height,
})
formats.extend(m3u8_formats)
- continue
-
- if src_ext == 'f4m':
+ elif src_ext == 'f4m':
f4m_url = src_url
if not f4m_params:
f4m_params = {
@@ -1588,9 +2166,13 @@ class InfoExtractor(object):
f4m_url += '&' if '?' in f4m_url else '?'
f4m_url += compat_urllib_parse_urlencode(f4m_params)
formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
- continue
-
- if src_url.startswith('http') and self._is_valid_url(src, video_id):
+ elif src_ext == 'mpd':
+ formats.extend(self._extract_mpd_formats(
+ src_url, video_id, mpd_id='dash', fatal=False))
+ elif re.search(r'\.ism/[Mm]anifest', src_url):
+ formats.extend(self._extract_ism_formats(
+ src_url, video_id, ism_id='mss', fatal=False))
+ elif src_url.startswith('http') and self._is_valid_url(src, video_id):
http_count += 1
formats.append({
'url': src_url,
@@ -1601,7 +2183,6 @@ class InfoExtractor(object):
'width': width,
'height': height,
})
- continue
return formats
@@ -1621,22 +2202,24 @@ class InfoExtractor(object):
})
return subtitles
- def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
+ def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
xspf = self._download_xml(
- playlist_url, playlist_id, 'Downloading xpsf playlist',
+ xspf_url, playlist_id, 'Downloading xpsf playlist',
'Unable to download xspf manifest', fatal=fatal)
if xspf is False:
return []
- return self._parse_xspf(xspf, playlist_id)
+ return self._parse_xspf(
+ xspf, playlist_id, xspf_url=xspf_url,
+ xspf_base_url=base_url(xspf_url))
- def _parse_xspf(self, playlist, playlist_id):
+ def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
NS_MAP = {
'xspf': 'http://xspf.org/ns/0/',
's1': 'http://static.streamone.nl/player/ns/0',
}
entries = []
- for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
+ for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
title = xpath_text(
track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
description = xpath_text(
@@ -1646,12 +2229,18 @@ class InfoExtractor(object):
duration = float_or_none(
xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
- formats = [{
- 'url': location.text,
- 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
- 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
- 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
- } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
+ formats = []
+ for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
+ format_url = urljoin(xspf_base_url, location.text)
+ if not format_url:
+ continue
+ formats.append({
+ 'url': format_url,
+ 'manifest_url': xspf_url,
+ 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
+ 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
+ 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
+ })
self._sort_formats(formats)
entries.append({
@@ -1664,22 +2253,46 @@ class InfoExtractor(object):
})
return entries
- def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
- res = self._download_webpage_handle(
- mpd_url, video_id,
- note=note or 'Downloading MPD manifest',
- errnote=errnote or 'Failed to download MPD manifest',
- fatal=fatal)
- if res is False:
- return []
- mpd, urlh = res
- mpd_base_url = base_url(urlh.geturl())
+ def _extract_mpd_formats(self, *args, **kwargs):
+ fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
+ if subs:
+ self._report_ignoring_subs('DASH')
+ return fmts
+
+ def _extract_mpd_formats_and_subtitles(
+ self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
+ fatal=True, data=None, headers=None, query=None):
- return self._parse_mpd_formats(
- compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url,
- formats_dict=formats_dict, mpd_url=mpd_url)
+ # TODO: or not? param not yet implemented
+ if self.get_param('ignore_no_formats_error'):
+ fatal = False
- def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None):
+ res = self._download_xml_handle(
+ mpd_url, video_id,
+ note='Downloading MPD manifest' if note is None else note,
+ errnote='Failed to download MPD manifest' if errnote is None else errnote,
+ fatal=fatal, data=data, headers=headers or {}, query=query or {})
+ if res is False:
+ return [], {}
+ mpd_doc, urlh = res
+ if mpd_doc is None:
+ return [], {}
+
+ # We could have been redirected to a new url when we retrieved our mpd file.
+ mpd_url = urlh.geturl()
+ mpd_base_url = base_url(mpd_url)
+
+ return self._parse_mpd_formats_and_subtitles(
+ mpd_doc, mpd_id, mpd_base_url, mpd_url)
+
+ def _parse_mpd_formats(self, *args, **kwargs):
+ fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
+ if subs:
+ self._report_ignoring_subs('DASH')
+ return fmts
+
+ def _parse_mpd_formats_and_subtitles(
+ self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
"""
Parse formats from MPD manifest.
References:
@@ -1687,8 +2300,10 @@ class InfoExtractor(object):
http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
"""
- if mpd_doc.get('type') == 'dynamic':
- return []
+ # TODO: param not yet implemented: default like previous yt-dl logic
+ if not self.get_param('dynamic_mpd', False):
+ if mpd_doc.get('type') == 'dynamic':
+ return [], {}
namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
@@ -1698,8 +2313,24 @@ class InfoExtractor(object):
def is_drm_protected(element):
return element.find(_add_ns('ContentProtection')) is not None
+ from ..utils import YoutubeDLHandler
+ fix_path = YoutubeDLHandler._fix_path
+
+ def resolve_base_url(element, parent_base_url=None):
+ # TODO: use native XML traversal when ready
+ b_url = traverse_obj(element, (
+ T(lambda e: e.find(_add_ns('BaseURL')).text)))
+ if parent_base_url and b_url:
+ if not parent_base_url[-1] in ('/', ':'):
+ parent_base_url += '/'
+ b_url = compat_urlparse.urljoin(parent_base_url, b_url)
+ if b_url:
+ b_url = fix_path(b_url)
+ return b_url or parent_base_url
+
def extract_multisegment_info(element, ms_parent_info):
ms_info = ms_parent_info.copy()
+ base_url = ms_info['base_url'] = resolve_base_url(element, ms_info.get('base_url'))
# As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
# common attributes and elements. We will only extract relevant
@@ -1728,20 +2359,32 @@ class InfoExtractor(object):
ms_info['timescale'] = int(timescale)
segment_duration = source.get('duration')
if segment_duration:
- ms_info['segment_duration'] = int(segment_duration)
+ ms_info['segment_duration'] = float(segment_duration)
def extract_Initialization(source):
initialization = source.find(_add_ns('Initialization'))
if initialization is not None:
- ms_info['initialization_url'] = initialization.attrib['sourceURL']
+ ms_info['initialization_url'] = initialization.get('sourceURL') or base_url
+ initialization_url_range = initialization.get('range')
+ if initialization_url_range:
+ ms_info['initialization_url_range'] = initialization_url_range
segment_list = element.find(_add_ns('SegmentList'))
if segment_list is not None:
extract_common(segment_list)
extract_Initialization(segment_list)
segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
- if segment_urls_e:
- ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
+ segment_urls = traverse_obj(segment_urls_e, (
+ Ellipsis, T(lambda e: e.attrib), 'media'))
+ if segment_urls:
+ ms_info['segment_urls'] = segment_urls
+ segment_urls_range = traverse_obj(segment_urls_e, (
+ Ellipsis, T(lambda e: e.attrib), 'mediaRange',
+ T(lambda r: re.findall(r'^\d+-\d+$', r)), 0))
+ if segment_urls_range:
+ ms_info['segment_urls_range'] = segment_urls_range
+ if not segment_urls:
+ ms_info['segment_urls'] = [base_url for _ in segment_urls_range]
else:
segment_template = element.find(_add_ns('SegmentTemplate'))
if segment_template is not None:
@@ -1757,17 +2400,20 @@ class InfoExtractor(object):
return ms_info
mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
- formats = []
+ formats, subtitles = [], {}
+ stream_numbers = collections.defaultdict(int)
+ mpd_base_url = resolve_base_url(mpd_doc, mpd_base_url or mpd_url)
for period in mpd_doc.findall(_add_ns('Period')):
period_duration = parse_duration(period.get('duration')) or mpd_duration
period_ms_info = extract_multisegment_info(period, {
'start_number': 1,
'timescale': 1,
+ 'base_url': mpd_base_url,
})
for adaptation_set in period.findall(_add_ns('AdaptationSet')):
if is_drm_protected(adaptation_set):
continue
- adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
+ adaptation_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
for representation in adaptation_set.findall(_add_ns('Representation')):
if is_drm_protected(representation):
continue
@@ -1775,175 +2421,278 @@ class InfoExtractor(object):
representation_attrib.update(representation.attrib)
# According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
mime_type = representation_attrib['mimeType']
- content_type = mime_type.split('/')[0]
- if content_type == 'text':
- # TODO implement WebVTT downloading
- pass
- elif content_type in ('video', 'audio'):
- base_url = ''
- for element in (representation, adaptation_set, period, mpd_doc):
- base_url_e = element.find(_add_ns('BaseURL'))
- if base_url_e is not None:
- base_url = base_url_e.text + base_url
- if re.match(r'^https?://', base_url):
- break
- if mpd_base_url and not re.match(r'^https?://', base_url):
- if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
- mpd_base_url += '/'
- base_url = mpd_base_url + base_url
- representation_id = representation_attrib.get('id')
- lang = representation_attrib.get('lang')
- url_el = representation.find(_add_ns('BaseURL'))
- filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
- bandwidth = int_or_none(representation_attrib.get('bandwidth'))
+ content_type = representation_attrib.get('contentType') or mime_type.split('/')[0]
+ codec_str = representation_attrib.get('codecs', '')
+ # Some kind of binary subtitle found in some youtube livestreams
+ if mime_type == 'application/x-rawcc':
+ codecs = {'scodec': codec_str}
+ else:
+ codecs = parse_codecs(codec_str)
+ if content_type not in ('video', 'audio', 'text'):
+ if mime_type == 'image/jpeg':
+ content_type = mime_type
+ elif codecs.get('vcodec', 'none') != 'none':
+ content_type = 'video'
+ elif codecs.get('acodec', 'none') != 'none':
+ content_type = 'audio'
+ elif codecs.get('scodec', 'none') != 'none':
+ content_type = 'text'
+ elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'):
+ content_type = 'text'
+ else:
+ self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
+ continue
+
+ representation_id = representation_attrib.get('id')
+ lang = representation_attrib.get('lang')
+ url_el = representation.find(_add_ns('BaseURL'))
+ filesize = int_or_none(url_el.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
+ bandwidth = int_or_none(representation_attrib.get('bandwidth'))
+ format_id = join_nonempty(representation_id or content_type, mpd_id)
+ if content_type in ('video', 'audio'):
f = {
'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
- 'url': base_url,
'manifest_url': mpd_url,
'ext': mimetype2ext(mime_type),
'width': int_or_none(representation_attrib.get('width')),
'height': int_or_none(representation_attrib.get('height')),
- 'tbr': int_or_none(bandwidth, 1000),
+ 'tbr': float_or_none(bandwidth, 1000),
'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
'fps': int_or_none(representation_attrib.get('frameRate')),
'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
'format_note': 'DASH %s' % content_type,
'filesize': filesize,
+ 'container': mimetype2ext(mime_type) + '_dash',
+ }
+ f.update(codecs)
+ elif content_type == 'text':
+ f = {
+ 'ext': mimetype2ext(mime_type),
+ 'manifest_url': mpd_url,
+ 'filesize': filesize,
+ }
+ elif content_type == 'image/jpeg':
+ # See test case in VikiIE
+ # https://www.viki.com/videos/1175236v-choosing-spouse-by-lottery-episode-1
+ f = {
+ 'format_id': format_id,
+ 'ext': 'mhtml',
+ 'manifest_url': mpd_url,
+ 'format_note': 'DASH storyboards (jpeg)',
+ 'acodec': 'none',
+ 'vcodec': 'none',
+ }
+ if is_drm_protected(adaptation_set) or is_drm_protected(representation):
+ f['has_drm'] = True
+ representation_ms_info = extract_multisegment_info(representation, adaptation_set_ms_info)
+
+ def prepare_template(template_name, identifiers):
+ tmpl = representation_ms_info[template_name]
+ # First of, % characters outside $...$ templates
+ # must be escaped by doubling for proper processing
+ # by % operator string formatting used further (see
+ # https://github.com/ytdl-org/youtube-dl/issues/16867).
+ t = ''
+ in_template = False
+ for c in tmpl:
+ t += c
+ if c == '$':
+ in_template = not in_template
+ elif c == '%' and not in_template:
+ t += c
+ # Next, $...$ templates are translated to their
+ # %(...) counterparts to be used with % operator
+ t = t.replace('$RepresentationID$', representation_id)
+ t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
+ t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
+ t.replace('$$', '$')
+ return t
+
+ # @initialization is a regular template like @media one
+ # so it should be handled just the same way (see
+ # https://github.com/ytdl-org/youtube-dl/issues/11605)
+ if 'initialization' in representation_ms_info:
+ initialization_template = prepare_template(
+ 'initialization',
+ # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
+ # $Time$ shall not be included for @initialization thus
+ # only $Bandwidth$ remains
+ ('Bandwidth', ))
+ representation_ms_info['initialization_url'] = initialization_template % {
+ 'Bandwidth': bandwidth,
}
- f.update(parse_codecs(representation_attrib.get('codecs')))
- representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
-
- def prepare_template(template_name, identifiers):
- t = representation_ms_info[template_name]
- t = t.replace('$RepresentationID$', representation_id)
- t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
- t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
- t.replace('$$', '$')
- return t
-
- # @initialization is a regular template like @media one
- # so it should be handled just the same way (see
- # https://github.com/rg3/youtube-dl/issues/11605)
- if 'initialization' in representation_ms_info:
- initialization_template = prepare_template(
- 'initialization',
- # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
- # $Time$ shall not be included for @initialization thus
- # only $Bandwidth$ remains
- ('Bandwidth', ))
- representation_ms_info['initialization_url'] = initialization_template % {
- 'Bandwidth': bandwidth,
- }
-
- if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
-
- media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
-
- # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
- # can't be used at the same time
- if '%(Number' in media_template and 's' not in representation_ms_info:
- segment_duration = None
- if 'total_number' not in representation_ms_info and 'segment_duration':
- segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
- representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
- representation_ms_info['fragments'] = [{
- 'url': media_template % {
- 'Number': segment_number,
- 'Bandwidth': bandwidth,
- },
- 'duration': segment_duration,
- } for segment_number in range(
- representation_ms_info['start_number'],
- representation_ms_info['total_number'] + representation_ms_info['start_number'])]
- else:
- # $Number*$ or $Time$ in media template with S list available
- # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
- # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
- representation_ms_info['fragments'] = []
- segment_time = 0
- segment_d = None
- segment_number = representation_ms_info['start_number']
-
- def add_segment_url():
- segment_url = media_template % {
- 'Time': segment_time,
- 'Bandwidth': bandwidth,
- 'Number': segment_number,
- }
- representation_ms_info['fragments'].append({
- 'url': segment_url,
- 'duration': float_or_none(segment_d, representation_ms_info['timescale']),
- })
- for num, s in enumerate(representation_ms_info['s']):
- segment_time = s.get('t') or segment_time
- segment_d = s['d']
+ def location_key(location):
+ return 'url' if re.match(r'^https?://', location) else 'path'
+
+ def calc_segment_duration():
+ return float_or_none(
+ representation_ms_info['segment_duration'],
+ representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
+
+ if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
+
+ media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
+ media_location_key = location_key(media_template)
+
+ # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
+ # can't be used at the same time
+ if '%(Number' in media_template and 's' not in representation_ms_info:
+ segment_duration = None
+ if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
+ segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
+ representation_ms_info['total_number'] = int(math.ceil(
+ float_or_none(period_duration, segment_duration, default=0)))
+ representation_ms_info['fragments'] = [{
+ media_location_key: media_template % {
+ 'Number': segment_number,
+ 'Bandwidth': bandwidth,
+ },
+ 'duration': segment_duration,
+ } for segment_number in range(
+ representation_ms_info['start_number'],
+ representation_ms_info['total_number'] + representation_ms_info['start_number'])]
+ else:
+ # $Number*$ or $Time$ in media template with S list available
+ # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
+ # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
+ representation_ms_info['fragments'] = []
+ segment_time = 0
+ segment_d = None
+ segment_number = representation_ms_info['start_number']
+
+ def add_segment_url():
+ segment_url = media_template % {
+ 'Time': segment_time,
+ 'Bandwidth': bandwidth,
+ 'Number': segment_number,
+ }
+ representation_ms_info['fragments'].append({
+ media_location_key: segment_url,
+ 'duration': float_or_none(segment_d, representation_ms_info['timescale']),
+ })
+
+ for num, s in enumerate(representation_ms_info['s']):
+ segment_time = s.get('t') or segment_time
+ segment_d = s['d']
+ add_segment_url()
+ segment_number += 1
+ for r in range(s.get('r', 0)):
+ segment_time += segment_d
add_segment_url()
segment_number += 1
- for r in range(s.get('r', 0)):
- segment_time += segment_d
- add_segment_url()
- segment_number += 1
- segment_time += segment_d
- elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
+ segment_time += segment_d
+ elif 'segment_urls' in representation_ms_info:
+ fragments = []
+ if 's' in representation_ms_info:
# No media template
# Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
# or any YouTube dashsegments video
- fragments = []
segment_index = 0
timescale = representation_ms_info['timescale']
for s in representation_ms_info['s']:
duration = float_or_none(s['d'], timescale)
for r in range(s.get('r', 0) + 1):
+ segment_uri = representation_ms_info['segment_urls'][segment_index]
fragments.append({
- 'url': representation_ms_info['segment_urls'][segment_index],
+ location_key(segment_uri): segment_uri,
'duration': duration,
})
segment_index += 1
- representation_ms_info['fragments'] = fragments
- # NB: MPD manifest may contain direct URLs to unfragmented media.
- # No fragments key is present in this case.
- if 'fragments' in representation_ms_info:
- f.update({
- 'fragments': [],
- 'protocol': 'http_dash_segments',
- })
- if 'initialization_url' in representation_ms_info:
- initialization_url = representation_ms_info['initialization_url']
- if not f.get('url'):
- f['url'] = initialization_url
- f['fragments'].append({'url': initialization_url})
- f['fragments'].extend(representation_ms_info['fragments'])
- for fragment in f['fragments']:
- fragment['url'] = urljoin(base_url, fragment['url'])
- try:
- existing_format = next(
- fo for fo in formats
- if fo['format_id'] == representation_id)
- except StopIteration:
- full_info = formats_dict.get(representation_id, {}).copy()
- full_info.update(f)
- formats.append(full_info)
+ elif 'segment_urls_range' in representation_ms_info:
+ # Segment URLs with mediaRange
+ # Example: https://kinescope.io/200615537/master.mpd
+ # https://github.com/ytdl-org/youtube-dl/issues/30235
+ # or any mpd generated with Bento4 `mp4dash --no-split --use-segment-list`
+ segment_duration = calc_segment_duration()
+ for segment_url, segment_url_range in zip(
+ representation_ms_info['segment_urls'], representation_ms_info['segment_urls_range']):
+ fragments.append({
+ location_key(segment_url): segment_url,
+ 'range': segment_url_range,
+ 'duration': segment_duration,
+ })
else:
- existing_format.update(f)
+ # Segment URLs with no SegmentTimeline
+ # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
+ # https://github.com/ytdl-org/youtube-dl/pull/14844
+ segment_duration = calc_segment_duration()
+ for segment_url in representation_ms_info['segment_urls']:
+ fragments.append({
+ location_key(segment_url): segment_url,
+ 'duration': segment_duration,
+ })
+ representation_ms_info['fragments'] = fragments
+
+ # If there is a fragments key available then we correctly recognized fragmented media.
+ # Otherwise we will assume unfragmented media with direct access. Technically, such
+ # assumption is not necessarily correct since we may simply have no support for
+ # some forms of fragmented media renditions yet, but for now we'll use this fallback.
+ if 'fragments' in representation_ms_info:
+ base_url = representation_ms_info['base_url']
+ f.update({
+ # NB: mpd_url may be empty when MPD manifest is parsed from a string
+ 'url': mpd_url or base_url,
+ 'fragment_base_url': base_url,
+ 'fragments': [],
+ 'protocol': 'http_dash_segments',
+ })
+ if 'initialization_url' in representation_ms_info and 'initialization_url_range' in representation_ms_info:
+ # Initialization URL with range (accompanied by Segment URLs with mediaRange above)
+ # https://github.com/ytdl-org/youtube-dl/issues/30235
+ initialization_url = representation_ms_info['initialization_url']
+ f['fragments'].append({
+ location_key(initialization_url): initialization_url,
+ 'range': representation_ms_info['initialization_url_range'],
+ })
+ elif 'initialization_url' in representation_ms_info:
+ initialization_url = representation_ms_info['initialization_url']
+ if not f.get('url'):
+ f['url'] = initialization_url
+ f['fragments'].append({location_key(initialization_url): initialization_url})
+ elif 'initialization_url_range' in representation_ms_info:
+ # no Initialization URL but range (accompanied by no Segment URLs but mediaRange above)
+ # https://github.com/ytdl-org/youtube-dl/issues/27575
+ f['fragments'].append({
+ location_key(base_url): base_url,
+ 'range': representation_ms_info['initialization_url_range'],
+ })
+ f['fragments'].extend(representation_ms_info['fragments'])
+ if not period_duration:
+ period_duration = sum(traverse_obj(representation_ms_info, (
+ 'fragments', Ellipsis, 'duration', T(float_or_none))))
else:
- self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
- return formats
+ # Assuming direct URL to unfragmented media.
+ f['url'] = representation_ms_info['base_url']
+ if content_type in ('video', 'audio', 'image/jpeg'):
+ f['manifest_stream_number'] = stream_numbers[f['url']]
+ stream_numbers[f['url']] += 1
+ formats.append(f)
+ elif content_type == 'text':
+ subtitles.setdefault(lang or 'und', []).append(f)
+ return formats, subtitles
- def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True):
- res = self._download_webpage_handle(
+ def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
+ res = self._download_xml_handle(
ism_url, video_id,
note=note or 'Downloading ISM manifest',
errnote=errnote or 'Failed to download ISM manifest',
- fatal=fatal)
+ fatal=fatal, data=data, headers=headers, query=query)
if res is False:
return []
- ism, urlh = res
+ ism_doc, urlh = res
+ if ism_doc is None:
+ return []
- return self._parse_ism_formats(
- compat_etree_fromstring(ism.encode('utf-8')), urlh.geturl(), ism_id)
+ return self._parse_ism_formats(ism_doc, urlh.geturl(), ism_id)
def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
+ """
+ Parse formats from ISM manifest.
+ References:
+ 1. [MS-SSTR]: Smooth Streaming Protocol,
+ https://msdn.microsoft.com/en-us/library/ff469518.aspx
+ """
if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None:
return []
@@ -1959,14 +2708,17 @@ class InfoExtractor(object):
stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
stream_name = stream.get('Name')
for track in stream.findall('QualityLevel'):
- fourcc = track.get('FourCC')
+ fourcc = track.get('FourCC', 'AACL' if track.get('AudioTag') == '255' else None)
# TODO: add support for WVC1 and WMAP
if fourcc not in ('H264', 'AVC1', 'AACL'):
self.report_warning('%s is not a supported codec' % fourcc)
continue
tbr = int(track.attrib['Bitrate']) // 1000
- width = int_or_none(track.get('MaxWidth'))
- height = int_or_none(track.get('MaxHeight'))
+ # [1] does not mention Width and Height attributes. However,
+ # they're often present while MaxWidth and MaxHeight are
+ # missing, so should be used as fallbacks
+ width = int_or_none(track.get('MaxWidth') or track.get('Width'))
+ height = int_or_none(track.get('MaxHeight') or track.get('Height'))
sampling_rate = int_or_none(track.get('SamplingRate'))
track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
@@ -2030,8 +2782,8 @@ class InfoExtractor(object):
return formats
def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None):
- def absolute_url(video_url):
- return compat_urlparse.urljoin(base_url, video_url)
+ def absolute_url(item_url):
+ return urljoin(base_url, item_url)
def parse_content_type(content_type):
if not content_type:
@@ -2044,57 +2796,96 @@ class InfoExtractor(object):
return f
return {}
- def _media_formats(src, cur_media_type):
+ def _media_formats(src, cur_media_type, type_info=None):
+ type_info = type_info or {}
full_url = absolute_url(src)
- ext = determine_ext(full_url)
+ ext = type_info.get('ext') or determine_ext(full_url)
if ext == 'm3u8':
is_plain_url = False
formats = self._extract_m3u8_formats(
full_url, video_id, ext='mp4',
entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
- preference=preference)
+ preference=preference, fatal=False)
elif ext == 'mpd':
is_plain_url = False
formats = self._extract_mpd_formats(
- full_url, video_id, mpd_id=mpd_id)
+ full_url, video_id, mpd_id=mpd_id, fatal=False)
else:
is_plain_url = True
formats = [{
'url': full_url,
'vcodec': 'none' if cur_media_type == 'audio' else None,
+ 'ext': ext,
}]
return is_plain_url, formats
entries = []
- media_tags = [(media_tag, media_type, '')
- for media_tag, media_type
- in re.findall(r'(?s)(<(video|audio)[^>]*/>)', webpage)]
+ # amp-video and amp-audio are very similar to their HTML5 counterparts
+ # so we wll include them right here (see
+ # https://www.ampproject.org/docs/reference/components/amp-video)
+ # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
+ _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video(?:-js)?|audio)'
+ media_tags = [(media_tag, media_tag_name, media_type, '')
+ for media_tag, media_tag_name, media_type
+ in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
media_tags.extend(re.findall(
# We only allow video|audio followed by a whitespace or '>'.
# Allowing more characters may end up in significant slow down (see
- # https://github.com/rg3/youtube-dl/issues/11979, example URL:
+ # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
# http://www.porntrex.com/maps/videositemap.xml).
- r'(?s)(<(?P<tag>video|audio)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))
- for media_tag, media_type, media_content in media_tags:
+ r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
+ for media_tag, _, media_type, media_content in media_tags:
media_info = {
'formats': [],
'subtitles': {},
}
media_attributes = extract_attributes(media_tag)
- src = media_attributes.get('src')
+ src = strip_or_none(media_attributes.get('src'))
if src:
- _, formats = _media_formats(src, media_type)
+ f = parse_content_type(media_attributes.get('type'))
+ _, formats = _media_formats(src, media_type, f)
media_info['formats'].extend(formats)
- media_info['thumbnail'] = media_attributes.get('poster')
+ media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
if media_content:
for source_tag in re.findall(r'<source[^>]+>', media_content):
- source_attributes = extract_attributes(source_tag)
- src = source_attributes.get('src')
+ s_attr = extract_attributes(source_tag)
+ # data-video-src and data-src are non standard but seen
+ # several times in the wild
+ src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src')))
if not src:
continue
- is_plain_url, formats = _media_formats(src, media_type)
+ f = parse_content_type(s_attr.get('type'))
+ is_plain_url, formats = _media_formats(src, media_type, f)
if is_plain_url:
- f = parse_content_type(source_attributes.get('type'))
+ # width, height, res, label and title attributes are
+ # all not standard but seen several times in the wild
+ labels = [
+ s_attr.get(lbl)
+ for lbl in ('label', 'title')
+ if str_or_none(s_attr.get(lbl))
+ ]
+ width = int_or_none(s_attr.get('width'))
+ height = (int_or_none(s_attr.get('height'))
+ or int_or_none(s_attr.get('res')))
+ if not width or not height:
+ for lbl in labels:
+ resolution = parse_resolution(lbl)
+ if not resolution:
+ continue
+ width = width or resolution.get('width')
+ height = height or resolution.get('height')
+ for lbl in labels:
+ tbr = parse_bitrate(lbl)
+ if tbr:
+ break
+ else:
+ tbr = None
+ f.update({
+ 'width': width,
+ 'height': height,
+ 'tbr': tbr,
+ 'format_id': s_attr.get('label') or s_attr.get('title'),
+ })
f.update(formats[0])
media_info['formats'].append(f)
else:
@@ -2103,21 +2894,31 @@ class InfoExtractor(object):
track_attributes = extract_attributes(track_tag)
kind = track_attributes.get('kind')
if not kind or kind in ('subtitles', 'captions'):
- src = track_attributes.get('src')
+ src = strip_or_none(track_attributes.get('src'))
if not src:
continue
lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
media_info['subtitles'].setdefault(lang, []).append({
'url': absolute_url(src),
})
+ for f in media_info['formats']:
+ f.setdefault('http_headers', {})['Referer'] = base_url
if media_info['formats'] or media_info['subtitles']:
entries.append(media_info)
return entries
def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
+ signed = 'hdnea=' in manifest_url
+ if not signed:
+ # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
+ manifest_url = re.sub(
+ r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
+ '', manifest_url).strip('?')
+
formats = []
+
hdcore_sign = 'hdcore=3.7.0'
- f4m_url = re.sub(r'(https?://[^/+])/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
+ f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
hds_host = hosts.get('hds')
if hds_host:
f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
@@ -2128,36 +2929,71 @@ class InfoExtractor(object):
for entry in f4m_formats:
entry.update({'extra_param_to_segment_url': hdcore_sign})
formats.extend(f4m_formats)
+
m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
hls_host = hosts.get('hls')
if hls_host:
m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
- formats.extend(self._extract_m3u8_formats(
+ m3u8_formats = self._extract_m3u8_formats(
m3u8_url, video_id, 'mp4', 'm3u8_native',
- m3u8_id='hls', fatal=False))
+ m3u8_id='hls', fatal=False)
+ formats.extend(m3u8_formats)
+
+ http_host = hosts.get('http')
+ if http_host and m3u8_formats and not signed:
+ REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
+ qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
+ qualities_length = len(qualities)
+ if len(m3u8_formats) in (qualities_length, qualities_length + 1):
+ i = 0
+ for f in m3u8_formats:
+ if f['vcodec'] != 'none':
+ for protocol in ('http', 'https'):
+ http_f = f.copy()
+ del http_f['manifest_url']
+ http_url = re.sub(
+ REPL_REGEX, protocol + r'://%s/\g<1>%s\3' % (http_host, qualities[i]), f['url'])
+ http_f.update({
+ 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
+ 'url': http_url,
+ 'protocol': protocol,
+ })
+ formats.append(http_f)
+ i += 1
+
return formats
def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
+ query = compat_urlparse.urlparse(url).query
url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
- url_base = self._search_regex(r'(?:https?|rtmp|rtsp)(://[^?]+)', url, 'format url')
- http_base_url = 'http' + url_base
+ mobj = re.search(
+ r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
+ url_base = mobj.group('url')
+ http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
formats = []
+
+ def manifest_url(manifest):
+ m_url = '%s/%s' % (http_base_url, manifest)
+ if query:
+ m_url += '?%s' % query
+ return m_url
+
if 'm3u8' not in skip_protocols:
formats.extend(self._extract_m3u8_formats(
- http_base_url + '/playlist.m3u8', video_id, 'mp4',
+ manifest_url('playlist.m3u8'), video_id, 'mp4',
m3u8_entry_protocol, m3u8_id='hls', fatal=False))
if 'f4m' not in skip_protocols:
formats.extend(self._extract_f4m_formats(
- http_base_url + '/manifest.f4m',
+ manifest_url('manifest.f4m'),
video_id, f4m_id='hds', fatal=False))
if 'dash' not in skip_protocols:
formats.extend(self._extract_mpd_formats(
- http_base_url + '/manifest.mpd',
+ manifest_url('manifest.mpd'),
video_id, mpd_id='dash', fatal=False))
if re.search(r'(?:/smil:|\.smil)', url_base):
if 'smil' not in skip_protocols:
rtmp_formats = self._extract_smil_formats(
- http_base_url + '/jwplayer.smil',
+ manifest_url('jwplayer.smil'),
video_id, fatal=False)
for rtmp_format in rtmp_formats:
rtsp_format = rtmp_format.copy()
@@ -2174,38 +3010,39 @@ class InfoExtractor(object):
for protocol in ('rtmp', 'rtsp'):
if protocol not in skip_protocols:
formats.append({
- 'url': protocol + url_base,
+ 'url': '%s:%s' % (protocol, url_base),
'format_id': protocol,
'protocol': protocol,
})
return formats
def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
- mobj = re.search(
- r'jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)\.setup\s*\((?P<options>[^)]+)\)',
- webpage)
- if mobj:
- try:
- jwplayer_data = self._parse_json(mobj.group('options'),
- video_id=video_id,
- transform_source=transform_source)
- except ExtractorError:
- pass
- else:
- if isinstance(jwplayer_data, dict):
- return jwplayer_data
+ return self._search_json(
+ r'''(?<!-)\bjwplayer\s*\(\s*(?P<q>'|")(?!(?P=q)).+(?P=q)\s*\)(?:(?!</script>).)*?\.\s*(?:setup\s*\(|(?P<load>load)\s*\(\s*\[)''',
+ webpage, 'JWPlayer data', video_id,
+ # must be a {...} or sequence, ending
+ contains_pattern=r'\{[\s\S]*}(?(load)(?:\s*,\s*\{[\s\S]*})*)', end_pattern=r'(?(load)\]|\))',
+ transform_source=transform_source, default=None)
def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
- jwplayer_data = self._find_jwplayer_data(
- webpage, video_id, transform_source=js_to_json)
- return self._parse_jwplayer_data(
- jwplayer_data, video_id, *args, **kwargs)
+ # allow passing `transform_source` through to _find_jwplayer_data()
+ transform_source = kwargs.pop('transform_source', None)
+ kwfind = compat_kwargs({'transform_source': transform_source}) if transform_source else {}
+
+ jwplayer_data = self._find_jwplayer_data(webpage, video_id, **kwfind)
+
+ return self._parse_jwplayer_data(jwplayer_data, video_id, *args, **kwargs)
def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
+ flat_pl = try_get(jwplayer_data, lambda x: x.get('playlist') or True)
+ if flat_pl is None:
+ # not even a dict
+ return []
+
# JWPlayer backward compatibility: flattened playlists
# https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
- if 'playlist' not in jwplayer_data:
+ if flat_pl is True:
jwplayer_data = {'playlist': [jwplayer_data]}
entries = []
@@ -2226,31 +3063,45 @@ class InfoExtractor(object):
formats = self._parse_jwplayer_formats(
video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
- self._sort_formats(formats)
subtitles = {}
- tracks = video_data.get('tracks')
- if tracks and isinstance(tracks, list):
- for track in tracks:
- if track.get('kind') != 'captions':
- continue
- track_url = urljoin(base_url, track.get('file'))
- if not track_url:
- continue
- subtitles.setdefault(track.get('label') or 'en', []).append({
- 'url': self._proto_relative_url(track_url)
- })
+ for track in traverse_obj(video_data, (
+ 'tracks', lambda _, t: t.get('kind').lower() in ('captions', 'subtitles'))):
+ track_url = urljoin(base_url, track.get('file'))
+ if not track_url:
+ continue
+ subtitles.setdefault(track.get('label') or 'en', []).append({
+ 'url': self._proto_relative_url(track_url)
+ })
- entries.append({
+ entry = {
'id': this_video_id,
- 'title': video_data['title'] if require_title else video_data.get('title'),
- 'description': video_data.get('description'),
- 'thumbnail': self._proto_relative_url(video_data.get('image')),
+ 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
+ 'description': clean_html(video_data.get('description')),
+ 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
'timestamp': int_or_none(video_data.get('pubdate')),
'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
'subtitles': subtitles,
- 'formats': formats,
- })
+ 'alt_title': clean_html(video_data.get('subtitle')), # attributes used e.g. by Tele5 ...
+ 'genre': clean_html(video_data.get('genre')),
+ 'channel': clean_html(dict_get(video_data, ('category', 'channel'))),
+ 'season_number': int_or_none(video_data.get('season')),
+ 'episode_number': int_or_none(video_data.get('episode')),
+ 'release_year': int_or_none(video_data.get('releasedate')),
+ 'age_limit': int_or_none(video_data.get('age_restriction')),
+ }
+ # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
+ if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
+ entry.update({
+ '_type': 'url_transparent',
+ 'url': formats[0]['url'],
+ })
+ else:
+ # avoid exception in case of only sttls
+ if formats:
+ self._sort_formats(formats)
+ entry['formats'] = formats
+ entries.append(entry)
if len(entries) == 1:
return entries[0]
else:
@@ -2258,18 +3109,23 @@ class InfoExtractor(object):
def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
+ urls = set()
formats = []
for source in jwplayer_sources_data:
- source_url = self._proto_relative_url(source['file'])
- if base_url:
- source_url = compat_urlparse.urljoin(base_url, source_url)
+ if not isinstance(source, dict):
+ continue
+ source_url = urljoin(
+ base_url, self._proto_relative_url(source.get('file')))
+ if not source_url or source_url in urls:
+ continue
+ urls.add(source_url)
source_type = source.get('type') or ''
ext = mimetype2ext(source_type) or determine_ext(source_url)
- if source_type == 'hls' or ext == 'm3u8':
+ if source_type == 'hls' or ext == 'm3u8' or 'format=m3u8-aapl' in source_url:
formats.extend(self._extract_m3u8_formats(
source_url, video_id, 'mp4', entry_protocol='m3u8_native',
m3u8_id=m3u8_id, fatal=False))
- elif ext == 'mpd':
+ elif source_type == 'dash' or ext == 'mpd' or 'format=mpd-time-csf' in source_url:
formats.extend(self._extract_mpd_formats(
source_url, video_id, mpd_id=mpd_id, fatal=False))
elif ext == 'smil':
@@ -2284,26 +3140,29 @@ class InfoExtractor(object):
'ext': ext,
})
else:
+ format_id = str_or_none(source.get('label'))
height = int_or_none(source.get('height'))
- if height is None:
+ if height is None and format_id:
# Often no height is provided but there is a label in
# format like "1080p", "720p SD", or 1080.
- height = int_or_none(self._search_regex(
- r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
- 'height', default=None))
+ height = parse_resolution(format_id).get('height')
a_format = {
'url': source_url,
'width': int_or_none(source.get('width')),
'height': height,
- 'tbr': int_or_none(source.get('bitrate')),
+ 'tbr': int_or_none(source.get('bitrate'), scale=1000),
+ 'filesize': int_or_none(source.get('filesize')),
'ext': ext,
}
+ if format_id:
+ a_format['format_id'] = format_id
+
if source_url.startswith('rtmp'):
a_format['ext'] = 'flv'
# See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
# of jwplayer.flash.swf
rtmp_url_parts = re.split(
- r'((?:mp4|mp3|flv):)', source_url, 1)
+ r'((?:mp4|mp3|flv):)', source_url, maxsplit=1)
if len(rtmp_url_parts) == 3:
rtmp_url, prefix, play_path = rtmp_url_parts
a_format.update({
@@ -2330,7 +3189,7 @@ class InfoExtractor(object):
if fatal:
raise ExtractorError(msg)
else:
- self._downloader.report_warning(msg)
+ self.report_warning(msg)
return res
def _float(self, v, name, fatal=False, **kwargs):
@@ -2340,20 +3199,49 @@ class InfoExtractor(object):
if fatal:
raise ExtractorError(msg)
else:
- self._downloader.report_warning(msg)
+ self.report_warning(msg)
return res
- def _set_cookie(self, domain, name, value, expire_time=None):
- cookie = compat_cookiejar.Cookie(
- 0, name, value, None, None, domain, None,
- None, '/', True, False, expire_time, '', None, None, None)
- self._downloader.cookiejar.set_cookie(cookie)
+ def _set_cookie(self, domain, name, value, expire_time=None, port=None,
+ path='/', secure=False, discard=False, rest={}, **kwargs):
+ cookie = compat_cookiejar_Cookie(
+ 0, name, value, port, port is not None, domain, True,
+ domain.startswith('.'), path, True, secure, expire_time,
+ discard, None, None, rest)
+ self.cookiejar.set_cookie(cookie)
def _get_cookies(self, url):
- """ Return a compat_cookies.SimpleCookie with the cookies for the url """
+ """ Return a compat_cookies_SimpleCookie with the cookies for the url """
req = sanitized_Request(url)
- self._downloader.cookiejar.add_cookie_header(req)
- return compat_cookies.SimpleCookie(req.get_header('Cookie'))
+ self.cookiejar.add_cookie_header(req)
+ return compat_cookies_SimpleCookie(req.get_header('Cookie'))
+
+ def _apply_first_set_cookie_header(self, url_handle, cookie):
+ """
+ Apply first Set-Cookie header instead of the last. Experimental.
+
+ Some sites (e.g. [1-3]) may serve two cookies under the same name
+ in Set-Cookie header and expect the first (old) one to be set rather
+ than second (new). However, as of RFC6265 the newer one cookie
+ should be set into cookie store what actually happens.
+ We will workaround this issue by resetting the cookie to
+ the first one manually.
+ 1. https://new.vk.com/
+ 2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
+ 3. https://learning.oreilly.com/
+ """
+ for header, cookies in url_handle.headers.items():
+ if header.lower() != 'set-cookie':
+ continue
+ if sys.version_info[0] >= 3:
+ cookies = cookies.encode('iso-8859-1')
+ cookies = cookies.decode('utf-8')
+ cookie_value = re.search(
+ r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
+ if cookie_value:
+ value, domain = cookie_value.groups()
+ self._set_cookie(domain, cookie, value)
+ break
def get_testcases(self, include_onlymatching=False):
t = getattr(self, '_TEST', None)
@@ -2385,8 +3273,8 @@ class InfoExtractor(object):
return not any_restricted
def extract_subtitles(self, *args, **kwargs):
- if (self._downloader.params.get('writesubtitles', False) or
- self._downloader.params.get('listsubtitles')):
+ if (self.get_param('writesubtitles', False)
+ or self.get_param('listsubtitles')):
return self._get_subtitles(*args, **kwargs)
return {}
@@ -2403,16 +3291,24 @@ class InfoExtractor(object):
return ret
@classmethod
- def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
- """ Merge two subtitle dictionaries, language by language. """
- ret = dict(subtitle_dict1)
- for lang in subtitle_dict2:
- ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
- return ret
+ def _merge_subtitles(cls, subtitle_dict1, *subtitle_dicts, **kwargs):
+ """ Merge subtitle dictionaries, language by language. """
+
+ # ..., * , target=None
+ target = kwargs.get('target')
+ if target is None:
+ target = dict(subtitle_dict1)
+ else:
+ subtitle_dicts = (subtitle_dict1,) + subtitle_dicts
+
+ for subtitle_dict in subtitle_dicts:
+ for lang in subtitle_dict:
+ target[lang] = cls._merge_subtitle_items(target.get(lang, []), subtitle_dict[lang])
+ return target
def extract_automatic_captions(self, *args, **kwargs):
- if (self._downloader.params.get('writeautomaticsub', False) or
- self._downloader.params.get('listsubtitles')):
+ if (self.get_param('writeautomaticsub', False)
+ or self.get_param('listsubtitles')):
return self._get_automatic_captions(*args, **kwargs)
return {}
@@ -2420,9 +3316,9 @@ class InfoExtractor(object):
raise NotImplementedError('This method must be implemented by subclasses')
def mark_watched(self, *args, **kwargs):
- if (self._downloader.params.get('mark_watched', False) and
- (self._get_login_info()[0] is not None or
- self._downloader.params.get('cookiefile') is not None)):
+ if (self.get_param('mark_watched', False)
+ and (self._get_login_info()[0] is not None
+ or self.get_param('cookiefile') is not None)):
self._mark_watched(*args, **kwargs)
def _mark_watched(self, *args, **kwargs):
@@ -2430,7 +3326,7 @@ class InfoExtractor(object):
def geo_verification_headers(self):
headers = {}
- geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
+ geo_verification_proxy = self.get_param('geo_verification_proxy')
if geo_verification_proxy:
headers['Ytdl-request-proxy'] = geo_verification_proxy
return headers
@@ -2441,6 +3337,29 @@ class InfoExtractor(object):
def _generic_title(self, url):
return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
+ def _yes_playlist(self, playlist_id, video_id, *args, **kwargs):
+ # smuggled_data=None, *, playlist_label='playlist', video_label='video'
+ smuggled_data = args[0] if len(args) == 1 else kwargs.get('smuggled_data')
+ playlist_label = kwargs.get('playlist_label', 'playlist')
+ video_label = kwargs.get('video_label', 'video')
+
+ if not playlist_id or not video_id:
+ return not video_id
+
+ no_playlist = (smuggled_data or {}).get('force_noplaylist')
+ if no_playlist is not None:
+ return not no_playlist
+
+ video_id = '' if video_id is True else ' ' + video_id
+ noplaylist = self.get_param('noplaylist')
+ self.to_screen(
+ 'Downloading just the {0}{1} because of --no-playlist'.format(video_label, video_id)
+ if noplaylist else
+ 'Downloading {0}{1} - add --no-playlist to download just the {2}{3}'.format(
+ playlist_label, '' if playlist_id is True else ' ' + playlist_id,
+ video_label, video_id))
+ return not noplaylist
+
class SearchInfoExtractor(InfoExtractor):
"""