diff options
| | | |
|---|---|---|
| author | Sergey M․ <dstftw@gmail.com> | 2017-11-04 22:10:55 +0700 |
| committer | Sergey M․ <dstftw@gmail.com> | 2017-11-04 22:10:55 +0700 |
| commit | 48107c198bd76e611e3d4c2486cdc5403829a05a (patch) | |
| tree | d9f68f3661d24e7a77996a5ba1f7b3792276b079 | |
| parent | cd670befc4c823a38a88fffbaa6c493e539dd79d (diff) | |

[f4m] Prefer baseURL for relative URLs (closes #14660)

| | | |
|---|---|---|
| -rw-r--r-- | youtube_dl/downloader/f4m.py | 25 |
| -rw-r--r-- | youtube_dl/extractor/common.py | 14 |

2 files changed, 24 insertions, 15 deletions

```diff
diff --git a/youtube_dl/downloader/f4m.py b/youtube_dl/downloader/f4m.py
index c8fde9a89..fdb80f42a 100644
--- a/youtube_dl/downloader/f4m.py
+++ b/youtube_dl/downloader/f4m.py
@@ -243,8 +243,17 @@ def remove_encrypted_media(media):
                        media))
 
 
-def _add_ns(prop):
-    return '{http://ns.adobe.com/f4m/1.0}%s' % prop
+def _add_ns(prop, ver=1):
+    return '{http://ns.adobe.com/f4m/%d.0}%s' % (ver, prop)
+
+
+def get_base_url(manifest):
+    base_url = xpath_text(
+        manifest, [_add_ns('baseURL'), _add_ns('baseURL', 2)],
+        'base URL', default=None)
+    if base_url:
+        base_url = base_url.strip()
+    return base_url
 
 
 class F4mFD(FragmentFD):
@@ -330,13 +339,13 @@ class F4mFD(FragmentFD):
             rate, media = list(filter(
                 lambda f: int(f[0]) == requested_bitrate, formats))[0]
 
-        base_url = compat_urlparse.urljoin(man_url, media.attrib['url'])
+        # Prefer baseURL for relative URLs as per 11.2 of F4M 3.0 spec.
+        man_base_url = get_base_url(doc) or man_url
+
+        base_url = compat_urlparse.urljoin(man_base_url, media.attrib['url'])
         bootstrap_node = doc.find(_add_ns('bootstrapInfo'))
-        # From Adobe F4M 3.0 spec:
-        # The <baseURL> element SHALL be the base URL for all relative
-        # (HTTP-based) URLs in the manifest. If <baseURL> is not present, said
-        # URLs should be relative to the location of the containing document.
-        boot_info, bootstrap_url = self._parse_bootstrap_node(bootstrap_node, man_url)
+        boot_info, bootstrap_url = self._parse_bootstrap_node(
+            bootstrap_node, man_base_url)
         live = boot_info['live']
         metadata_node = media.find(_add_ns('metadata'))
         if metadata_node is not None:
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index a67ac4411..64fb869aa 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -29,7 +29,10 @@ from ..compat import (
     compat_urlparse,
     compat_xml_parse_error,
 )
-from ..downloader.f4m import remove_encrypted_media
+from ..downloader.f4m import (
+    get_base_url,
+    remove_encrypted_media,
+)
 from ..utils import (
     NO_DEFAULT,
     age_restricted,
@@ -1239,11 +1242,8 @@ class InfoExtractor(object):
         media_nodes = remove_encrypted_media(media_nodes)
         if not media_nodes:
             return formats
-        base_url = xpath_text(
-            manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
-            'base URL', default=None)
-        if base_url:
-            base_url = base_url.strip()
+
+        manifest_base_url = get_base_url(manifest)
 
         bootstrap_info = xpath_element(
             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
@@ -1275,7 +1275,7 @@ class InfoExtractor(object):
                     continue
                 manifest_url = (
                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
-                    else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
+                    else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
                 # If media_url is itself a f4m manifest do the recursive extraction
                 # since bitrates in parent manifest (this one) and media_url manifest
                 # may differ leading to inability to resolve the format by requested
```
