diff options
| -rw-r--r-- | docs/supportedsites.md | 3 | ||||
| -rw-r--r-- | test/test_compat.py | 17 | ||||
| -rw-r--r-- | test/test_utils.py | 19 | ||||
| -rw-r--r-- | youtube_dl/compat.py | 39 | ||||
| -rw-r--r-- | youtube_dl/downloader/f4m.py | 4 | ||||
| -rw-r--r-- | youtube_dl/extractor/ard.py | 4 | ||||
| -rw-r--r-- | youtube_dl/extractor/bbc.py | 8 | ||||
| -rw-r--r-- | youtube_dl/extractor/bilibili.py | 6 | ||||
| -rw-r--r-- | youtube_dl/extractor/brightcove.py | 4 | ||||
| -rw-r--r-- | youtube_dl/extractor/common.py | 15 | ||||
| -rw-r--r-- | youtube_dl/extractor/crunchyroll.py | 4 | ||||
| -rw-r--r-- | youtube_dl/extractor/eitb.py | 95 | ||||
| -rw-r--r-- | youtube_dl/extractor/generic.py | 5 | ||||
| -rw-r--r-- | youtube_dl/extractor/mdr.py | 189 | ||||
| -rw-r--r-- | youtube_dl/extractor/mitele.py | 8 | ||||
| -rw-r--r-- | youtube_dl/extractor/vevo.py | 6 | ||||
| -rw-r--r-- | youtube_dl/extractor/videofyme.py | 40 | ||||
| -rw-r--r-- | youtube_dl/utils.py | 46 | ||||
| -rw-r--r-- | youtube_dl/version.py | 2 | 
19 files changed, 370 insertions, 144 deletions
| diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 03561b87d..805af14a0 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -93,6 +93,7 @@   - **Clipsyndicate**   - **Cloudy**   - **Clubic** + - **Clyp**   - **cmt.com**   - **CNET**   - **CNN** @@ -281,7 +282,7 @@   - **macgamestore**: MacGameStore trailers   - **mailru**: Видео@Mail.Ru   - **Malemotion** - - **MDR** + - **MDR**: MDR.DE and KiKA   - **media.ccc.de**   - **metacafe**   - **Metacritic** diff --git a/test/test_compat.py b/test/test_compat.py index 4ee0dc99d..b6bfad05e 100644 --- a/test/test_compat.py +++ b/test/test_compat.py @@ -13,8 +13,10 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))  from youtube_dl.utils import get_filesystem_encoding  from youtube_dl.compat import (      compat_getenv, +    compat_etree_fromstring,      compat_expanduser,      compat_shlex_split, +    compat_str,      compat_urllib_parse_unquote,      compat_urllib_parse_unquote_plus,  ) @@ -71,5 +73,20 @@ class TestCompat(unittest.TestCase):      def test_compat_shlex_split(self):          self.assertEqual(compat_shlex_split('-option "one two"'), ['-option', 'one two']) +    def test_compat_etree_fromstring(self): +        xml = ''' +            <root foo="bar" spam="中文"> +                <normal>foo</normal> +                <chinese>中文</chinese> +                <foo><bar>spam</bar></foo> +            </root> +        ''' +        doc = compat_etree_fromstring(xml.encode('utf-8')) +        self.assertTrue(isinstance(doc.attrib['foo'], compat_str)) +        self.assertTrue(isinstance(doc.attrib['spam'], compat_str)) +        self.assertTrue(isinstance(doc.find('normal').text, compat_str)) +        self.assertTrue(isinstance(doc.find('chinese').text, compat_str)) +        self.assertTrue(isinstance(doc.find('foo/bar').text, compat_str)) +  if __name__ == '__main__':      unittest.main() diff --git a/test/test_utils.py b/test/test_utils.py index 0c34f0e55..01829f71e 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -68,6 +68,9 @@ from youtube_dl.utils import (      cli_valueless_option,      cli_bool_option,  ) +from youtube_dl.compat import ( +    compat_etree_fromstring, +)  class TestUtil(unittest.TestCase): @@ -233,6 +236,7 @@ class TestUtil(unittest.TestCase):              unified_strdate('2/2/2015 6:47:40 PM', day_first=False),              '20150202')          self.assertEqual(unified_strdate('25-09-2014'), '20140925') +        self.assertEqual(unified_strdate('UNKNOWN DATE FORMAT'), None)      def test_find_xpath_attr(self):          testxml = '''<root> @@ -242,7 +246,7 @@ class TestUtil(unittest.TestCase):              <node x="b" y="d" />              <node x="" />          </root>''' -        doc = xml.etree.ElementTree.fromstring(testxml) +        doc = compat_etree_fromstring(testxml)          self.assertEqual(find_xpath_attr(doc, './/fourohfour', 'n'), None)          self.assertEqual(find_xpath_attr(doc, './/fourohfour', 'n', 'v'), None) @@ -263,7 +267,7 @@ class TestUtil(unittest.TestCase):                  <url>http://server.com/download.mp3</url>              </media:song>          </root>''' -        doc = xml.etree.ElementTree.fromstring(testxml) +        doc = compat_etree_fromstring(testxml)          find = lambda p: doc.find(xpath_with_ns(p, {'media': 'http://example.com/'}))          self.assertTrue(find('media:song') is not None)          self.assertEqual(find('media:song/media:author').text, 'The Author') @@ -275,9 +279,16 @@ class TestUtil(unittest.TestCase):          p = xml.etree.ElementTree.SubElement(div, 'p')          p.text = 'Foo'          self.assertEqual(xpath_element(doc, 'div/p'), p) +        self.assertEqual(xpath_element(doc, ['div/p']), p) +        self.assertEqual(xpath_element(doc, ['div/bar', 'div/p']), p)          self.assertEqual(xpath_element(doc, 'div/bar', default='default'), 'default') +        self.assertEqual(xpath_element(doc, ['div/bar'], default='default'), 'default')          self.assertTrue(xpath_element(doc, 'div/bar') is None) +        self.assertTrue(xpath_element(doc, ['div/bar']) is None) +        self.assertTrue(xpath_element(doc, ['div/bar'], 'div/baz') is None)          self.assertRaises(ExtractorError, xpath_element, doc, 'div/bar', fatal=True) +        self.assertRaises(ExtractorError, xpath_element, doc, ['div/bar'], fatal=True) +        self.assertRaises(ExtractorError, xpath_element, doc, ['div/bar', 'div/baz'], fatal=True)      def test_xpath_text(self):          testxml = '''<root> @@ -285,7 +296,7 @@ class TestUtil(unittest.TestCase):                  <p>Foo</p>              </div>          </root>''' -        doc = xml.etree.ElementTree.fromstring(testxml) +        doc = compat_etree_fromstring(testxml)          self.assertEqual(xpath_text(doc, 'div/p'), 'Foo')          self.assertEqual(xpath_text(doc, 'div/bar', default='default'), 'default')          self.assertTrue(xpath_text(doc, 'div/bar') is None) @@ -297,7 +308,7 @@ class TestUtil(unittest.TestCase):                  <p x="a">Foo</p>              </div>          </root>''' -        doc = xml.etree.ElementTree.fromstring(testxml) +        doc = compat_etree_fromstring(testxml)          self.assertEqual(xpath_attr(doc, 'div/p', 'x'), 'a')          self.assertEqual(xpath_attr(doc, 'div/bar', 'x'), None)          self.assertEqual(xpath_attr(doc, 'div/p', 'y'), None) diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index d103ab9ad..a3e85264a 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -14,6 +14,7 @@ import socket  import subprocess  import sys  import itertools +import xml.etree.ElementTree  try: @@ -212,6 +213,43 @@ try:  except ImportError:  # Python 2.6      from xml.parsers.expat import ExpatError as compat_xml_parse_error +if sys.version_info[0] >= 3: +    compat_etree_fromstring = xml.etree.ElementTree.fromstring +else: +    # python 2.x tries to encode unicode strings with ascii (see the +    # XMLParser._fixtext method) +    etree = xml.etree.ElementTree + +    try: +        _etree_iter = etree.Element.iter +    except AttributeError:  # Python <=2.6 +        def _etree_iter(root): +            for el in root.findall('*'): +                yield el +                for sub in _etree_iter(el): +                    yield sub + +    # on 2.6 XML doesn't have a parser argument, function copied from CPython +    # 2.7 source +    def _XML(text, parser=None): +        if not parser: +            parser = etree.XMLParser(target=etree.TreeBuilder()) +        parser.feed(text) +        return parser.close() + +    def _element_factory(*args, **kwargs): +        el = etree.Element(*args, **kwargs) +        for k, v in el.items(): +            if isinstance(v, bytes): +                el.set(k, v.decode('utf-8')) +        return el + +    def compat_etree_fromstring(text): +        doc = _XML(text, parser=etree.XMLParser(target=etree.TreeBuilder(element_factory=_element_factory))) +        for el in _etree_iter(doc): +            if el.text is not None and isinstance(el.text, bytes): +                el.text = el.text.decode('utf-8') +        return doc  try:      from urllib.parse import parse_qs as compat_parse_qs @@ -507,6 +545,7 @@ __all__ = [      'compat_chr',      'compat_cookiejar',      'compat_cookies', +    'compat_etree_fromstring',      'compat_expanduser',      'compat_get_terminal_size',      'compat_getenv', diff --git a/youtube_dl/downloader/f4m.py b/youtube_dl/downloader/f4m.py index 7f6143954..6170cc155 100644 --- a/youtube_dl/downloader/f4m.py +++ b/youtube_dl/downloader/f4m.py @@ -5,10 +5,10 @@ import io  import itertools  import os  import time -import xml.etree.ElementTree as etree  from .fragment import FragmentFD  from ..compat import ( +    compat_etree_fromstring,      compat_urlparse,      compat_urllib_error,      compat_urllib_parse_urlparse, @@ -290,7 +290,7 @@ class F4mFD(FragmentFD):          man_url = urlh.geturl()          manifest = urlh.read() -        doc = etree.fromstring(manifest) +        doc = compat_etree_fromstring(manifest)          formats = [(int(f.attrib.get('bitrate', -1)), f)                     for f in self._get_unencrypted_media(doc)]          if requested_bitrate is None: diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 6f465789b..73be6d204 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -14,8 +14,8 @@ from ..utils import (      parse_duration,      unified_strdate,      xpath_text, -    parse_xml,  ) +from ..compat import compat_etree_fromstring  class ARDMediathekIE(InfoExtractor): @@ -161,7 +161,7 @@ class ARDMediathekIE(InfoExtractor):              raise ExtractorError('This program is only suitable for those aged 12 and older. Video %s is therefore only available between 20 pm and 6 am.' % video_id, expected=True)          if re.search(r'[\?&]rss($|[=&])', url): -            doc = parse_xml(webpage) +            doc = compat_etree_fromstring(webpage.encode('utf-8'))              if doc.tag == 'rss':                  return GenericIE()._extract_rss(url, video_id, doc) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 2cdce1eb9..a55a6dbc9 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -2,7 +2,6 @@  from __future__ import unicode_literals  import re -import xml.etree.ElementTree  from .common import InfoExtractor  from ..utils import ( @@ -14,7 +13,10 @@ from ..utils import (      remove_end,      unescapeHTML,  ) -from ..compat import compat_HTTPError +from ..compat import ( +    compat_etree_fromstring, +    compat_HTTPError, +)  class BBCCoUkIE(InfoExtractor): @@ -344,7 +346,7 @@ class BBCCoUkIE(InfoExtractor):                  url, programme_id, 'Downloading media selection XML')          except ExtractorError as ee:              if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403: -                media_selection = xml.etree.ElementTree.fromstring(ee.cause.read().decode('utf-8')) +                media_selection = compat_etree_fromstring(ee.cause.read().decode('utf-8'))              else:                  raise          return self._process_media_selector(media_selection, programme_id) diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index ecc17ebeb..6c66a1236 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -4,9 +4,11 @@ from __future__ import unicode_literals  import re  import itertools  import json -import xml.etree.ElementTree as ET  from .common import InfoExtractor +from ..compat import ( +    compat_etree_fromstring, +)  from ..utils import (      int_or_none,      unified_strdate, @@ -88,7 +90,7 @@ class BiliBiliIE(InfoExtractor):          except ValueError:              pass -        lq_doc = ET.fromstring(lq_page) +        lq_doc = compat_etree_fromstring(lq_page)          lq_durls = lq_doc.findall('./durl')          hq_doc = self._download_xml( diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 4721c2293..1686cdde1 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -3,10 +3,10 @@ from __future__ import unicode_literals  import re  import json -import xml.etree.ElementTree  from .common import InfoExtractor  from ..compat import ( +    compat_etree_fromstring,      compat_parse_qs,      compat_str,      compat_urllib_parse, @@ -119,7 +119,7 @@ class BrightcoveIE(InfoExtractor):          object_str = fix_xml_ampersands(object_str)          try: -            object_doc = xml.etree.ElementTree.fromstring(object_str.encode('utf-8')) +            object_doc = compat_etree_fromstring(object_str.encode('utf-8'))          except compat_xml_parse_error:              return diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 10c0d5d1f..5e263f8b5 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -10,7 +10,6 @@ import re  import socket  import sys  import time -import xml.etree.ElementTree  from ..compat import (      compat_cookiejar, @@ -23,6 +22,7 @@ from ..compat import (      compat_urllib_request,      compat_urlparse,      compat_str, +    compat_etree_fromstring,  )  from ..utils import (      NO_DEFAULT, @@ -310,11 +310,11 @@ class InfoExtractor(object):      @classmethod      def ie_key(cls):          """A string for getting the InfoExtractor with get_info_extractor""" -        return cls.__name__[:-2] +        return compat_str(cls.__name__[:-2])      @property      def IE_NAME(self): -        return type(self).__name__[:-2] +        return compat_str(type(self).__name__[:-2])      def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):          """ Returns the response handle """ @@ -461,7 +461,7 @@ class InfoExtractor(object):              return xml_string          if transform_source:              xml_string = transform_source(xml_string) -        return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8')) +        return compat_etree_fromstring(xml_string.encode('utf-8'))      def _download_json(self, url_or_request, video_id,                         note='Downloading JSON metadata', @@ -943,13 +943,14 @@ class InfoExtractor(object):              if re.match(r'^https?://', u)              else compat_urlparse.urljoin(m3u8_url, u)) -        m3u8_doc, urlh = self._download_webpage_handle( +        res = self._download_webpage_handle(              m3u8_url, video_id,              note=note or 'Downloading m3u8 information',              errnote=errnote or 'Failed to download m3u8 information',              fatal=fatal) -        if m3u8_doc is False: -            return m3u8_doc +        if res is False: +            return res +        m3u8_doc, urlh = res          m3u8_url = urlh.geturl()          last_info = None          last_media = None diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index f8ce10111..0c9b8ca02 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -5,12 +5,12 @@ import re  import json  import base64  import zlib -import xml.etree.ElementTree  from hashlib import sha1  from math import pow, sqrt, floor  from .common import InfoExtractor  from ..compat import ( +    compat_etree_fromstring,      compat_urllib_parse,      compat_urllib_parse_unquote,      compat_urllib_request, @@ -234,7 +234,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text          return output      def _extract_subtitles(self, subtitle): -        sub_root = xml.etree.ElementTree.fromstring(subtitle) +        sub_root = compat_etree_fromstring(subtitle)          return [{              'ext': 'srt',              'data': self._convert_subtitles_to_srt(sub_root), diff --git a/youtube_dl/extractor/eitb.py b/youtube_dl/extractor/eitb.py index 2cba82532..357a2196c 100644 --- a/youtube_dl/extractor/eitb.py +++ b/youtube_dl/extractor/eitb.py @@ -1,39 +1,92 @@  # encoding: utf-8  from __future__ import unicode_literals -import re -  from .common import InfoExtractor -from .brightcove import BrightcoveIE -from ..utils import ExtractorError +from ..compat import compat_urllib_request +from ..utils import ( +    float_or_none, +    int_or_none, +    parse_iso8601, +)  class EitbIE(InfoExtractor):      IE_NAME = 'eitb.tv' -    _VALID_URL = r'https?://www\.eitb\.tv/(eu/bideoa|es/video)/[^/]+/(?P<playlist_id>\d+)/(?P<chapter_id>\d+)' +    _VALID_URL = r'https?://(?:www\.)?eitb\.tv/(?:eu/bideoa|es/video)/[^/]+/\d+/(?P<id>\d+)'      _TEST = { -        'add_ie': ['Brightcove'], -        'url': 'http://www.eitb.tv/es/video/60-minutos-60-minutos-2013-2014/2677100210001/2743577154001/lasa-y-zabala-30-anos/', +        'url': 'http://www.eitb.tv/es/video/60-minutos-60-minutos-2013-2014/4104995148001/4090227752001/lasa-y-zabala-30-anos/',          'md5': 'edf4436247185adee3ea18ce64c47998',          'info_dict': { -            'id': '2743577154001', +            'id': '4090227752001',              'ext': 'mp4',              'title': '60 minutos (Lasa y Zabala, 30 años)', -            # All videos from eitb has this description in the brightcove info -            'description': '.', -            'uploader': 'Euskal Telebista', +            'description': 'Programa de reportajes de actualidad.', +            'duration': 3996.76, +            'timestamp': 1381789200, +            'upload_date': '20131014', +            'tags': list,          },      }      def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        chapter_id = mobj.group('chapter_id') -        webpage = self._download_webpage(url, chapter_id) -        bc_url = BrightcoveIE._extract_brightcove_url(webpage) -        if bc_url is None: -            raise ExtractorError('Could not extract the Brightcove url') -        # The BrightcoveExperience object doesn't contain the video id, we set -        # it manually -        bc_url += '&%40videoPlayer={0}'.format(chapter_id) -        return self.url_result(bc_url, BrightcoveIE.ie_key()) +        video_id = self._match_id(url) + +        video = self._download_json( +            'http://mam.eitb.eus/mam/REST/ServiceMultiweb/Video/MULTIWEBTV/%s/' % video_id, +            video_id, 'Downloading video JSON') + +        media = video['web_media'][0] + +        formats = [] +        for rendition in media['RENDITIONS']: +            video_url = rendition.get('PMD_URL') +            if not video_url: +                continue +            tbr = float_or_none(rendition.get('ENCODING_RATE'), 1000) +            format_id = 'http' +            if tbr: +                format_id += '-%d' % int(tbr) +            formats.append({ +                'url': rendition['PMD_URL'], +                'format_id': format_id, +                'width': int_or_none(rendition.get('FRAME_WIDTH')), +                'height': int_or_none(rendition.get('FRAME_HEIGHT')), +                'tbr': tbr, +            }) + +        hls_url = media.get('HLS_SURL') +        if hls_url: +            request = compat_urllib_request.Request( +                'http://mam.eitb.eus/mam/REST/ServiceMultiweb/DomainRestrictedSecurity/TokenAuth/', +                headers={'Referer': url}) +            token_data = self._download_json( +                request, video_id, 'Downloading auth token', fatal=False) +            if token_data: +                token = token_data.get('token') +                if token: +                    m3u8_formats = self._extract_m3u8_formats( +                        '%s?hdnts=%s' % (hls_url, token), video_id, m3u8_id='hls', fatal=False) +                    if m3u8_formats: +                        formats.extend(m3u8_formats) + +        hds_url = media.get('HDS_SURL') +        if hds_url: +            f4m_formats = self._extract_f4m_formats( +                '%s?hdcore=3.7.0' % hds_url.replace('euskalsvod', 'euskalvod'), +                video_id, f4m_id='hds', fatal=False) +            if f4m_formats: +                formats.extend(f4m_formats) + +        self._sort_formats(formats) + +        return { +            'id': video_id, +            'title': media.get('NAME_ES') or media.get('name') or media['NAME_EU'], +            'description': media.get('SHORT_DESC_ES') or video.get('desc_group') or media.get('SHORT_DESC_EU'), +            'thumbnail': media.get('STILL_URL') or media.get('THUMBNAIL_URL'), +            'duration': float_or_none(media.get('LENGTH'), 1000), +            'timestamp': parse_iso8601(media.get('BROADCST_DATE'), ' '), +            'tags': media.get('TAGS'), +            'formats': formats, +        } diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index ca5fbafb2..ee5419f51 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -9,6 +9,7 @@ import sys  from .common import InfoExtractor  from .youtube import YoutubeIE  from ..compat import ( +    compat_etree_fromstring,      compat_urllib_parse_unquote,      compat_urllib_request,      compat_urlparse, @@ -21,7 +22,6 @@ from ..utils import (      HEADRequest,      is_html,      orderedSet, -    parse_xml,      smuggle_url,      unescapeHTML,      unified_strdate, @@ -141,6 +141,7 @@ class GenericIE(InfoExtractor):                  'ext': 'mp4',                  'title': 'Automatics, robotics and biocybernetics',                  'description': 'md5:815fc1deb6b3a2bff99de2d5325be482', +                'upload_date': '20130627',                  'formats': 'mincount:16',                  'subtitles': 'mincount:1',              }, @@ -1237,7 +1238,7 @@ class GenericIE(InfoExtractor):          # Is it an RSS feed, a SMIL file or a XSPF playlist?          try: -            doc = parse_xml(webpage) +            doc = compat_etree_fromstring(webpage.encode('utf-8'))              if doc.tag == 'rss':                  return self._extract_rss(url, video_id, doc)              elif re.match(r'^(?:{[^}]+})?smil$', doc.tag): diff --git a/youtube_dl/extractor/mdr.py b/youtube_dl/extractor/mdr.py index fc7499958..88334889e 100644 --- a/youtube_dl/extractor/mdr.py +++ b/youtube_dl/extractor/mdr.py @@ -1,64 +1,169 @@ +# coding: utf-8  from __future__ import unicode_literals -import re -  from .common import InfoExtractor +from ..compat import compat_urlparse +from ..utils import ( +    determine_ext, +    int_or_none, +    parse_duration, +    parse_iso8601, +    xpath_text, +)  class MDRIE(InfoExtractor): -    _VALID_URL = r'^(?P<domain>https?://(?:www\.)?mdr\.de)/(?:.*)/(?P<type>video|audio)(?P<video_id>[^/_]+)(?:_|\.html)' +    IE_DESC = 'MDR.DE and KiKA' +    _VALID_URL = r'https?://(?:www\.)?(?:mdr|kika)\.de/(?:.*)/[a-z]+(?P<id>\d+)(?:_.+?)?\.html' -    # No tests, MDR regularily deletes its videos -    _TEST = { +    _TESTS = [{ +        # MDR regularily deletes its videos          'url': 'http://www.mdr.de/fakt/video189002.html',          'only_matching': True, -    } +    }, { +        # audio +        'url': 'http://www.mdr.de/kultur/audio1312272_zc-15948bad_zs-86171fdd.html', +        'md5': '64c4ee50f0a791deb9479cd7bbe9d2fa', +        'info_dict': { +            'id': '1312272', +            'ext': 'mp3', +            'title': 'Feuilleton vom 30. Oktober 2015', +            'duration': 250, +            'uploader': 'MITTELDEUTSCHER RUNDFUNK', +        }, +    }, { +        'url': 'http://www.kika.de/baumhaus/videos/video19636.html', +        'md5': '4930515e36b06c111213e80d1e4aad0e', +        'info_dict': { +            'id': '19636', +            'ext': 'mp4', +            'title': 'Baumhaus vom 30. Oktober 2015', +            'duration': 134, +            'uploader': 'KIKA', +        }, +    }, { +        'url': 'http://www.kika.de/sendungen/einzelsendungen/weihnachtsprogramm/videos/video8182.html', +        'md5': '5fe9c4dd7d71e3b238f04b8fdd588357', +        'info_dict': { +            'id': '8182', +            'ext': 'mp4', +            'title': 'Beutolomäus und der geheime Weihnachtswunsch', +            'description': 'md5:b69d32d7b2c55cbe86945ab309d39bbd', +            'timestamp': 1419047100, +            'upload_date': '20141220', +            'duration': 4628, +            'uploader': 'KIKA', +        }, +    }, { +        'url': 'http://www.kika.de/baumhaus/sendungen/video19636_zc-fea7f8a0_zs-4bf89c60.html', +        'only_matching': True, +    }, { +        'url': 'http://www.kika.de/sendungen/einzelsendungen/weihnachtsprogramm/einzelsendung2534.html', +        'only_matching': True, +    }]      def _real_extract(self, url): -        m = re.match(self._VALID_URL, url) -        video_id = m.group('video_id') -        domain = m.group('domain') +        video_id = self._match_id(url) + +        webpage = self._download_webpage(url, video_id) + +        data_url = self._search_regex( +            r'dataURL\s*:\s*(["\'])(?P<url>/.+/(?:video|audio)[0-9]+-avCustom\.xml)\1', +            webpage, 'data url', group='url') -        # determine title and media streams from webpage -        html = self._download_webpage(url, video_id) +        doc = self._download_xml( +            compat_urlparse.urljoin(url, data_url), video_id) -        title = self._html_search_regex(r'<h[12]>(.*?)</h[12]>', html, 'title') -        xmlurl = self._search_regex( -            r'dataURL:\'(/(?:.+)/(?:video|audio)[0-9]+-avCustom.xml)', html, 'XML URL') +        title = xpath_text(doc, ['./title', './broadcast/broadcastName'], 'title', fatal=True) -        doc = self._download_xml(domain + xmlurl, video_id)          formats = [] -        for a in doc.findall('./assets/asset'): -            url_el = a.find('./progressiveDownloadUrl') -            if url_el is None: -                continue -            abr = int(a.find('bitrateAudio').text) // 1000 -            media_type = a.find('mediaType').text -            format = { -                'abr': abr, -                'filesize': int(a.find('fileSize').text), -                'url': url_el.text, -            } - -            vbr_el = a.find('bitrateVideo') -            if vbr_el is None: -                format.update({ -                    'vcodec': 'none', -                    'format_id': '%s-%d' % (media_type, abr), -                }) -            else: -                vbr = int(vbr_el.text) // 1000 -                format.update({ -                    'vbr': vbr, -                    'width': int(a.find('frameWidth').text), -                    'height': int(a.find('frameHeight').text), -                    'format_id': '%s-%d' % (media_type, vbr), -                }) -            formats.append(format) +        processed_urls = [] +        for asset in doc.findall('./assets/asset'): +            for source in ( +                    'progressiveDownload', +                    'dynamicHttpStreamingRedirector', +                    'adaptiveHttpStreamingRedirector'): +                url_el = asset.find('./%sUrl' % source) +                if url_el is None: +                    continue + +                video_url = url_el.text +                if video_url in processed_urls: +                    continue + +                processed_urls.append(video_url) + +                vbr = int_or_none(xpath_text(asset, './bitrateVideo', 'vbr'), 1000) +                abr = int_or_none(xpath_text(asset, './bitrateAudio', 'abr'), 1000) + +                ext = determine_ext(url_el.text) +                if ext == 'm3u8': +                    url_formats = self._extract_m3u8_formats( +                        video_url, video_id, 'mp4', entry_protocol='m3u8_native', +                        preference=0, m3u8_id='HLS', fatal=False) +                elif ext == 'f4m': +                    url_formats = self._extract_f4m_formats( +                        video_url + '?hdcore=3.7.0&plugin=aasp-3.7.0.39.44', video_id, +                        preference=0, f4m_id='HDS', fatal=False) +                else: +                    media_type = xpath_text(asset, './mediaType', 'media type', default='MP4') +                    vbr = int_or_none(xpath_text(asset, './bitrateVideo', 'vbr'), 1000) +                    abr = int_or_none(xpath_text(asset, './bitrateAudio', 'abr'), 1000) +                    filesize = int_or_none(xpath_text(asset, './fileSize', 'file size')) + +                    f = { +                        'url': video_url, +                        'format_id': '%s-%d' % (media_type, vbr or abr), +                        'filesize': filesize, +                        'abr': abr, +                        'preference': 1, +                    } + +                    if vbr: +                        width = int_or_none(xpath_text(asset, './frameWidth', 'width')) +                        height = int_or_none(xpath_text(asset, './frameHeight', 'height')) +                        f.update({ +                            'vbr': vbr, +                            'width': width, +                            'height': height, +                        }) + +                    url_formats = [f] + +                if not url_formats: +                    continue + +                if not vbr: +                    for f in url_formats: +                        abr = f.get('tbr') or abr +                        if 'tbr' in f: +                            del f['tbr'] +                        f.update({ +                            'abr': abr, +                            'vcodec': 'none', +                        }) + +                formats.extend(url_formats) +          self._sort_formats(formats) +        description = xpath_text(doc, './broadcast/broadcastDescription', 'description') +        timestamp = parse_iso8601( +            xpath_text( +                doc, [ +                    './broadcast/broadcastDate', +                    './broadcast/broadcastStartDate', +                    './broadcast/broadcastEndDate'], +                'timestamp', default=None)) +        duration = parse_duration(xpath_text(doc, './duration', 'duration')) +        uploader = xpath_text(doc, './rights', 'uploader') +          return {              'id': video_id,              'title': title, +            'description': description, +            'timestamp': timestamp, +            'duration': duration, +            'uploader': uploader,              'formats': formats,          } diff --git a/youtube_dl/extractor/mitele.py b/youtube_dl/extractor/mitele.py index 3142fcde2..c595f2077 100644 --- a/youtube_dl/extractor/mitele.py +++ b/youtube_dl/extractor/mitele.py @@ -1,7 +1,10 @@  from __future__ import unicode_literals  from .common import InfoExtractor -from ..compat import compat_urllib_parse +from ..compat import ( +    compat_urllib_parse, +    compat_urlparse, +)  from ..utils import (      encode_dict,      get_element_by_attribute, @@ -15,7 +18,7 @@ class MiTeleIE(InfoExtractor):      _TESTS = [{          'url': 'http://www.mitele.es/programas-tv/diario-de/la-redaccion/programa-144/', -        'md5': '757b0b66cbd7e0a97226d7d3156cb3e9', +        'md5': '0ff1a13aebb35d9bc14081ff633dd324',          'info_dict': {              'id': '0NF1jJnxS1Wu3pHrmvFyw2',              'display_id': 'programa-144', @@ -34,6 +37,7 @@ class MiTeleIE(InfoExtractor):          config_url = self._search_regex(              r'data-config\s*=\s*"([^"]+)"', webpage, 'data config url') +        config_url = compat_urlparse.urljoin(url, config_url)          config = self._download_json(              config_url, display_id, 'Downloading config JSON') diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py index c17094f81..4c0de354f 100644 --- a/youtube_dl/extractor/vevo.py +++ b/youtube_dl/extractor/vevo.py @@ -1,10 +1,10 @@  from __future__ import unicode_literals  import re -import xml.etree.ElementTree  from .common import InfoExtractor  from ..compat import ( +    compat_etree_fromstring,      compat_urllib_request,  )  from ..utils import ( @@ -97,7 +97,7 @@ class VevoIE(InfoExtractor):          if last_version['version'] == -1:              raise ExtractorError('Unable to extract last version of the video') -        renditions = xml.etree.ElementTree.fromstring(last_version['data']) +        renditions = compat_etree_fromstring(last_version['data'])          formats = []          # Already sorted from worst to best quality          for rend in renditions.findall('rendition'): @@ -114,7 +114,7 @@ class VevoIE(InfoExtractor):      def _formats_from_smil(self, smil_xml):          formats = [] -        smil_doc = xml.etree.ElementTree.fromstring(smil_xml.encode('utf-8')) +        smil_doc = compat_etree_fromstring(smil_xml.encode('utf-8'))          els = smil_doc.findall('.//{http://www.w3.org/2001/SMIL20/Language}video')          for el in els:              src = el.attrib['src'] diff --git a/youtube_dl/extractor/videofyme.py b/youtube_dl/extractor/videofyme.py index 94f9e9be9..cd3f50a63 100644 --- a/youtube_dl/extractor/videofyme.py +++ b/youtube_dl/extractor/videofyme.py @@ -2,8 +2,8 @@ from __future__ import unicode_literals  from .common import InfoExtractor  from ..utils import ( -    find_xpath_attr,      int_or_none, +    parse_iso8601,  ) @@ -18,33 +18,35 @@ class VideofyMeIE(InfoExtractor):              'id': '1100701',              'ext': 'mp4',              'title': 'This is VideofyMe', -            'description': None, +            'description': '', +            'upload_date': '20130326', +            'timestamp': 1364288959,              'uploader': 'VideofyMe',              'uploader_id': 'thisisvideofyme',              'view_count': int, +            'likes': int, +            'comment_count': int,          }, -      }      def _real_extract(self, url):          video_id = self._match_id(url) -        config = self._download_xml('http://sunshine.videofy.me/?videoId=%s' % video_id, -                                    video_id) -        video = config.find('video') -        sources = video.find('sources') -        url_node = next(node for node in [find_xpath_attr(sources, 'source', 'id', 'HQ %s' % key) -                                          for key in ['on', 'av', 'off']] if node is not None) -        video_url = url_node.find('url').text -        view_count = int_or_none(self._search_regex( -            r'([0-9]+)', video.find('views').text, 'view count', fatal=False)) + +        config = self._download_json('http://vf-player-info-loader.herokuapp.com/%s.json' % video_id, video_id)['videoinfo'] + +        video = config.get('video') +        blog = config.get('blog', {})          return {              'id': video_id, -            'title': video.find('title').text, -            'url': video_url, -            'thumbnail': video.find('thumb').text, -            'description': video.find('description').text, -            'uploader': config.find('blog/name').text, -            'uploader_id': video.find('identifier').text, -            'view_count': view_count, +            'title': video['title'], +            'url': video['sources']['source']['url'], +            'thumbnail': video.get('thumb'), +            'description': video.get('description'), +            'timestamp': parse_iso8601(video.get('date')), +            'uploader': blog.get('name'), +            'uploader_id': blog.get('identifier'), +            'view_count': int_or_none(self._search_regex(r'([0-9]+)', video.get('views'), 'view count', fatal=False)), +            'likes': int_or_none(video.get('likes')), +            'comment_count': int_or_none(video.get('nrOfComments')),          } diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 558c9c7d5..d39f313a4 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -36,6 +36,7 @@ import zlib  from .compat import (      compat_basestring,      compat_chr, +    compat_etree_fromstring,      compat_html_entities,      compat_http_client,      compat_kwargs, @@ -178,10 +179,19 @@ def xpath_with_ns(path, ns_map):  def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT): -    if sys.version_info < (2, 7):  # Crazy 2.6 -        xpath = xpath.encode('ascii') +    def _find_xpath(xpath): +        if sys.version_info < (2, 7):  # Crazy 2.6 +            xpath = xpath.encode('ascii') +        return node.find(xpath) + +    if isinstance(xpath, (str, compat_str)): +        n = _find_xpath(xpath) +    else: +        for xp in xpath: +            n = _find_xpath(xp) +            if n is not None: +                break -    n = node.find(xpath)      if n is None:          if default is not NO_DEFAULT:              return default @@ -356,7 +366,7 @@ def sanitize_path(s):      if drive_or_unc:          norm_path.pop(0)      sanitized_path = [ -        path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|\.$)', '#', path_part) +        path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', path_part)          for path_part in norm_path]      if drive_or_unc:          sanitized_path.insert(0, drive_or_unc + os.path.sep) @@ -901,7 +911,8 @@ def unified_strdate(date_str, day_first=True):          timetuple = email.utils.parsedate_tz(date_str)          if timetuple:              upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d') -    return upload_date +    if upload_date is not None: +        return compat_str(upload_date)  def determine_ext(url, default_ext='unknown_video'): @@ -1656,29 +1667,6 @@ def encode_dict(d, encoding='utf-8'):      return dict((k.encode(encoding), v.encode(encoding)) for k, v in d.items()) -try: -    etree_iter = xml.etree.ElementTree.Element.iter -except AttributeError:  # Python <=2.6 -    etree_iter = lambda n: n.findall('.//*') - - -def parse_xml(s): -    class TreeBuilder(xml.etree.ElementTree.TreeBuilder): -        def doctype(self, name, pubid, system): -            pass  # Ignore doctypes - -    parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder()) -    kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {} -    tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs) -    # Fix up XML parser in Python 2.x -    if sys.version_info < (3, 0): -        for n in etree_iter(tree): -            if n.text is not None: -                if not isinstance(n.text, compat_str): -                    n.text = n.text.decode('utf-8') -    return tree - -  US_RATINGS = {      'G': 0,      'PG': 10, @@ -1979,7 +1967,7 @@ def dfxp2srt(dfxp_data):          return out -    dfxp = xml.etree.ElementTree.fromstring(dfxp_data.encode('utf-8')) +    dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))      out = []      paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p') diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 125e8ccf5..6ef482b78 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@  from __future__ import unicode_literals -__version__ = '2015.10.24' +__version__ = '2015.11.02' | 
