diff options
| author | Random User <rndusr@posteo.de> | 2017-03-25 21:36:59 +0100 | 
|---|---|---|
| committer | Random User <rndusr@posteo.de> | 2017-03-25 21:36:59 +0100 | 
| commit | 4f06c1c9fcbfbc74b81b5fa89a616914b5ce5aad (patch) | |
| tree | a51b702e001d350b908780a119f76d8ea706d511 | |
| parent | c73e330e7adc9c0c15ac51aeea8fbb7dad95351a (diff) | |
| parent | 942b44a0525f677924c660bcb00902d705d91fc2 (diff) | |
Merge branch 'master' of github.com-rndusr:rg3/youtube-dl into fix/str-item-assignment
75 files changed, 2456 insertions, 951 deletions
| diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 6374f7c25..dfff41d2d 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@  --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.02.27*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.02.27** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.03.24*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.03.24**  ### Before submitting an *issue* make sure you have:  - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ $ youtube-dl -v <your command line>  [debug] User config: []  [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj']  [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2017.02.27 +[debug] youtube-dl version 2017.03.24  [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2  [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4  [debug] Proxy map: {} @@ -202,3 +202,11 @@ Fabian Stahl  Bagira  Odd Stråbø  Philip Herzog +Thomas Christlieb +Marek Rusinowski +Tobias Gruetzmacher +Olivier Bilodeau +Lars Vierbergen +Juanjo Benages +Xiao Di Guan +Thomas Winant @@ -1,7 +1,149 @@  version <unreleased>  Extractors -+ [daisuki] Add new extractor (#2486, #3186, #4738, #6175, #7776, #10060) +* [afreecatv] Fix extraction (#12179) + + +version 2017.03.24 + +Extractors +- [9c9media] Remove mp4 URL extraction request ++ [bellmedia] Add support for etalk.ca and space.ca (#12447) +* [channel9] Fix extraction (#11323) +* [cloudy] Fix extraction (#12525) ++ [hbo] Add support for free episode URLs and new formats extraction (#12519) +* [condenast] Fix extraction and style (#12526) +* [viu] Relax URL regular expression (#12529) + + +version 2017.03.22 + +Extractors +- [pluralsight] Omit module title from video title (#12506) +* [pornhub] Decode obfuscated video URL (#12470, #12515) +* [senateisvp] Allow https URL scheme for embeds (#12512) + + +version 2017.03.20 + +Core ++ [YoutubeDL] Allow multiple input URLs to be used with stdout (-) as +  output template ++ [adobepass] Detect and output error on authz token extraction (#12472) + +Extractors ++ [bostonglobe] Add extractor for bostonglobe.com (#12099) ++ [toongoggles] Add support for toongoggles.com (#12171) ++ [medialaan] Add support for Medialaan sites (#9974, #11912) ++ [discoverynetworks] Add support for more domains and bypass geo restiction +* [openload] Fix extraction (#10408) + + +version 2017.03.16 + +Core ++ [postprocessor/ffmpeg] Add support for flac ++ [extractor/common] Extract SMIL formats from jwplayer + +Extractors ++ [generic] Add forgotten return for jwplayer formats +* [redbulltv] Improve extraction + + +version 2017.03.15 + +Core +* Fix missing subtitles if --add-metadata is used (#12423) + +Extractors +* [facebook] Make title optional (#12443) ++ [mitele] Add support for ooyala videos (#12430) +* [openload] Fix extraction (#12435, #12446) +* [streamable] Update API URL (#12433) ++ [crunchyroll] Extract season name (#12428) +* [discoverygo] Bypass geo restriction ++ [discoverygo:playlist] Add support for playlists (#12424) + + +version 2017.03.10 + +Extractors +* [generic] Make title optional for jwplayer embeds (#12410) +* [wdr:maus] Fix extraction (#12373) +* [prosiebensat1] Improve title extraction (#12318, #12327) +* [dplayit] Separate and rewrite extractor and bypass geo restriction (#12393) +* [miomio] Fix extraction (#12291, #12388, #12402) +* [telequebec] Fix description extraction (#12399) +* [openload] Fix extraction (#12357) +* [brightcove:legacy] Relax videoPlayer validation check (#12381) + + +version 2017.03.07 + +Core +* Metadata are now added after conversion (#5594) + +Extractors +* [soundcloud] Update client id (#12376) +* [openload] Fix extraction (#10408, #12357) + + +version 2017.03.06 + +Core ++ [utils] Process bytestrings in urljoin (#12369) +* [extractor/common] Improve height extraction and extract bitrate +* [extractor/common] Move jwplayer formats extraction in separate method ++ [external:ffmpeg] Limit test download size to 10KiB (#12362) + +Extractors ++ [drtv] Add geo countries to GeoRestrictedError ++ [drtv:live] Bypass geo restriction ++ [tunepk] Add extractor (#12197, #12243) + + +version 2017.03.05 + +Extractors ++ [twitch] Add basic support for two-factor authentication (#11974) ++ [vier] Add support for vijf.be (#12304) ++ [redbulltv] Add support for redbull.tv (#3919, #11948) +* [douyutv] Switch to the PC API to escape the 5-min limitation (#12316) ++ [generic] Add support for rutube embeds ++ [rutube] Relax URL regular expression ++ [vrak] Add support for vrak.tv (#11452) ++ [brightcove:new] Add ability to smuggle geo_countries into URL ++ [brightcove:new] Raise GeoRestrictedError +* [go] Relax URL regular expression (#12341) +* [24video] Use original host for requests (#12339) +* [ruutu] Disable DASH formats (#12322) + + +version 2017.03.02 + +Core ++ [adobepass] Add support for Charter Spectrum (#11465) +* [YoutubeDL] Don't sanitize identifiers in output template (#12317) + +Extractors +* [facebook] Fix extraction (#12323, #12330) +* [youtube] Mark errors about rental videos as expected (#12324) ++ [npo] Add support for audio +* [npo] Adapt to app.php API (#12311, #12320) + + +version 2017.02.28 + +Core ++ [utils] Add bytes_to_long and long_to_bytes ++ [utils] Add pkcs1pad ++ [aes] Add aes_cbc_encrypt + +Extractors ++ [azmedien:showplaylist] Add support for show playlists (#12160) ++ [youtube:playlist] Recognize another playlist pattern (#11928, #12286) ++ [daisuki] Add support for daisuki.net (#2486, #3186, #4738, #6175, #7776, +  #10060)  * [douyu] Fix extraction (#12301) @@ -375,8 +375,9 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo                                       (requires ffmpeg or avconv and ffprobe or                                       avprobe)      --audio-format FORMAT            Specify audio format: "best", "aac", -                                     "vorbis", "mp3", "m4a", "opus", or "wav"; -                                     "best" by default; No effect without -x +                                     "flac", "mp3", "m4a", "opus", "vorbis", or +                                     "wav"; "best" by default; No effect without +                                     -x      --audio-quality QUALITY          Specify ffmpeg/avconv audio quality, insert                                       a value between 0 (better) and 9 (worse)                                       for VBR or a specific bitrate like 128K diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 1b01c6d9d..7c99ba3c2 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -78,6 +78,7 @@   - **awaan:video**   - **AZMedien**: AZ Medien videos   - **AZMedienPlaylist**: AZ Medien playlists + - **AZMedienShowPlaylist**: AZ Medien show playlists   - **Azubu**   - **AzubuLive**   - **BaiduVideo**: 百度视频 @@ -107,6 +108,7 @@   - **blinkx**   - **Bloomberg**   - **BokeCC** + - **BostonGlobe**   - **Bpb**: Bundeszentrale für politische Bildung   - **BR**: Bayerischer Rundfunk Mediathek   - **BravoTV** @@ -191,6 +193,8 @@   - **dailymotion:playlist**   - **dailymotion:user**   - **DailymotionCloud** + - **Daisuki** + - **DaisukiPlaylist**   - **daum.net**   - **daum.net:clip**   - **daum.net:playlist** @@ -205,10 +209,13 @@   - **Digiteka**   - **Discovery**   - **DiscoveryGo** + - **DiscoveryGoPlaylist** + - **DiscoveryNetworksDe**   - **Disney**   - **Dotsub**   - **DouyuTV**: 斗鱼   - **DPlay** + - **DPlayIt**   - **dramafever**   - **dramafever:series**   - **DRBonanza** @@ -305,8 +312,8 @@   - **GPUTechConf**   - **Groupon**   - **Hark** - - **HBO** - - **HBOEpisode** + - **hbo** + - **hbo:episode**   - **HearThisAt**   - **Heise**   - **HellPorno** @@ -420,6 +427,7 @@   - **MatchTV**   - **MDR**: MDR.DE and KiKA   - **media.ccc.de** + - **Medialaan**   - **Meipai**: 美拍   - **MelonVOD**   - **META** @@ -623,6 +631,7 @@   - **RaiTV**   - **RBMARadio**   - **RDS**: RDS.ca + - **RedBullTV**   - **RedTube**   - **RegioTV**   - **RENTV** @@ -771,12 +780,12 @@   - **ThisAV**   - **ThisOldHouse**   - **tinypic**: tinypic.com videos - - **tlc.de**   - **TMZ**   - **TMZArticle**   - **TNAFlix**   - **TNAFlixNetworkEmbed**   - **toggle** + - **ToonGoggles**   - **Tosh**: Tosh.0   - **tou.tv**   - **Toypics**: Toypics user profile @@ -794,6 +803,7 @@   - **tunein:program**   - **tunein:station**   - **tunein:topic** + - **TunePk**   - **Turbo**   - **Tutv**   - **tv.dfb.de** @@ -913,6 +923,7 @@   - **VoxMedia**   - **Vporn**   - **vpro**: npo.nl and ntr.nl + - **Vrak**   - **VRT**   - **vube**: Vube.com   - **VuClip** diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index 437c7270e..881197afb 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -8,7 +8,7 @@ import sys  import unittest  sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from test.helper import FakeYDL +from test.helper import FakeYDL, expect_dict  from youtube_dl.extractor.common import InfoExtractor  from youtube_dl.extractor import YoutubeIE, get_info_extractor  from youtube_dl.utils import encode_data_uri, strip_jsonp, ExtractorError, RegexNotFoundError @@ -84,6 +84,97 @@ class TestInfoExtractor(unittest.TestCase):          self.assertRaises(ExtractorError, self.ie._download_json, uri, None)          self.assertEqual(self.ie._download_json(uri, None, fatal=False), None) +    def test_extract_jwplayer_data_realworld(self): +        # from http://www.suffolk.edu/sjc/ +        expect_dict( +            self, +            self.ie._extract_jwplayer_data(r''' +                <script type='text/javascript'> +                    jwplayer('my-video').setup({ +                        file: 'rtmp://192.138.214.154/live/sjclive', +                        fallback: 'true', +                        width: '95%', +                      aspectratio: '16:9', +                      primary: 'flash', +                      mediaid:'XEgvuql4' +                    }); +                </script> +                ''', None, require_title=False), +            { +                'id': 'XEgvuql4', +                'formats': [{ +                    'url': 'rtmp://192.138.214.154/live/sjclive', +                    'ext': 'flv' +                }] +            }) + +        # from https://www.pornoxo.com/videos/7564/striptease-from-sexy-secretary/ +        expect_dict( +            self, +            self.ie._extract_jwplayer_data(r''' +<script type="text/javascript"> +    jwplayer("mediaplayer").setup({ +        'videoid': "7564", +        'width': "100%", +        'aspectratio': "16:9", +        'stretching': "exactfit", +        'autostart': 'false', +        'flashplayer': "https://t04.vipstreamservice.com/jwplayer/v5.10/player.swf", +        'file': "https://cdn.pornoxo.com/key=MF+oEbaxqTKb50P-w9G3nA,end=1489689259,ip=104.199.146.27/ip=104.199.146.27/speed=6573765/buffer=3.0/2009-12/4b2157147afe5efa93ce1978e0265289c193874e02597.flv", +        'image': "https://t03.vipstreamservice.com/thumbs/pxo-full/2009-12/14/a4b2157147afe5efa93ce1978e0265289c193874e02597.flv-full-13.jpg", +        'filefallback': "https://cdn.pornoxo.com/key=9ZPsTR5EvPLQrBaak2MUGA,end=1489689259,ip=104.199.146.27/ip=104.199.146.27/speed=6573765/buffer=3.0/2009-12/m_4b2157147afe5efa93ce1978e0265289c193874e02597.mp4", +        'logo.hide': true, +        'skin': "https://t04.vipstreamservice.com/jwplayer/skin/modieus-blk.zip", +        'plugins': "https://t04.vipstreamservice.com/jwplayer/dock/dockableskinnableplugin.swf", +        'dockableskinnableplugin.piclink': "/index.php?key=ajax-videothumbsn&vid=7564&data=2009-12--14--4b2157147afe5efa93ce1978e0265289c193874e02597.flv--17370", +        'controlbar': 'bottom', +        'modes': [ +            {type: 'flash', src: 'https://t04.vipstreamservice.com/jwplayer/v5.10/player.swf'} +        ], +        'provider': 'http' +    }); +    //noinspection JSAnnotator +    invideo.setup({ +        adsUrl: "/banner-iframe/?zoneId=32", +        adsUrl2: "", +        autostart: false +    }); +</script> +            ''', 'dummy', require_title=False), +            { +                'thumbnail': 'https://t03.vipstreamservice.com/thumbs/pxo-full/2009-12/14/a4b2157147afe5efa93ce1978e0265289c193874e02597.flv-full-13.jpg', +                'formats': [{ +                    'url': 'https://cdn.pornoxo.com/key=MF+oEbaxqTKb50P-w9G3nA,end=1489689259,ip=104.199.146.27/ip=104.199.146.27/speed=6573765/buffer=3.0/2009-12/4b2157147afe5efa93ce1978e0265289c193874e02597.flv', +                    'ext': 'flv' +                }] +            }) + +        # from http://www.indiedb.com/games/king-machine/videos +        expect_dict( +            self, +            self.ie._extract_jwplayer_data(r''' +<script> +jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/\/www.indiedb.com\/","displaytitle":false,"autostart":false,"repeat":false,"title":"king machine trailer 1","sharing":{"link":"http:\/\/www.indiedb.com\/games\/king-machine\/videos\/king-machine-trailer-1","code":"<iframe width=\"560\" height=\"315\" src=\"http:\/\/www.indiedb.com\/media\/iframe\/1522983\" frameborder=\"0\" allowfullscreen><\/iframe><br><a href=\"http:\/\/www.indiedb.com\/games\/king-machine\/videos\/king-machine-trailer-1\">king machine trailer 1 - Indie DB<\/a>"},"related":{"file":"http:\/\/rss.indiedb.com\/media\/recommended\/1522983\/feed\/rss.xml","dimensions":"160x120","onclick":"link"},"sources":[{"file":"http:\/\/cdn.dbolical.com\/cache\/videos\/games\/1\/50\/49678\/encode_mp4\/king-machine-trailer.mp4","label":"360p SD","default":"true"},{"file":"http:\/\/cdn.dbolical.com\/cache\/videos\/games\/1\/50\/49678\/encode720p_mp4\/king-machine-trailer.mp4","label":"720p HD"}],"image":"http:\/\/media.indiedb.com\/cache\/images\/games\/1\/50\/49678\/thumb_620x2000\/king-machine-trailer.mp4.jpg","advertising":{"client":"vast","tag":"http:\/\/ads.intergi.com\/adrawdata\/3.0\/5205\/4251742\/0\/1013\/ADTECH;cors=yes;width=560;height=315;referring_url=http:\/\/www.indiedb.com\/games\/king-machine\/videos\/king-machine-trailer-1;content_url=http:\/\/www.indiedb.com\/games\/king-machine\/videos\/king-machine-trailer-1;media_id=1522983;title=king+machine+trailer+1;device=__DEVICE__;model=__MODEL__;os=Windows+OS;osversion=__OSVERSION__;ua=__UA__;ip=109.171.17.81;uniqueid=1522983;tags=__TAGS__;number=58cac25928151;time=1489683033"},"width":620,"height":349}).once("play", function(event) { +            videoAnalytics("play"); +}).once("complete", function(event) { +    videoAnalytics("completed"); +}); +</script> +                ''', 'dummy'), +            { +                'title': 'king machine trailer 1', +                'thumbnail': 'http://media.indiedb.com/cache/images/games/1/50/49678/thumb_620x2000/king-machine-trailer.mp4.jpg', +                'formats': [{ +                    'url': 'http://cdn.dbolical.com/cache/videos/games/1/50/49678/encode_mp4/king-machine-trailer.mp4', +                    'height': 360, +                    'ext': 'mp4' +                }, { +                    'url': 'http://cdn.dbolical.com/cache/videos/games/1/50/49678/encode720p_mp4/king-machine-trailer.mp4', +                    'height': 720, +                    'ext': 'mp4' +                }] +            }) +  if __name__ == '__main__':      unittest.main() diff --git a/test/test_compat.py b/test/test_compat.py index b57424948..d6c54e135 100644 --- a/test/test_compat.py +++ b/test/test_compat.py @@ -27,11 +27,11 @@ from youtube_dl.compat import (  class TestCompat(unittest.TestCase):      def test_compat_getenv(self):          test_str = 'тест' -        compat_setenv('YOUTUBE-DL-TEST', test_str) -        self.assertEqual(compat_getenv('YOUTUBE-DL-TEST'), test_str) +        compat_setenv('YOUTUBE_DL_COMPAT_GETENV', test_str) +        self.assertEqual(compat_getenv('YOUTUBE_DL_COMPAT_GETENV'), test_str)      def test_compat_setenv(self): -        test_var = 'YOUTUBE-DL-TEST' +        test_var = 'YOUTUBE_DL_COMPAT_SETENV'          test_str = 'тест'          compat_setenv(test_var, test_str)          compat_getenv(test_var) diff --git a/test/test_download.py b/test/test_download.py index 30034f978..01a8bcb89 100644 --- a/test/test_download.py +++ b/test/test_download.py @@ -71,6 +71,18 @@ class TestDownload(unittest.TestCase):      maxDiff = None +    def __str__(self): +        """Identify each test with the `add_ie` attribute, if available.""" + +        def strclass(cls): +            """From 2.7's unittest; 2.6 had _strclass so we can't import it.""" +            return '%s.%s' % (cls.__module__, cls.__name__) + +        add_ie = getattr(self, self._testMethodName).add_ie +        return '%s (%s)%s:' % (self._testMethodName, +                               strclass(self.__class__), +                               ' [%s]' % add_ie if add_ie else '') +      def setUp(self):          self.defs = defs @@ -233,6 +245,8 @@ for n, test_case in enumerate(defs):          i += 1      test_method = generator(test_case, tname)      test_method.__name__ = str(tname) +    ie_list = test_case.get('add_ie') +    test_method.add_ie = ie_list and ','.join(ie_list)      setattr(TestDownload, test_method.__name__, test_method)      del test_method diff --git a/test/test_utils.py b/test/test_utils.py index aefd94518..aa4569b81 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -56,6 +56,7 @@ from youtube_dl.utils import (      read_batch_urls,      sanitize_filename,      sanitize_path, +    expand_path,      prepend_extension,      replace_extension,      remove_start, @@ -95,6 +96,8 @@ from youtube_dl.utils import (  from youtube_dl.compat import (      compat_chr,      compat_etree_fromstring, +    compat_getenv, +    compat_setenv,      compat_urlparse,      compat_parse_qs,  ) @@ -214,6 +217,18 @@ class TestUtil(unittest.TestCase):          self.assertEqual(sanitize_path('./abc'), 'abc')          self.assertEqual(sanitize_path('./../abc'), '..\\abc') +    def test_expand_path(self): +        def env(var): +            return '%{0}%'.format(var) if sys.platform == 'win32' else '${0}'.format(var) + +        compat_setenv('YOUTUBE_DL_EXPATH_PATH', 'expanded') +        self.assertEqual(expand_path(env('YOUTUBE_DL_EXPATH_PATH')), 'expanded') +        self.assertEqual(expand_path(env('HOME')), compat_getenv('HOME')) +        self.assertEqual(expand_path('~'), compat_getenv('HOME')) +        self.assertEqual( +            expand_path('~/%s' % env('YOUTUBE_DL_EXPATH_PATH')), +            '%s/expanded' % compat_getenv('HOME')) +      def test_prepend_extension(self):          self.assertEqual(prepend_extension('abc.ext', 'temp'), 'abc.temp.ext')          self.assertEqual(prepend_extension('abc.ext', 'temp', 'ext'), 'abc.temp.ext') @@ -455,6 +470,9 @@ class TestUtil(unittest.TestCase):      def test_urljoin(self):          self.assertEqual(urljoin('http://foo.de/', '/a/b/c.txt'), 'http://foo.de/a/b/c.txt') +        self.assertEqual(urljoin(b'http://foo.de/', '/a/b/c.txt'), 'http://foo.de/a/b/c.txt') +        self.assertEqual(urljoin('http://foo.de/', b'/a/b/c.txt'), 'http://foo.de/a/b/c.txt') +        self.assertEqual(urljoin(b'http://foo.de/', b'/a/b/c.txt'), 'http://foo.de/a/b/c.txt')          self.assertEqual(urljoin('//foo.de/', '/a/b/c.txt'), '//foo.de/a/b/c.txt')          self.assertEqual(urljoin('http://foo.de/', 'a/b/c.txt'), 'http://foo.de/a/b/c.txt')          self.assertEqual(urljoin('http://foo.de', '/a/b/c.txt'), 'http://foo.de/a/b/c.txt') diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index f7254560c..21586f0f4 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -29,7 +29,6 @@ import random  from .compat import (      compat_basestring,      compat_cookiejar, -    compat_expanduser,      compat_get_terminal_size,      compat_http_client,      compat_kwargs, @@ -54,6 +53,7 @@ from .utils import (      encode_compat_str,      encodeFilename,      error_to_compat_str, +    expand_path,      ExtractorError,      format_bytes,      formatSeconds, @@ -616,7 +616,7 @@ class YoutubeDL(object):              sanitize = lambda k, v: sanitize_filename(                  compat_str(v),                  restricted=self.params.get('restrictfilenames'), -                is_id=(k == 'id')) +                is_id=(k == 'id' or k.endswith('_id')))              template_dict = dict((k, v if isinstance(v, compat_numeric_types) else sanitize(k, v))                                   for k, v in template_dict.items()                                   if v is not None and not isinstance(v, (list, tuple, dict))) @@ -672,7 +672,7 @@ class YoutubeDL(object):                          FORMAT_RE.format(numeric_field),                          r'%({0})s'.format(numeric_field), outtmpl) -            tmpl = compat_expanduser(outtmpl) +            tmpl = expand_path(outtmpl)              filename = tmpl % template_dict              # Temporary fix for #4787              # 'Treat' all problem characters by passing filename through preferredencoding @@ -1872,6 +1872,7 @@ class YoutubeDL(object):          """Download a given list of URLs."""          outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL)          if (len(url_list) > 1 and +                outtmpl != '-' and                  '%' not in outtmpl and                  self.params.get('max_downloads') != 1):              raise SameFileError(outtmpl) @@ -2169,7 +2170,7 @@ class YoutubeDL(object):          if opts_cookiefile is None:              self.cookiejar = compat_cookiejar.CookieJar()          else: -            opts_cookiefile = compat_expanduser(opts_cookiefile) +            opts_cookiefile = expand_path(opts_cookiefile)              self.cookiejar = compat_cookiejar.MozillaCookieJar(                  opts_cookiefile)              if os.access(opts_cookiefile, os.R_OK): diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 0c401baa6..f15606568 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -16,7 +16,6 @@ from .options import (      parseOpts,  )  from .compat import ( -    compat_expanduser,      compat_getpass,      compat_shlex_split,      workaround_optparse_bug9161, @@ -26,6 +25,7 @@ from .utils import (      decodeOption,      DEFAULT_OUTTMPL,      DownloadError, +    expand_path,      match_filter_func,      MaxDownloadsReached,      preferredencoding, @@ -88,7 +88,7 @@ def _real_main(argv=None):                  batchfd = sys.stdin              else:                  batchfd = io.open( -                    compat_expanduser(opts.batchfile), +                    expand_path(opts.batchfile),                      'r', encoding='utf-8', errors='ignore')              batch_urls = read_batch_urls(batchfd)              if opts.verbose: @@ -196,7 +196,7 @@ def _real_main(argv=None):      if opts.playlistend not in (-1, None) and opts.playlistend < opts.playliststart:          raise ValueError('Playlist end must be greater than playlist start')      if opts.extractaudio: -        if opts.audioformat not in ['best', 'aac', 'mp3', 'm4a', 'opus', 'vorbis', 'wav']: +        if opts.audioformat not in ['best', 'aac', 'flac', 'mp3', 'm4a', 'opus', 'vorbis', 'wav']:              parser.error('invalid audio format specified')      if opts.audioquality:          opts.audioquality = opts.audioquality.strip('k').strip('K') @@ -238,18 +238,15 @@ def _real_main(argv=None):      any_getting = opts.geturl or opts.gettitle or opts.getid or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat or opts.getduration or opts.dumpjson or opts.dump_single_json      any_printing = opts.print_json -    download_archive_fn = compat_expanduser(opts.download_archive) if opts.download_archive is not None else opts.download_archive +    download_archive_fn = expand_path(opts.download_archive) if opts.download_archive is not None else opts.download_archive      # PostProcessors      postprocessors = [] -    # Add the metadata pp first, the other pps will copy it      if opts.metafromtitle:          postprocessors.append({              'key': 'MetadataFromTitle',              'titleformat': opts.metafromtitle          }) -    if opts.addmetadata: -        postprocessors.append({'key': 'FFmpegMetadata'})      if opts.extractaudio:          postprocessors.append({              'key': 'FFmpegExtractAudio', @@ -262,6 +259,16 @@ def _real_main(argv=None):              'key': 'FFmpegVideoConvertor',              'preferedformat': opts.recodevideo,          }) +    # FFmpegMetadataPP should be run after FFmpegVideoConvertorPP and +    # FFmpegExtractAudioPP as containers before conversion may not support +    # metadata (3gp, webm, etc.) +    # And this post-processor should be placed before other metadata +    # manipulating post-processors (FFmpegEmbedSubtitle) to prevent loss of +    # extra metadata. By default ffmpeg preserves metadata applicable for both +    # source and target containers. From this point the container won't change, +    # so metadata can be added here. +    if opts.addmetadata: +        postprocessors.append({'key': 'FFmpegMetadata'})      if opts.convertsubtitles:          postprocessors.append({              'key': 'FFmpegSubtitlesConvertor', @@ -442,7 +449,7 @@ def _real_main(argv=None):          try:              if opts.load_info_filename is not None: -                retcode = ydl.download_with_info_file(compat_expanduser(opts.load_info_filename)) +                retcode = ydl.download_with_info_file(expand_path(opts.load_info_filename))              else:                  retcode = ydl.download(all_urls)          except MaxDownloadsReached: diff --git a/youtube_dl/cache.py b/youtube_dl/cache.py index 5fe839eb1..7bdade1bd 100644 --- a/youtube_dl/cache.py +++ b/youtube_dl/cache.py @@ -8,8 +8,11 @@ import re  import shutil  import traceback -from .compat import compat_expanduser, compat_getenv -from .utils import write_json_file +from .compat import compat_getenv +from .utils import ( +    expand_path, +    write_json_file, +)  class Cache(object): @@ -21,7 +24,7 @@ class Cache(object):          if res is None:              cache_root = compat_getenv('XDG_CACHE_HOME', '~/.cache')              res = os.path.join(cache_root, 'youtube-dl') -        return compat_expanduser(res) +        return expand_path(res)      def _get_cache_fn(self, section, key, dtype):          assert re.match(r'^[a-zA-Z0-9_.-]+$', section), \ diff --git a/youtube_dl/downloader/external.py b/youtube_dl/downloader/external.py index bdd3545a2..e13cf547d 100644 --- a/youtube_dl/downloader/external.py +++ b/youtube_dl/downloader/external.py @@ -6,7 +6,10 @@ import sys  import re  from .common import FileDownloader -from ..compat import compat_setenv +from ..compat import ( +    compat_setenv, +    compat_str, +)  from ..postprocessor.ffmpeg import FFmpegPostProcessor, EXT_TO_OUT_FORMATS  from ..utils import (      cli_option, @@ -270,6 +273,10 @@ class FFmpegFD(ExternalFD):                  args += ['-rtmp_live', 'live']          args += ['-i', url, '-c', 'copy'] + +        if self.params.get('test', False): +            args += ['-fs', compat_str(self._TEST_FILE_SIZE)] +          if protocol in ('m3u8', 'm3u8_native'):              if self.params.get('hls_use_mpegts', False) or tmpfilename == '-':                  args += ['-f', 'mpegts'] diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py index 4989abce1..7534e4da5 100644 --- a/youtube_dl/downloader/hls.py +++ b/youtube_dl/downloader/hls.py @@ -30,6 +30,15 @@ class HlsFD(FragmentFD):      FD_NAME = 'hlsnative' +    def _delegate_to_ffmpeg(self, filename, info_dict): +        self.report_warning( +            'hlsnative has detected features it does not support, ' +            'extraction will be delegated to ffmpeg') +        fd = FFmpegFD(self.ydl, self.params) +        for ph in self._progress_hooks: +            fd.add_progress_hook(ph) +        return fd.real_download(filename, info_dict) +      @staticmethod      def can_download(manifest, info_dict):          UNSUPPORTED_FEATURES = ( @@ -53,10 +62,12 @@ class HlsFD(FragmentFD):          )          check_results = [not re.search(feature, manifest) for feature in UNSUPPORTED_FEATURES]          check_results.append(can_decrypt_frag or '#EXT-X-KEY:METHOD=AES-128' not in manifest) -        check_results.append(not info_dict.get('is_live'))          return all(check_results)      def real_download(self, filename, info_dict): +        if info_dict.get('is_live'): +            return self._delegate_to_ffmpeg(filename, info_dict) +          man_url = info_dict['url']          self.to_screen('[%s] Downloading m3u8 manifest' % self.FD_NAME) @@ -68,13 +79,7 @@ class HlsFD(FragmentFD):              if info_dict.get('extra_param_to_segment_url'):                  self.report_error('pycrypto not found. Please install it.')                  return False -            self.report_warning( -                'hlsnative has detected features it does not support, ' -                'extraction will be delegated to ffmpeg') -            fd = FFmpegFD(self.ydl, self.params) -            for ph in self._progress_hooks: -                fd.add_progress_hook(ph) -            return fd.real_download(filename, info_dict) +            return self._delegate_to_ffmpeg(filename, info_dict)          total_frags = 0          for line in s.splitlines(): diff --git a/youtube_dl/extractor/addanime.py b/youtube_dl/extractor/addanime.py index 55a9322a7..9f8a71262 100644 --- a/youtube_dl/extractor/addanime.py +++ b/youtube_dl/extractor/addanime.py @@ -25,7 +25,8 @@ class AddAnimeIE(InfoExtractor):              'ext': 'mp4',              'description': 'One Piece 606',              'title': 'One Piece 606', -        } +        }, +        'skip': 'Video is gone',      }, {          'url': 'http://add-anime.net/video/MDUGWYKNGBD8/One-Piece-687',          'only_matching': True, diff --git a/youtube_dl/extractor/adobepass.py b/youtube_dl/extractor/adobepass.py index 4d655bd5e..1b2d364ca 100644 --- a/youtube_dl/extractor/adobepass.py +++ b/youtube_dl/extractor/adobepass.py @@ -36,6 +36,11 @@ MSO_INFO = {          'username_field': 'Ecom_User_ID',          'password_field': 'Ecom_Password',      }, +    'Charter_Direct': { +        'name': 'Charter Spectrum', +        'username_field': 'IDToken1', +        'password_field': 'IDToken2', +    },      'thr030': {          'name': '3 Rivers Communications'      }, @@ -1453,6 +1458,8 @@ class AdobePassIE(InfoExtractor):                      self._downloader.cache.store(self._MVPD_CACHE, requestor_id, {})                      count += 1                      continue +                if '<error' in authorize: +                    raise ExtractorError(xml_text(authorize, 'details'), expected=True)                  authz_token = unescapeHTML(xml_text(authorize, 'authzToken'))                  requestor_info[guid] = authz_token                  self._downloader.cache.store(self._MVPD_CACHE, requestor_id, requestor_info) diff --git a/youtube_dl/extractor/afreecatv.py b/youtube_dl/extractor/afreecatv.py index e0a0f7c57..b774d6db8 100644 --- a/youtube_dl/extractor/afreecatv.py +++ b/youtube_dl/extractor/afreecatv.py @@ -4,15 +4,10 @@ from __future__ import unicode_literals  import re  from .common import InfoExtractor -from ..compat import ( -    compat_urllib_parse_urlparse, -    compat_urlparse, -) +from ..compat import compat_xpath  from ..utils import (      ExtractorError,      int_or_none, -    update_url_query, -    xpath_element,      xpath_text,  ) @@ -43,7 +38,8 @@ class AfreecaTVIE(InfoExtractor):              'uploader': 'dailyapril',              'uploader_id': 'dailyapril',              'upload_date': '20160503', -        } +        }, +        'skip': 'Video is gone',      }, {          'url': 'http://afbbs.afreecatv.com:8080/app/read_ucc_bbs.cgi?nStationNo=16711924&nTitleNo=36153164&szBjId=dailyapril&nBbsNo=18605867',          'info_dict': { @@ -71,6 +67,19 @@ class AfreecaTVIE(InfoExtractor):                  'upload_date': '20160502',              },          }], +        'skip': 'Video is gone', +    }, { +        'url': 'http://vod.afreecatv.com/PLAYER/STATION/18650793', +        'info_dict': { +            'id': '18650793', +            'ext': 'flv', +            'uploader': '윈아디', +            'uploader_id': 'badkids', +            'title': '오늘은 다르다! 쏘님의 우월한 위아래~ 댄스리액션!', +        }, +        'params': { +            'skip_download': True,  # requires rtmpdump +        },      }, {          'url': 'http://www.afreecatv.com/player/Player.swf?szType=szBjId=djleegoon&nStationNo=11273158&nBbsNo=13161095&nTitleNo=36327652',          'only_matching': True, @@ -90,40 +99,33 @@ class AfreecaTVIE(InfoExtractor):      def _real_extract(self, url):          video_id = self._match_id(url) -        parsed_url = compat_urllib_parse_urlparse(url) -        info_url = compat_urlparse.urlunparse(parsed_url._replace( -            netloc='afbbs.afreecatv.com:8080', -            path='/api/video/get_video_info.php'))          video_xml = self._download_xml( -            update_url_query(info_url, {'nTitleNo': video_id}), video_id) +            'http://afbbs.afreecatv.com:8080/api/video/get_video_info.php', +            video_id, query={'nTitleNo': video_id}) -        if xpath_element(video_xml, './track/video/file') is None: +        video_element = video_xml.findall(compat_xpath('./track/video'))[1] +        if video_element is None or video_element.text is None:              raise ExtractorError('Specified AfreecaTV video does not exist',                                   expected=True) -        title = xpath_text(video_xml, './track/title', 'title') +        video_url_raw = video_element.text + +        app, playpath = video_url_raw.split('mp4:') + +        title = xpath_text(video_xml, './track/title', 'title', fatal=True)          uploader = xpath_text(video_xml, './track/nickname', 'uploader')          uploader_id = xpath_text(video_xml, './track/bj_id', 'uploader id')          duration = int_or_none(xpath_text(video_xml, './track/duration',                                            'duration'))          thumbnail = xpath_text(video_xml, './track/titleImage', 'thumbnail') -        entries = [] -        for i, video_file in enumerate(video_xml.findall('./track/video/file')): -            video_key = self.parse_video_key(video_file.get('key', '')) -            if not video_key: -                continue -            entries.append({ -                'id': '%s_%s' % (video_id, video_key.get('part', i + 1)), -                'title': title, -                'upload_date': video_key.get('upload_date'), -                'duration': int_or_none(video_file.get('duration')), -                'url': video_file.text, -            }) - -        info = { +        return {              'id': video_id, +            'url': app, +            'ext': 'flv', +            'play_path': 'mp4:' + playpath, +            'rtmp_live': True,  # downloading won't end without this              'title': title,              'uploader': uploader,              'uploader_id': uploader_id, @@ -131,20 +133,6 @@ class AfreecaTVIE(InfoExtractor):              'thumbnail': thumbnail,          } -        if len(entries) > 1: -            info['_type'] = 'multi_video' -            info['entries'] = entries -        elif len(entries) == 1: -            info['url'] = entries[0]['url'] -            info['upload_date'] = entries[0].get('upload_date') -        else: -            raise ExtractorError( -                'No files found for the specified AfreecaTV video, either' -                ' the URL is incorrect or the video has been made private.', -                expected=True) - -        return info -  class AfreecaTVGlobalIE(AfreecaTVIE):      IE_NAME = 'afreecatv:global' diff --git a/youtube_dl/extractor/arkena.py b/youtube_dl/extractor/arkena.py index 50ffb442d..4495ddbb0 100644 --- a/youtube_dl/extractor/arkena.py +++ b/youtube_dl/extractor/arkena.py @@ -93,8 +93,7 @@ class ArkenaIE(InfoExtractor):                  exts = (mimetype2ext(f.get('Type')), determine_ext(f_url, None))                  if kind == 'm3u8' or 'm3u8' in exts:                      formats.extend(self._extract_m3u8_formats( -                        f_url, video_id, 'mp4', -                        entry_protocol='m3u8' if is_live else 'm3u8_native', +                        f_url, video_id, 'mp4', 'm3u8_native',                          m3u8_id=kind, fatal=False, live=is_live))                  elif kind == 'flash' or 'f4m' in exts:                      formats.extend(self._extract_f4m_formats( diff --git a/youtube_dl/extractor/atresplayer.py b/youtube_dl/extractor/atresplayer.py index e3c669830..99af6dc5a 100644 --- a/youtube_dl/extractor/atresplayer.py +++ b/youtube_dl/extractor/atresplayer.py @@ -90,7 +90,8 @@ class AtresPlayerIE(InfoExtractor):              request, None, 'Logging in as %s' % username)          error = self._html_search_regex( -            r'(?s)<ul class="list_error">(.+?)</ul>', response, 'error', default=None) +            r'(?s)<ul[^>]+class="[^"]*\blist_error\b[^"]*">(.+?)</ul>', +            response, 'error', default=None)          if error:              raise ExtractorError(                  'Unable to login: %s' % error, expected=True) @@ -155,13 +156,17 @@ class AtresPlayerIE(InfoExtractor):              if format_id == 'token' or not video_url.startswith('http'):                  continue              if 'geodeswowsmpra3player' in video_url: -                f4m_path = video_url.split('smil:', 1)[-1].split('free_', 1)[0] -                f4m_url = 'http://drg.antena3.com/{0}hds/es/sd.f4m'.format(f4m_path) +                # f4m_path = video_url.split('smil:', 1)[-1].split('free_', 1)[0] +                # f4m_url = 'http://drg.antena3.com/{0}hds/es/sd.f4m'.format(f4m_path)                  # this videos are protected by DRM, the f4m downloader doesn't support them                  continue -            else: -                f4m_url = video_url[:-9] + '/manifest.f4m' -            formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False)) +            video_url_hd = video_url.replace('free_es', 'es') +            formats.extend(self._extract_f4m_formats( +                video_url_hd[:-9] + '/manifest.f4m', video_id, f4m_id='hds', +                fatal=False)) +            formats.extend(self._extract_mpd_formats( +                video_url_hd[:-9] + '/manifest.mpd', video_id, mpd_id='dash', +                fatal=False))          self._sort_formats(formats)          path_data = player.get('pathData') diff --git a/youtube_dl/extractor/atvat.py b/youtube_dl/extractor/atvat.py new file mode 100644 index 000000000..1584d53fc --- /dev/null +++ b/youtube_dl/extractor/atvat.py @@ -0,0 +1,73 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( +    determine_ext, +    int_or_none, +    unescapeHTML, +) + + +class ATVAtIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?atv\.at/(?:[^/]+/){2}(?P<id>[dv]\d+)' +    _TESTS = [{ +        'url': 'http://atv.at/aktuell/di-210317-2005-uhr/v1698449/', +        'md5': 'c3b6b975fb3150fc628572939df205f2', +        'info_dict': { +            'id': '1698447', +            'ext': 'mp4', +            'title': 'DI, 21.03.17 | 20:05 Uhr 1/1', +        } +    }, { +        'url': 'http://atv.at/aktuell/meinrad-knapp/d8416/', +        'only_matching': True, +    }] + +    def _real_extract(self, url): +        display_id = self._match_id(url) +        webpage = self._download_webpage(url, display_id) +        video_data = self._parse_json(unescapeHTML(self._search_regex( +            r'class="[^"]*jsb_video/FlashPlayer[^"]*"[^>]+data-jsb="([^"]+)"', +            webpage, 'player data')), display_id)['config']['initial_video'] + +        video_id = video_data['id'] +        video_title = video_data['title'] + +        parts = [] +        for part in video_data.get('parts', []): +            part_id = part['id'] +            part_title = part['title'] + +            formats = [] +            for source in part.get('sources', []): +                source_url = source.get('src') +                if not source_url: +                    continue +                ext = determine_ext(source_url) +                if ext == 'm3u8': +                    formats.extend(self._extract_m3u8_formats( +                        source_url, part_id, 'mp4', 'm3u8_native', +                        m3u8_id='hls', fatal=False)) +                else: +                    formats.append({ +                        'format_id': source.get('delivery'), +                        'url': source_url, +                    }) +            self._sort_formats(formats) + +            parts.append({ +                'id': part_id, +                'title': part_title, +                'thumbnail': part.get('preview_image_url'), +                'duration': int_or_none(part.get('duration')), +                'is_live': part.get('is_livestream'), +                'formats': formats, +            }) + +        return { +            '_type': 'multi_video', +            'id': video_id, +            'title': video_title, +            'entries': parts, +        } diff --git a/youtube_dl/extractor/bellmedia.py b/youtube_dl/extractor/bellmedia.py index 1f5b6ed92..8820a3914 100644 --- a/youtube_dl/extractor/bellmedia.py +++ b/youtube_dl/extractor/bellmedia.py @@ -21,10 +21,11 @@ class BellMediaIE(InfoExtractor):                  animalplanet|                  bravo|                  mtv| -                space +                space| +                etalk              )\.ca|              much\.com -        )/.*?(?:\bvid=|-vid|~|%7E|/(?:episode)?)(?P<id>[0-9]{6,})''' +        )/.*?(?:\bvid(?:eoid)?=|-vid|~|%7E|/(?:episode)?)(?P<id>[0-9]{6,})'''      _TESTS = [{          'url': 'http://www.ctv.ca/video/player?vid=706966',          'md5': 'ff2ebbeae0aa2dcc32a830c3fd69b7b0', @@ -58,6 +59,9 @@ class BellMediaIE(InfoExtractor):      }, {          'url': 'http://www.ctv.ca/DCs-Legends-of-Tomorrow/Video/S2E11-Turncoat-vid1051430',          'only_matching': True, +    }, { +        'url': 'http://www.etalk.ca/video?videoid=663455', +        'only_matching': True,      }]      _DOMAINS = {          'thecomedynetwork': 'comedy', @@ -65,6 +69,7 @@ class BellMediaIE(InfoExtractor):          'sciencechannel': 'discsci',          'investigationdiscovery': 'invdisc',          'animalplanet': 'aniplan', +        'etalk': 'ctv',      }      def _real_extract(self, url): diff --git a/youtube_dl/extractor/bostonglobe.py b/youtube_dl/extractor/bostonglobe.py new file mode 100644 index 000000000..57882fbee --- /dev/null +++ b/youtube_dl/extractor/bostonglobe.py @@ -0,0 +1,72 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + +from ..utils import ( +    extract_attributes, +) + + +class BostonGlobeIE(InfoExtractor): +    _VALID_URL = r'(?i)https?://(?:www\.)?bostonglobe\.com/.*/(?P<id>[^/]+)/\w+(?:\.html)?' +    _TESTS = [ +        { +            'url': 'http://www.bostonglobe.com/metro/2017/02/11/tree-finally-succumbs-disease-leaving-hole-neighborhood/h1b4lviqzMTIn9sVy8F3gP/story.html', +            'md5': '0a62181079c85c2d2b618c9a738aedaf', +            'info_dict': { +                'title': 'A tree finally succumbs to disease, leaving a hole in a neighborhood', +                'id': '5320421710001', +                'ext': 'mp4', +                'description': 'It arrived as a sapling when the Back Bay was in its infancy, a spindly American elm tamped down into a square of dirt cut into the brick sidewalk of 1880s Marlborough Street, no higher than the first bay window of the new brownstone behind it.', +                'timestamp': 1486877593, +                'upload_date': '20170212', +                'uploader_id': '245991542', +            }, +        }, +        { +            # Embedded youtube video; we hand it off to the Generic extractor. +            'url': 'https://www.bostonglobe.com/lifestyle/names/2017/02/17/does-ben-affleck-play-matt-damon-favorite-version-batman/ruqkc9VxKBYmh5txn1XhSI/story.html', +            'md5': '582b40327089d5c0c949b3c54b13c24b', +            'info_dict': { +                'title': "Who Is Matt Damon's Favorite Batman?", +                'id': 'ZW1QCnlA6Qc', +                'ext': 'mp4', +                'upload_date': '20170217', +                'description': 'md5:3b3dccb9375867e0b4d527ed87d307cb', +                'uploader': 'The Late Late Show with James Corden', +                'uploader_id': 'TheLateLateShow', +            }, +            'expected_warnings': ['404'], +        }, +    ] + +    def _real_extract(self, url): +        page_id = self._match_id(url) +        webpage = self._download_webpage(url, page_id) + +        page_title = self._og_search_title(webpage, default=None) + +        # <video data-brightcove-video-id="5320421710001" data-account="245991542" data-player="SJWAiyYWg" data-embed="default" class="video-js" controls itemscope itemtype="http://schema.org/VideoObject"> +        entries = [] +        for video in re.findall(r'(?i)(<video[^>]+>)', webpage): +            attrs = extract_attributes(video) + +            video_id = attrs.get('data-brightcove-video-id') +            account_id = attrs.get('data-account') +            player_id = attrs.get('data-player') +            embed = attrs.get('data-embed') + +            if video_id and account_id and player_id and embed: +                entries.append( +                    'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s' +                    % (account_id, player_id, embed, video_id)) + +        if len(entries) == 0: +            return self.url_result(url, 'Generic') +        elif len(entries) == 1: +            return self.url_result(entries[0], 'BrightcoveNew') +        else: +            return self.playlist_from_matches(entries, page_id, page_title, ie='BrightcoveNew') diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 27685eed0..46ef8e605 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -193,7 +193,13 @@ class BrightcoveLegacyIE(InfoExtractor):          if videoPlayer is not None:              if isinstance(videoPlayer, list):                  videoPlayer = videoPlayer[0] -            if not (videoPlayer.isdigit() or videoPlayer.startswith('ref:')): +            videoPlayer = videoPlayer.strip() +            # UUID is also possible for videoPlayer (e.g. +            # http://www.popcornflix.com/hoodies-vs-hooligans/7f2d2b87-bbf2-4623-acfb-ea942b4f01dd +            # or http://www8.hp.com/cn/zh/home.html) +            if not (re.match( +                    r'^(?:\d+|[\da-fA-F]{8}-?[\da-fA-F]{4}-?[\da-fA-F]{4}-?[\da-fA-F]{4}-?[\da-fA-F]{12})$', +                    videoPlayer) or videoPlayer.startswith('ref:')):                  return None              params['@videoPlayer'] = videoPlayer          linkBase = find_param('linkBaseURL') @@ -515,6 +521,9 @@ class BrightcoveNewIE(InfoExtractor):          return entries      def _real_extract(self, url): +        url, smuggled_data = unsmuggle_url(url, {}) +        self._initialize_geo_bypass(smuggled_data.get('geo_countries')) +          account_id, player_id, embed, video_id = re.match(self._VALID_URL, url).groups()          webpage = self._download_webpage( @@ -544,8 +553,10 @@ class BrightcoveNewIE(InfoExtractor):          except ExtractorError as e:              if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:                  json_data = self._parse_json(e.cause.read().decode(), video_id)[0] -                raise ExtractorError( -                    json_data.get('message') or json_data['error_code'], expected=True) +                message = json_data.get('message') or json_data['error_code'] +                if json_data.get('error_subcode') == 'CLIENT_GEO': +                    self.raise_geo_restricted(msg=message) +                raise ExtractorError(message, expected=True)              raise          title = json_data['name'].strip() diff --git a/youtube_dl/extractor/ceskatelevize.py b/youtube_dl/extractor/ceskatelevize.py index b1dfacf80..dd2529a6d 100644 --- a/youtube_dl/extractor/ceskatelevize.py +++ b/youtube_dl/extractor/ceskatelevize.py @@ -160,8 +160,7 @@ class CeskaTelevizeIE(InfoExtractor):                  for format_id, stream_url in item.get('streamUrls', {}).items():                      if 'playerType=flash' in stream_url:                          stream_formats = self._extract_m3u8_formats( -                            stream_url, playlist_id, 'mp4', -                            entry_protocol='m3u8' if is_live else 'm3u8_native', +                            stream_url, playlist_id, 'mp4', 'm3u8_native',                              m3u8_id='hls-%s' % format_id, fatal=False)                      else:                          stream_formats = self._extract_mpd_formats( diff --git a/youtube_dl/extractor/channel9.py b/youtube_dl/extractor/channel9.py index 865dbcaba..e92894246 100644 --- a/youtube_dl/extractor/channel9.py +++ b/youtube_dl/extractor/channel9.py @@ -4,62 +4,62 @@ import re  from .common import InfoExtractor  from ..utils import ( +    clean_html,      ExtractorError, -    parse_filesize, +    int_or_none, +    parse_iso8601,      qualities, +    unescapeHTML,  )  class Channel9IE(InfoExtractor): -    ''' -    Common extractor for channel9.msdn.com. - -    The type of provided URL (video or playlist) is determined according to -    meta Search.PageType from web page HTML rather than URL itself, as it is -    not always possible to do. -    '''      IE_DESC = 'Channel 9'      IE_NAME = 'channel9' -    _VALID_URL = r'https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+?)(?P<rss>/RSS)?/?(?:[?#&]|$)' +    _VALID_URL = r'https?://(?:www\.)?(?:channel9\.msdn\.com|s\.ch9\.ms)/(?P<contentpath>.+?)(?P<rss>/RSS)?/?(?:[?#&]|$)'      _TESTS = [{          'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002', -        'md5': 'bbd75296ba47916b754e73c3a4bbdf10', +        'md5': '32083d4eaf1946db6d454313f44510ca',          'info_dict': { -            'id': 'Events/TechEd/Australia/2013/KOS002', -            'ext': 'mp4', +            'id': '6c413323-383a-49dc-88f9-a22800cab024', +            'ext': 'wmv',              'title': 'Developer Kick-Off Session: Stuff We Love', -            'description': 'md5:c08d72240b7c87fcecafe2692f80e35f', +            'description': 'md5:b80bf9355a503c193aff7ec6cd5a7731',              'duration': 4576, -            'thumbnail': r're:http://.*\.jpg', +            'thumbnail': r're:https?://.*\.jpg', +            'timestamp': 1377717420, +            'upload_date': '20130828',              'session_code': 'KOS002', -            'session_day': 'Day 1',              'session_room': 'Arena 1A', -            'session_speakers': ['Ed Blankenship', 'Andrew Coates', 'Brady Gaster', 'Patrick Klug', -                                 'Mads Kristensen'], +            'session_speakers': ['Andrew Coates', 'Brady Gaster', 'Mads Kristensen', 'Ed Blankenship', 'Patrick Klug'],          },      }, {          'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing', -        'md5': 'b43ee4529d111bc37ba7ee4f34813e68', +        'md5': 'dcf983ee6acd2088e7188c3cf79b46bc',          'info_dict': { -            'id': 'posts/Self-service-BI-with-Power-BI-nuclear-testing', -            'ext': 'mp4', +            'id': 'fe8e435f-bb93-4e01-8e97-a28c01887024', +            'ext': 'wmv',              'title': 'Self-service BI with Power BI - nuclear testing', -            'description': 'md5:d1e6ecaafa7fb52a2cacdf9599829f5b', +            'description': 'md5:2d17fec927fc91e9e17783b3ecc88f54',              'duration': 1540, -            'thumbnail': r're:http://.*\.jpg', +            'thumbnail': r're:https?://.*\.jpg', +            'timestamp': 1386381991, +            'upload_date': '20131207',              'authors': ['Mike Wilmot'],          },      }, {          # low quality mp4 is best          'url': 'https://channel9.msdn.com/Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library',          'info_dict': { -            'id': 'Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library', +            'id': '33ad69d2-6a4e-4172-83a1-a523013dec76',              'ext': 'mp4',              'title': 'Ranges for the Standard Library', -            'description': 'md5:2e6b4917677af3728c5f6d63784c4c5d', +            'description': 'md5:9895e0a9fd80822d2f01c454b8f4a372',              'duration': 5646, -            'thumbnail': r're:http://.*\.jpg', +            'thumbnail': r're:https?://.*\.jpg', +            'upload_date': '20150930', +            'timestamp': 1443640735,          },          'params': {              'skip_download': True, @@ -70,7 +70,7 @@ class Channel9IE(InfoExtractor):              'id': 'Niners/Splendid22/Queue/76acff796e8f411184b008028e0d492b',              'title': 'Channel 9',          }, -        'playlist_count': 2, +        'playlist_mincount': 100,      }, {          'url': 'https://channel9.msdn.com/Events/DEVintersection/DEVintersection-2016/RSS',          'only_matching': True, @@ -81,189 +81,6 @@ class Channel9IE(InfoExtractor):      _RSS_URL = 'http://channel9.msdn.com/%s/RSS' -    def _formats_from_html(self, html): -        FORMAT_REGEX = r''' -            (?x) -            <a\s+href="(?P<url>[^"]+)">(?P<quality>[^<]+)</a>\s* -            <span\s+class="usage">\((?P<note>[^\)]+)\)</span>\s* -            (?:<div\s+class="popup\s+rounded">\s* -            <h3>File\s+size</h3>\s*(?P<filesize>.*?)\s* -            </div>)?                                                # File size part may be missing -        ''' -        quality = qualities(( -            'MP3', 'MP4', -            'Low Quality WMV', 'Low Quality MP4', -            'Mid Quality WMV', 'Mid Quality MP4', -            'High Quality WMV', 'High Quality MP4')) -        formats = [{ -            'url': x.group('url'), -            'format_id': x.group('quality'), -            'format_note': x.group('note'), -            'format': '%s (%s)' % (x.group('quality'), x.group('note')), -            'filesize_approx': parse_filesize(x.group('filesize')), -            'quality': quality(x.group('quality')), -            'vcodec': 'none' if x.group('note') == 'Audio only' else None, -        } for x in list(re.finditer(FORMAT_REGEX, html))] - -        self._sort_formats(formats) - -        return formats - -    def _extract_title(self, html): -        title = self._html_search_meta('title', html, 'title') -        if title is None: -            title = self._og_search_title(html) -            TITLE_SUFFIX = ' (Channel 9)' -            if title is not None and title.endswith(TITLE_SUFFIX): -                title = title[:-len(TITLE_SUFFIX)] -        return title - -    def _extract_description(self, html): -        DESCRIPTION_REGEX = r'''(?sx) -            <div\s+class="entry-content">\s* -            <div\s+id="entry-body">\s* -            (?P<description>.+?)\s* -            </div>\s* -            </div> -        ''' -        m = re.search(DESCRIPTION_REGEX, html) -        if m is not None: -            return m.group('description') -        return self._html_search_meta('description', html, 'description') - -    def _extract_duration(self, html): -        m = re.search(r'"length": *"(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html) -        return ((int(m.group('hours')) * 60 * 60) + (int(m.group('minutes')) * 60) + int(m.group('seconds'))) if m else None - -    def _extract_slides(self, html): -        m = re.search(r'<a href="(?P<slidesurl>[^"]+)" class="slides">Slides</a>', html) -        return m.group('slidesurl') if m is not None else None - -    def _extract_zip(self, html): -        m = re.search(r'<a href="(?P<zipurl>[^"]+)" class="zip">Zip</a>', html) -        return m.group('zipurl') if m is not None else None - -    def _extract_avg_rating(self, html): -        m = re.search(r'<p class="avg-rating">Avg Rating: <span>(?P<avgrating>[^<]+)</span></p>', html) -        return float(m.group('avgrating')) if m is not None else 0 - -    def _extract_rating_count(self, html): -        m = re.search(r'<div class="rating-count">\((?P<ratingcount>[^<]+)\)</div>', html) -        return int(self._fix_count(m.group('ratingcount'))) if m is not None else 0 - -    def _extract_view_count(self, html): -        m = re.search(r'<li class="views">\s*<span class="count">(?P<viewcount>[^<]+)</span> Views\s*</li>', html) -        return int(self._fix_count(m.group('viewcount'))) if m is not None else 0 - -    def _extract_comment_count(self, html): -        m = re.search(r'<li class="comments">\s*<a href="#comments">\s*<span class="count">(?P<commentcount>[^<]+)</span> Comments\s*</a>\s*</li>', html) -        return int(self._fix_count(m.group('commentcount'))) if m is not None else 0 - -    def _fix_count(self, count): -        return int(str(count).replace(',', '')) if count is not None else None - -    def _extract_authors(self, html): -        m = re.search(r'(?s)<li class="author">(.*?)</li>', html) -        if m is None: -            return None -        return re.findall(r'<a href="/Niners/[^"]+">([^<]+)</a>', m.group(1)) - -    def _extract_session_code(self, html): -        m = re.search(r'<li class="code">\s*(?P<code>.+?)\s*</li>', html) -        return m.group('code') if m is not None else None - -    def _extract_session_day(self, html): -        m = re.search(r'<li class="day">\s*<a href="/Events/[^"]+">(?P<day>[^<]+)</a>\s*</li>', html) -        return m.group('day').strip() if m is not None else None - -    def _extract_session_room(self, html): -        m = re.search(r'<li class="room">\s*(?P<room>.+?)\s*</li>', html) -        return m.group('room') if m is not None else None - -    def _extract_session_speakers(self, html): -        return re.findall(r'<a href="/Events/Speakers/[^"]+">([^<]+)</a>', html) - -    def _extract_content(self, html, content_path): -        # Look for downloadable content -        formats = self._formats_from_html(html) -        slides = self._extract_slides(html) -        zip_ = self._extract_zip(html) - -        # Nothing to download -        if len(formats) == 0 and slides is None and zip_ is None: -            self._downloader.report_warning('None of recording, slides or zip are available for %s' % content_path) -            return - -        # Extract meta -        title = self._extract_title(html) -        description = self._extract_description(html) -        thumbnail = self._og_search_thumbnail(html) -        duration = self._extract_duration(html) -        avg_rating = self._extract_avg_rating(html) -        rating_count = self._extract_rating_count(html) -        view_count = self._extract_view_count(html) -        comment_count = self._extract_comment_count(html) - -        common = { -            '_type': 'video', -            'id': content_path, -            'description': description, -            'thumbnail': thumbnail, -            'duration': duration, -            'avg_rating': avg_rating, -            'rating_count': rating_count, -            'view_count': view_count, -            'comment_count': comment_count, -        } - -        result = [] - -        if slides is not None: -            d = common.copy() -            d.update({'title': title + '-Slides', 'url': slides}) -            result.append(d) - -        if zip_ is not None: -            d = common.copy() -            d.update({'title': title + '-Zip', 'url': zip_}) -            result.append(d) - -        if len(formats) > 0: -            d = common.copy() -            d.update({'title': title, 'formats': formats}) -            result.append(d) - -        return result - -    def _extract_entry_item(self, html, content_path): -        contents = self._extract_content(html, content_path) -        if contents is None: -            return contents - -        if len(contents) > 1: -            raise ExtractorError('Got more than one entry') -        result = contents[0] -        result['authors'] = self._extract_authors(html) - -        return result - -    def _extract_session(self, html, content_path): -        contents = self._extract_content(html, content_path) -        if contents is None: -            return contents - -        session_meta = { -            'session_code': self._extract_session_code(html), -            'session_day': self._extract_session_day(html), -            'session_room': self._extract_session_room(html), -            'session_speakers': self._extract_session_speakers(html), -        } - -        for content in contents: -            content.update(session_meta) - -        return self.playlist_result(contents) -      def _extract_list(self, video_id, rss_url=None):          if not rss_url:              rss_url = self._RSS_URL % video_id @@ -274,9 +91,7 @@ class Channel9IE(InfoExtractor):          return self.playlist_result(entries, video_id, title_text)      def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        content_path = mobj.group('contentpath') -        rss = mobj.group('rss') +        content_path, rss = re.match(self._VALID_URL, url).groups()          if rss:              return self._extract_list(content_path, url) @@ -284,17 +99,158 @@ class Channel9IE(InfoExtractor):          webpage = self._download_webpage(              url, content_path, 'Downloading web page') -        page_type = self._search_regex( -            r'<meta[^>]+name=(["\'])WT\.entryid\1[^>]+content=(["\'])(?P<pagetype>[^:]+).+?\2', -            webpage, 'page type', default=None, group='pagetype') -        if page_type: -            if page_type == 'Entry':      # Any 'item'-like page, may contain downloadable content -                return self._extract_entry_item(webpage, content_path) -            elif page_type == 'Session':  # Event session page, may contain downloadable content -                return self._extract_session(webpage, content_path) -            elif page_type == 'Event': -                return self._extract_list(content_path) +        episode_data = self._search_regex( +            r"data-episode='([^']+)'", webpage, 'episode data', default=None) +        if episode_data: +            episode_data = self._parse_json(unescapeHTML( +                episode_data), content_path) +            content_id = episode_data['contentId'] +            is_session = '/Sessions(' in episode_data['api'] +            content_url = 'https://channel9.msdn.com/odata' + episode_data['api'] +            if is_session: +                content_url += '?$expand=Speakers' +            else: +                content_url += '?$expand=Authors' +            content_data = self._download_json(content_url, content_id) +            title = content_data['Title'] + +            QUALITIES = ( +                'mp3', +                'wmv', 'mp4', +                'wmv-low', 'mp4-low', +                'wmv-mid', 'mp4-mid', +                'wmv-high', 'mp4-high', +            ) + +            quality_key = qualities(QUALITIES) + +            def quality(quality_id, format_url): +                return (len(QUALITIES) if '_Source.' in format_url +                        else quality_key(quality_id)) + +            formats = [] +            urls = set() + +            SITE_QUALITIES = { +                'MP3': 'mp3', +                'MP4': 'mp4', +                'Low Quality WMV': 'wmv-low', +                'Low Quality MP4': 'mp4-low', +                'Mid Quality WMV': 'wmv-mid', +                'Mid Quality MP4': 'mp4-mid', +                'High Quality WMV': 'wmv-high', +                'High Quality MP4': 'mp4-high', +            } + +            formats_select = self._search_regex( +                r'(?s)<select[^>]+name=["\']format[^>]+>(.+?)</select', webpage, +                'formats select', default=None) +            if formats_select: +                for mobj in re.finditer( +                        r'<option\b[^>]+\bvalue=(["\'])(?P<url>(?:(?!\1).)+)\1[^>]*>\s*(?P<format>[^<]+?)\s*<', +                        formats_select): +                    format_url = mobj.group('url') +                    if format_url in urls: +                        continue +                    urls.add(format_url) +                    format_id = mobj.group('format') +                    quality_id = SITE_QUALITIES.get(format_id, format_id) +                    formats.append({ +                        'url': format_url, +                        'format_id': quality_id, +                        'quality': quality(quality_id, format_url), +                        'vcodec': 'none' if quality_id == 'mp3' else None, +                    }) + +            API_QUALITIES = { +                'VideoMP4Low': 'mp4-low', +                'VideoWMV': 'wmv-mid', +                'VideoMP4Medium': 'mp4-mid', +                'VideoMP4High': 'mp4-high', +                'VideoWMVHQ': 'wmv-hq', +            } + +            for format_id, q in API_QUALITIES.items(): +                q_url = content_data.get(format_id) +                if not q_url or q_url in urls: +                    continue +                urls.add(q_url) +                formats.append({ +                    'url': q_url, +                    'format_id': q, +                    'quality': quality(q, q_url), +                }) + +            self._sort_formats(formats) + +            slides = content_data.get('Slides') +            zip_file = content_data.get('ZipFile') + +            if not formats and not slides and not zip_file: +                raise ExtractorError( +                    'None of recording, slides or zip are available for %s' % content_path) + +            subtitles = {} +            for caption in content_data.get('Captions', []): +                caption_url = caption.get('Url') +                if not caption_url: +                    continue +                subtitles.setdefault(caption.get('Language', 'en'), []).append({ +                    'url': caption_url, +                    'ext': 'vtt', +                }) + +            common = { +                'id': content_id, +                'title': title, +                'description': clean_html(content_data.get('Description') or content_data.get('Body')), +                'thumbnail': content_data.get('Thumbnail') or content_data.get('VideoPlayerPreviewImage'), +                'duration': int_or_none(content_data.get('MediaLengthInSeconds')), +                'timestamp': parse_iso8601(content_data.get('PublishedDate')), +                'avg_rating': int_or_none(content_data.get('Rating')), +                'rating_count': int_or_none(content_data.get('RatingCount')), +                'view_count': int_or_none(content_data.get('Views')), +                'comment_count': int_or_none(content_data.get('CommentCount')), +                'subtitles': subtitles, +            } +            if is_session: +                speakers = [] +                for s in content_data.get('Speakers', []): +                    speaker_name = s.get('FullName') +                    if not speaker_name: +                        continue +                    speakers.append(speaker_name) + +                common.update({ +                    'session_code': content_data.get('Code'), +                    'session_room': content_data.get('Room'), +                    'session_speakers': speakers, +                })              else: -                raise ExtractorError('Unexpected WT.entryid %s' % page_type, expected=True) -        else:  # Assuming list +                authors = [] +                for a in content_data.get('Authors', []): +                    author_name = a.get('DisplayName') +                    if not author_name: +                        continue +                    authors.append(author_name) +                common['authors'] = authors + +            contents = [] + +            if slides: +                d = common.copy() +                d.update({'title': title + '-Slides', 'url': slides}) +                contents.append(d) + +            if zip_file: +                d = common.copy() +                d.update({'title': title + '-Zip', 'url': zip_file}) +                contents.append(d) + +            if formats: +                d = common.copy() +                d.update({'title': title, 'formats': formats}) +                contents.append(d) +            return self.playlist_result(contents) +        else:              return self._extract_list(content_path) diff --git a/youtube_dl/extractor/cloudy.py b/youtube_dl/extractor/cloudy.py index ae5ba0015..9bc8dbea4 100644 --- a/youtube_dl/extractor/cloudy.py +++ b/youtube_dl/extractor/cloudy.py @@ -1,97 +1,56 @@  # coding: utf-8  from __future__ import unicode_literals -import re -  from .common import InfoExtractor -from ..compat import ( -    compat_parse_qs, -    compat_HTTPError, -)  from ..utils import ( -    ExtractorError, -    HEADRequest, -    remove_end, +    str_to_int, +    unified_strdate,  )  class CloudyIE(InfoExtractor):      _IE_DESC = 'cloudy.ec' -    _VALID_URL = r'''(?x) -        https?://(?:www\.)?cloudy\.ec/ -        (?:v/|embed\.php\?id=) -        (?P<id>[A-Za-z0-9]+) -        ''' -    _EMBED_URL = 'http://www.cloudy.ec/embed.php?id=%s' -    _API_URL = 'http://www.cloudy.ec/api/player.api.php' -    _MAX_TRIES = 2 -    _TEST = { +    _VALID_URL = r'https?://(?:www\.)?cloudy\.ec/(?:v/|embed\.php\?.*?\bid=)(?P<id>[A-Za-z0-9]+)' +    _TESTS = [{          'url': 'https://www.cloudy.ec/v/af511e2527aac', -        'md5': '5cb253ace826a42f35b4740539bedf07', +        'md5': '29832b05028ead1b58be86bf319397ca',          'info_dict': {              'id': 'af511e2527aac', -            'ext': 'flv', +            'ext': 'mp4',              'title': 'Funny Cats and Animals Compilation june 2013', +            'upload_date': '20130913', +            'view_count': int,          } -    } - -    def _extract_video(self, video_id, file_key, error_url=None, try_num=0): - -        if try_num > self._MAX_TRIES - 1: -            raise ExtractorError('Unable to extract video URL', expected=True) - -        form = { -            'file': video_id, -            'key': file_key, -        } - -        if error_url: -            form.update({ -                'numOfErrors': try_num, -                'errorCode': '404', -                'errorUrl': error_url, -            }) +    }, { +        'url': 'http://www.cloudy.ec/embed.php?autoplay=1&id=af511e2527aac', +        'only_matching': True, +    }] -        player_data = self._download_webpage( -            self._API_URL, video_id, 'Downloading player data', query=form) -        data = compat_parse_qs(player_data) - -        try_num += 1 - -        if 'error' in data: -            raise ExtractorError( -                '%s error: %s' % (self.IE_NAME, ' '.join(data['error_msg'])), -                expected=True) +    def _real_extract(self, url): +        video_id = self._match_id(url) -        title = data.get('title', [None])[0] -        if title: -            title = remove_end(title, '&asdasdas').strip() +        webpage = self._download_webpage( +            'http://www.cloudy.ec/embed.php?id=%s' % video_id, video_id) -        video_url = data.get('url', [None])[0] +        info = self._parse_html5_media_entries(url, webpage, video_id)[0] -        if video_url: -            try: -                self._request_webpage(HEADRequest(video_url), video_id, 'Checking video URL') -            except ExtractorError as e: -                if isinstance(e.cause, compat_HTTPError) and e.cause.code in [404, 410]: -                    self.report_warning('Invalid video URL, requesting another', video_id) -                    return self._extract_video(video_id, file_key, video_url, try_num) +        webpage = self._download_webpage( +            'https://www.cloudy.ec/v/%s' % video_id, video_id, fatal=False) -        return { -            'id': video_id, -            'url': video_url, -            'title': title, -        } - -    def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        video_id = mobj.group('id') +        if webpage: +            info.update({ +                'title': self._search_regex( +                    r'<h\d[^>]*>([^<]+)<', webpage, 'title'), +                'upload_date': unified_strdate(self._search_regex( +                    r'>Published at (\d{4}-\d{1,2}-\d{1,2})', webpage, +                    'upload date', fatal=False)), +                'view_count': str_to_int(self._search_regex( +                    r'([\d,.]+) views<', webpage, 'view count', fatal=False)), +            }) -        url = self._EMBED_URL % video_id -        webpage = self._download_webpage(url, video_id) +        if not info.get('title'): +            info['title'] = video_id -        file_key = self._search_regex( -            [r'key\s*:\s*"([^"]+)"', r'filekey\s*=\s*"([^"]+)"'], -            webpage, 'file_key') +        info['id'] = video_id -        return self._extract_video(video_id, file_key) +        return info diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index c2ca73ee1..6c3c095f7 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -36,34 +36,35 @@ from ..utils import (      clean_html,      compiled_regex_type,      determine_ext, +    determine_protocol,      error_to_compat_str,      ExtractorError, +    extract_attributes,      fix_xml_ampersands,      float_or_none,      GeoRestrictedError,      GeoUtils,      int_or_none,      js_to_json, +    mimetype2ext, +    orderedSet, +    parse_codecs, +    parse_duration,      parse_iso8601, +    parse_m3u8_attributes,      RegexNotFoundError, -    sanitize_filename,      sanitized_Request, +    sanitize_filename,      unescapeHTML,      unified_strdate,      unified_timestamp, +    update_Request, +    update_url_query, +    urljoin,      url_basename,      xpath_element,      xpath_text,      xpath_with_ns, -    determine_protocol, -    parse_duration, -    mimetype2ext, -    update_Request, -    update_url_query, -    parse_m3u8_attributes, -    extract_attributes, -    parse_codecs, -    urljoin,  ) @@ -714,6 +715,13 @@ class InfoExtractor(object):              video_info['title'] = video_title          return video_info +    def playlist_from_matches(self, matches, video_id, video_title, getter=None, ie=None): +        urlrs = orderedSet( +            self.url_result(self._proto_relative_url(getter(m) if getter else m), ie) +            for m in matches) +        return self.playlist_result( +            urlrs, playlist_id=video_id, playlist_title=video_title) +      @staticmethod      def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):          """Returns a playlist""" @@ -2204,56 +2212,9 @@ class InfoExtractor(object):              this_video_id = video_id or video_data['mediaid'] -            formats = [] -            for source in video_data['sources']: -                source_url = self._proto_relative_url(source['file']) -                if base_url: -                    source_url = compat_urlparse.urljoin(base_url, source_url) -                source_type = source.get('type') or '' -                ext = mimetype2ext(source_type) or determine_ext(source_url) -                if source_type == 'hls' or ext == 'm3u8': -                    formats.extend(self._extract_m3u8_formats( -                        source_url, this_video_id, 'mp4', 'm3u8_native', m3u8_id=m3u8_id, fatal=False)) -                elif ext == 'mpd': -                    formats.extend(self._extract_mpd_formats( -                        source_url, this_video_id, mpd_id=mpd_id, fatal=False)) -                # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67 -                elif source_type.startswith('audio') or ext in ('oga', 'aac', 'mp3', 'mpeg', 'vorbis'): -                    formats.append({ -                        'url': source_url, -                        'vcodec': 'none', -                        'ext': ext, -                    }) -                else: -                    height = int_or_none(source.get('height')) -                    if height is None: -                        # Often no height is provided but there is a label in -                        # format like 1080p. -                        height = int_or_none(self._search_regex( -                            r'^(\d{3,})[pP]$', source.get('label') or '', -                            'height', default=None)) -                    a_format = { -                        'url': source_url, -                        'width': int_or_none(source.get('width')), -                        'height': height, -                        'ext': ext, -                    } -                    if source_url.startswith('rtmp'): -                        a_format['ext'] = 'flv' - -                        # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as -                        # of jwplayer.flash.swf -                        rtmp_url_parts = re.split( -                            r'((?:mp4|mp3|flv):)', source_url, 1) -                        if len(rtmp_url_parts) == 3: -                            rtmp_url, prefix, play_path = rtmp_url_parts -                            a_format.update({ -                                'url': rtmp_url, -                                'play_path': prefix + play_path, -                            }) -                        if rtmp_params: -                            a_format.update(rtmp_params) -                    formats.append(a_format) +            formats = self._parse_jwplayer_formats( +                video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id, +                mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)              self._sort_formats(formats)              subtitles = {} @@ -2284,6 +2245,65 @@ class InfoExtractor(object):          else:              return self.playlist_result(entries) +    def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None, +                                m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None): +        formats = [] +        for source in jwplayer_sources_data: +            source_url = self._proto_relative_url(source['file']) +            if base_url: +                source_url = compat_urlparse.urljoin(base_url, source_url) +            source_type = source.get('type') or '' +            ext = mimetype2ext(source_type) or determine_ext(source_url) +            if source_type == 'hls' or ext == 'm3u8': +                formats.extend(self._extract_m3u8_formats( +                    source_url, video_id, 'mp4', entry_protocol='m3u8_native', +                    m3u8_id=m3u8_id, fatal=False)) +            elif ext == 'mpd': +                formats.extend(self._extract_mpd_formats( +                    source_url, video_id, mpd_id=mpd_id, fatal=False)) +            elif ext == 'smil': +                formats.extend(self._extract_smil_formats( +                    source_url, video_id, fatal=False)) +            # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67 +            elif source_type.startswith('audio') or ext in ( +                    'oga', 'aac', 'mp3', 'mpeg', 'vorbis'): +                formats.append({ +                    'url': source_url, +                    'vcodec': 'none', +                    'ext': ext, +                }) +            else: +                height = int_or_none(source.get('height')) +                if height is None: +                    # Often no height is provided but there is a label in +                    # format like "1080p", "720p SD", or 1080. +                    height = int_or_none(self._search_regex( +                        r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''), +                        'height', default=None)) +                a_format = { +                    'url': source_url, +                    'width': int_or_none(source.get('width')), +                    'height': height, +                    'tbr': int_or_none(source.get('bitrate')), +                    'ext': ext, +                } +                if source_url.startswith('rtmp'): +                    a_format['ext'] = 'flv' +                    # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as +                    # of jwplayer.flash.swf +                    rtmp_url_parts = re.split( +                        r'((?:mp4|mp3|flv):)', source_url, 1) +                    if len(rtmp_url_parts) == 3: +                        rtmp_url, prefix, play_path = rtmp_url_parts +                        a_format.update({ +                            'url': rtmp_url, +                            'play_path': prefix + play_path, +                        }) +                    if rtmp_params: +                        a_format.update(rtmp_params) +                formats.append(a_format) +        return formats +      def _live_title(self, name):          """ Generate the title for a live video """          now = datetime.datetime.now() diff --git a/youtube_dl/extractor/condenast.py b/youtube_dl/extractor/condenast.py index 8d8f60598..d3463b874 100644 --- a/youtube_dl/extractor/condenast.py +++ b/youtube_dl/extractor/condenast.py @@ -9,13 +9,14 @@ from ..compat import (      compat_urlparse,  )  from ..utils import ( -    orderedSet, -    remove_end, -    extract_attributes, -    mimetype2ext,      determine_ext, +    extract_attributes,      int_or_none, +    js_to_json, +    mimetype2ext, +    orderedSet,      parse_iso8601, +    remove_end,  ) @@ -67,6 +68,16 @@ class CondeNastIE(InfoExtractor):              'timestamp': 1363219200,          }      }, { +        'url': 'http://video.gq.com/watch/the-closer-with-keith-olbermann-the-only-true-surprise-trump-s-an-idiot?c=series', +        'info_dict': { +            'id': '58d1865bfd2e6126e2000015', +            'ext': 'mp4', +            'title': 'The Only True Surprise? Trump’s an Idiot', +            'uploader': 'gq', +            'upload_date': '20170321', +            'timestamp': 1490126427, +        }, +    }, {          # JS embed          'url': 'http://player.cnevids.com/embedjs/55f9cf8b61646d1acf00000c/5511d76261646d5566020000.js',          'md5': 'f1a6f9cafb7083bab74a710f65d08999', @@ -114,26 +125,33 @@ class CondeNastIE(InfoExtractor):              })          video_id = query['videoId']          video_info = None -        info_page = self._download_webpage( +        info_page = self._download_json(              'http://player.cnevids.com/player/video.js', -            video_id, 'Downloading video info', query=query, fatal=False) +            video_id, 'Downloading video info', fatal=False, query=query)          if info_page: -            video_info = self._parse_json(self._search_regex( -                r'loadCallback\(({.+})\)', info_page, 'video info'), video_id)['video'] -        else: +            video_info = info_page.get('video') +        if not video_info:              info_page = self._download_webpage(                  'http://player.cnevids.com/player/loader.js',                  video_id, 'Downloading loader info', query=query) -            video_info = self._parse_json(self._search_regex( -                r'var\s+video\s*=\s*({.+?});', info_page, 'video info'), video_id) +            video_info = self._parse_json( +                self._search_regex( +                    r'(?s)var\s+config\s*=\s*({.+?});', info_page, 'config'), +                video_id, transform_source=js_to_json)['video'] +          title = video_info['title']          formats = [] -        for fdata in video_info.get('sources', [{}])[0]: +        for fdata in video_info['sources']:              src = fdata.get('src')              if not src:                  continue              ext = mimetype2ext(fdata.get('type')) or determine_ext(src) +            if ext == 'm3u8': +                formats.extend(self._extract_m3u8_formats( +                    src, video_id, 'mp4', entry_protocol='m3u8_native', +                    m3u8_id='hls', fatal=False)) +                continue              quality = fdata.get('quality')              formats.append({                  'format_id': ext + ('-%s' % quality if quality else ''), @@ -169,7 +187,6 @@ class CondeNastIE(InfoExtractor):                  path=remove_end(parsed_url.path, '.js').replace('/embedjs/', '/embed/')))              url_type = 'embed' -        self.to_screen('Extracting from %s with the Condé Nast extractor' % self._SITES[site])          webpage = self._download_webpage(url, item_id)          if url_type == 'series': diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 9c6cf00ca..d15fd3744 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -177,6 +177,7 @@ class CrunchyrollIE(CrunchyrollBaseIE):              'uploader': 'Kadokawa Pictures Inc.',              'upload_date': '20170118',              'series': "KONOSUBA -God's blessing on this wonderful world!", +            'season': "KONOSUBA -God's blessing on this wonderful world! 2",              'season_number': 2,              'episode': 'Give Me Deliverance from this Judicial Injustice!',              'episode_number': 1, @@ -222,6 +223,23 @@ class CrunchyrollIE(CrunchyrollBaseIE):              # just test metadata extraction              'skip_download': True,          }, +    }, { +        # A video with a vastly different season name compared to the series name +        'url': 'http://www.crunchyroll.com/nyarko-san-another-crawling-chaos/episode-1-test-590532', +        'info_dict': { +            'id': '590532', +            'ext': 'mp4', +            'title': 'Haiyoru! Nyaruani (ONA) Episode 1 – Test', +            'description': 'Mahiro and Nyaruko talk about official certification.', +            'uploader': 'TV TOKYO', +            'upload_date': '20120305', +            'series': 'Nyarko-san: Another Crawling Chaos', +            'season': 'Haiyoru! Nyaruani (ONA)', +        }, +        'params': { +            # Just test metadata extraction +            'skip_download': True, +        },      }]      _FORMAT_IDS = { @@ -491,7 +509,8 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text          # webpage provide more accurate data than series_title from XML          series = self._html_search_regex(              r'id=["\']showmedia_about_episode_num[^>]+>\s*<a[^>]+>([^<]+)', -            webpage, 'series', default=xpath_text(metadata, 'series_title')) +            webpage, 'series', fatal=False) +        season = xpath_text(metadata, 'series_title')          episode = xpath_text(metadata, 'episode_title')          episode_number = int_or_none(xpath_text(metadata, 'episode_number')) @@ -508,6 +527,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text              'uploader': video_uploader,              'upload_date': video_upload_date,              'series': series, +            'season': season,              'season_number': season_number,              'episode': episode,              'episode_number': episode_number, diff --git a/youtube_dl/extractor/discoverygo.py b/youtube_dl/extractor/discoverygo.py index 2042493a8..7cd5d4291 100644 --- a/youtube_dl/extractor/discoverygo.py +++ b/youtube_dl/extractor/discoverygo.py @@ -1,17 +1,21 @@  from __future__ import unicode_literals +import re +  from .common import InfoExtractor  from ..compat import compat_str  from ..utils import (      extract_attributes, +    ExtractorError,      int_or_none,      parse_age_limit, -    ExtractorError, +    remove_end, +    unescapeHTML,  ) -class DiscoveryGoIE(InfoExtractor): -    _VALID_URL = r'''(?x)https?://(?:www\.)?(?: +class DiscoveryGoBaseIE(InfoExtractor): +    _VALID_URL_TEMPLATE = r'''(?x)https?://(?:www\.)?(?:              discovery|              investigationdiscovery|              discoverylife| @@ -21,18 +25,23 @@ class DiscoveryGoIE(InfoExtractor):              sciencechannel|              tlc|              velocitychannel -        )go\.com/(?:[^/]+/)*(?P<id>[^/?#&]+)''' +        )go\.com/%s(?P<id>[^/?#&]+)''' + + +class DiscoveryGoIE(DiscoveryGoBaseIE): +    _VALID_URL = DiscoveryGoBaseIE._VALID_URL_TEMPLATE % r'(?:[^/]+/)+' +    _GEO_COUNTRIES = ['US']      _TEST = { -        'url': 'https://www.discoverygo.com/love-at-first-kiss/kiss-first-ask-questions-later/', +        'url': 'https://www.discoverygo.com/bering-sea-gold/reaper-madness/',          'info_dict': { -            'id': '57a33c536b66d1cd0345eeb1', +            'id': '58c167d86b66d12f2addeb01',              'ext': 'mp4', -            'title': 'Kiss First, Ask Questions Later!', -            'description': 'md5:fe923ba34050eae468bffae10831cb22', -            'duration': 2579, -            'series': 'Love at First Kiss', -            'season_number': 1, -            'episode_number': 1, +            'title': 'Reaper Madness', +            'description': 'md5:09f2c625c99afb8946ed4fb7865f6e78', +            'duration': 2519, +            'series': 'Bering Sea Gold', +            'season_number': 8, +            'episode_number': 6,              'age_limit': 14,          },      } @@ -113,3 +122,46 @@ class DiscoveryGoIE(InfoExtractor):              'formats': formats,              'subtitles': subtitles,          } + + +class DiscoveryGoPlaylistIE(DiscoveryGoBaseIE): +    _VALID_URL = DiscoveryGoBaseIE._VALID_URL_TEMPLATE % '' +    _TEST = { +        'url': 'https://www.discoverygo.com/bering-sea-gold/', +        'info_dict': { +            'id': 'bering-sea-gold', +            'title': 'Bering Sea Gold', +            'description': 'md5:cc5c6489835949043c0cc3ad66c2fa0e', +        }, +        'playlist_mincount': 6, +    } + +    @classmethod +    def suitable(cls, url): +        return False if DiscoveryGoIE.suitable(url) else super( +            DiscoveryGoPlaylistIE, cls).suitable(url) + +    def _real_extract(self, url): +        display_id = self._match_id(url) + +        webpage = self._download_webpage(url, display_id) + +        entries = [] +        for mobj in re.finditer(r'data-json=(["\'])(?P<json>{.+?})\1', webpage): +            data = self._parse_json( +                mobj.group('json'), display_id, +                transform_source=unescapeHTML, fatal=False) +            if not isinstance(data, dict) or data.get('type') != 'episode': +                continue +            episode_url = data.get('socialUrl') +            if not episode_url: +                continue +            entries.append(self.url_result( +                episode_url, ie=DiscoveryGoIE.ie_key(), +                video_id=data.get('id'))) + +        return self.playlist_result( +            entries, display_id, +            remove_end(self._og_search_title( +                webpage, fatal=False), ' | Discovery GO'), +            self._og_search_description(webpage)) diff --git a/youtube_dl/extractor/tlc.py b/youtube_dl/extractor/discoverynetworks.py index fd145ba42..b6653784c 100644 --- a/youtube_dl/extractor/tlc.py +++ b/youtube_dl/extractor/discoverynetworks.py @@ -9,13 +9,13 @@ from ..compat import (      compat_parse_qs,      compat_urlparse,  ) +from ..utils import smuggle_url -class TlcDeIE(InfoExtractor): -    IE_NAME = 'tlc.de' -    _VALID_URL = r'https?://(?:www\.)?tlc\.de/(?:[^/]+/)*videos/(?P<title>[^/?#]+)?(?:.*#(?P<id>\d+))?' +class DiscoveryNetworksDeIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?(?:discovery|tlc|animalplanet|dmax)\.de/(?:.*#(?P<id>\d+)|(?:[^/]+/)*videos/(?P<title>[^/?#]+))' -    _TEST = { +    _TESTS = [{          'url': 'http://www.tlc.de/sendungen/breaking-amish/videos/#3235167922001',          'info_dict': {              'id': '3235167922001', @@ -29,7 +29,13 @@ class TlcDeIE(InfoExtractor):              'upload_date': '20140404',              'uploader_id': '1659832546',          }, -    } +    }, { +        'url': 'http://www.dmax.de/programme/storage-hunters-uk/videos/storage-hunters-uk-episode-6/', +        'only_matching': True, +    }, { +        'url': 'http://www.discovery.de/#5332316765001', +        'only_matching': True, +    }]      BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1659832546/default_default/index.html?videoId=%s'      def _real_extract(self, url): @@ -39,5 +45,8 @@ class TlcDeIE(InfoExtractor):              title = mobj.group('title')              webpage = self._download_webpage(url, title)              brightcove_legacy_url = BrightcoveLegacyIE._extract_brightcove_url(webpage) -            brightcove_id = compat_parse_qs(compat_urlparse.urlparse(brightcove_legacy_url).query)['@videoPlayer'][0] -        return self.url_result(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', brightcove_id) +            brightcove_id = compat_parse_qs(compat_urlparse.urlparse( +                brightcove_legacy_url).query)['@videoPlayer'][0] +        return self.url_result(smuggle_url( +            self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, {'geo_countries': ['DE']}), +            'BrightcoveNew', brightcove_id) diff --git a/youtube_dl/extractor/douyutv.py b/youtube_dl/extractor/douyutv.py index 9a83fb31a..82d8a042f 100644 --- a/youtube_dl/extractor/douyutv.py +++ b/youtube_dl/extractor/douyutv.py @@ -1,6 +1,9 @@  # coding: utf-8  from __future__ import unicode_literals +import time +import hashlib +  from .common import InfoExtractor  from ..utils import (      ExtractorError, @@ -16,7 +19,7 @@ class DouyuTVIE(InfoExtractor):          'info_dict': {              'id': '17732',              'display_id': 'iseven', -            'ext': 'mp4', +            'ext': 'flv',              'title': 're:^清晨醒脑!T-ARA根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',              'description': r're:.*m7show@163\.com.*',              'thumbnail': r're:^https?://.*\.jpg$', @@ -31,7 +34,7 @@ class DouyuTVIE(InfoExtractor):          'info_dict': {              'id': '85982',              'display_id': '85982', -            'ext': 'mp4', +            'ext': 'flv',              'title': 're:^小漠从零单排记!——CSOL2躲猫猫 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',              'description': 'md5:746a2f7a253966a06755a912f0acc0d2',              'thumbnail': r're:^https?://.*\.jpg$', @@ -47,7 +50,7 @@ class DouyuTVIE(InfoExtractor):          'info_dict': {              'id': '17732',              'display_id': '17732', -            'ext': 'mp4', +            'ext': 'flv',              'title': 're:^清晨醒脑!T-ARA根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',              'description': r're:.*m7show@163\.com.*',              'thumbnail': r're:^https?://.*\.jpg$', @@ -66,10 +69,6 @@ class DouyuTVIE(InfoExtractor):          'only_matching': True,      }] -    # Decompile core.swf in webpage by ffdec "Search SWFs in memory". core.swf -    # is encrypted originally, but ffdec can dump memory to get the decrypted one. -    _API_KEY = 'A12Svb&%1UUmf@hC' -      def _real_extract(self, url):          video_id = self._match_id(url) @@ -80,6 +79,7 @@ class DouyuTVIE(InfoExtractor):              room_id = self._html_search_regex(                  r'"room_id\\?"\s*:\s*(\d+),', page, 'room id') +        # Grab metadata from mobile API          room = self._download_json(              'http://m.douyu.com/html5/live?roomId=%s' % room_id, video_id,              note='Downloading room info')['data'] @@ -88,8 +88,19 @@ class DouyuTVIE(InfoExtractor):          if room.get('show_status') == '2':              raise ExtractorError('Live stream is offline', expected=True) -        formats = self._extract_m3u8_formats( -            room['hls_url'], video_id, ext='mp4') +        # Grab the URL from PC client API +        # The m3u8 url from mobile API requires re-authentication every 5 minutes +        tt = int(time.time()) +        signContent = 'lapi/live/thirdPart/getPlay/%s?aid=pcclient&rate=0&time=%d9TUk5fjjUjg9qIMH3sdnh' % (room_id, tt) +        sign = hashlib.md5(signContent.encode('ascii')).hexdigest() +        video_url = self._download_json( +            'http://coapi.douyucdn.cn/lapi/live/thirdPart/getPlay/' + room_id, +            video_id, note='Downloading video URL info', +            query={'rate': 0}, headers={ +                'auth': sign, +                'time': str(tt), +                'aid': 'pcclient' +            })['data']['live_url']          title = self._live_title(unescapeHTML(room['room_name']))          description = room.get('show_details') @@ -99,7 +110,7 @@ class DouyuTVIE(InfoExtractor):          return {              'id': room_id,              'display_id': video_id, -            'formats': formats, +            'url': video_url,              'title': title,              'description': description,              'thumbnail': thumbnail, diff --git a/youtube_dl/extractor/dplay.py b/youtube_dl/extractor/dplay.py index 32028bc3b..87c5dd63e 100644 --- a/youtube_dl/extractor/dplay.py +++ b/youtube_dl/extractor/dplay.py @@ -6,37 +6,24 @@ import re  import time  from .common import InfoExtractor -from ..compat import compat_urlparse +from ..compat import ( +    compat_urlparse, +    compat_HTTPError, +)  from ..utils import (      USER_AGENTS, +    ExtractorError,      int_or_none, +    unified_strdate, +    remove_end,      update_url_query,  )  class DPlayIE(InfoExtractor): -    _VALID_URL = r'https?://(?P<domain>it\.dplay\.com|www\.dplay\.(?:dk|se|no))/[^/]+/(?P<id>[^/?#]+)' +    _VALID_URL = r'https?://(?P<domain>www\.dplay\.(?:dk|se|no))/[^/]+/(?P<id>[^/?#]+)'      _TESTS = [{ -        # geo restricted, via direct unsigned hls URL -        'url': 'http://it.dplay.com/take-me-out/stagione-1-episodio-25/', -        'info_dict': { -            'id': '1255600', -            'display_id': 'stagione-1-episodio-25', -            'ext': 'mp4', -            'title': 'Episodio 25', -            'description': 'md5:cae5f40ad988811b197d2d27a53227eb', -            'duration': 2761, -            'timestamp': 1454701800, -            'upload_date': '20160205', -            'creator': 'RTIT', -            'series': 'Take me out', -            'season_number': 1, -            'episode_number': 25, -            'age_limit': 0, -        }, -        'expected_warnings': ['Unable to download f4m manifest'], -    }, {          # non geo restricted, via secure api, unsigned download hls URL          'url': 'http://www.dplay.se/nugammalt-77-handelser-som-format-sverige/season-1-svensken-lar-sig-njuta-av-livet/',          'info_dict': { @@ -168,3 +155,90 @@ class DPlayIE(InfoExtractor):              'formats': formats,              'subtitles': subtitles,          } + + +class DPlayItIE(InfoExtractor): +    _VALID_URL = r'https?://it\.dplay\.com/[^/]+/[^/]+/(?P<id>[^/?#]+)' +    _GEO_COUNTRIES = ['IT'] +    _TEST = { +        'url': 'http://it.dplay.com/nove/biografie-imbarazzanti/luigi-di-maio-la-psicosi-di-stanislawskij/', +        'md5': '2b808ffb00fc47b884a172ca5d13053c', +        'info_dict': { +            'id': '6918', +            'display_id': 'luigi-di-maio-la-psicosi-di-stanislawskij', +            'ext': 'mp4', +            'title': 'Biografie imbarazzanti: Luigi Di Maio: la psicosi di Stanislawskij', +            'description': 'md5:3c7a4303aef85868f867a26f5cc14813', +            'thumbnail': r're:^https?://.*\.jpe?g', +            'upload_date': '20160524', +            'series': 'Biografie imbarazzanti', +            'season_number': 1, +            'episode': 'Luigi Di Maio: la psicosi di Stanislawskij', +            'episode_number': 1, +        }, +    } + +    def _real_extract(self, url): +        display_id = self._match_id(url) + +        webpage = self._download_webpage(url, display_id) + +        info_url = self._search_regex( +            r'url\s*:\s*["\']((?:https?:)?//[^/]+/playback/videoPlaybackInfo/\d+)', +            webpage, 'video id') + +        title = remove_end(self._og_search_title(webpage), ' | Dplay') + +        try: +            info = self._download_json( +                info_url, display_id, headers={ +                    'Authorization': 'Bearer %s' % self._get_cookies(url).get( +                        'dplayit_token').value, +                    'Referer': url, +                }) +        except ExtractorError as e: +            if isinstance(e.cause, compat_HTTPError) and e.cause.code in (400, 403): +                info = self._parse_json(e.cause.read().decode('utf-8'), display_id) +                error = info['errors'][0] +                if error.get('code') == 'access.denied.geoblocked': +                    self.raise_geo_restricted( +                        msg=error.get('detail'), countries=self._GEO_COUNTRIES) +                raise ExtractorError(info['errors'][0]['detail'], expected=True) +            raise + +        hls_url = info['data']['attributes']['streaming']['hls']['url'] + +        formats = self._extract_m3u8_formats( +            hls_url, display_id, ext='mp4', entry_protocol='m3u8_native', +            m3u8_id='hls') + +        series = self._html_search_regex( +            r'(?s)<h1[^>]+class=["\'].*?\bshow_title\b.*?["\'][^>]*>(.+?)</h1>', +            webpage, 'series', fatal=False) +        episode = self._search_regex( +            r'<p[^>]+class=["\'].*?\bdesc_ep\b.*?["\'][^>]*>\s*<br/>\s*<b>([^<]+)', +            webpage, 'episode', fatal=False) + +        mobj = re.search( +            r'(?s)<span[^>]+class=["\']dates["\'][^>]*>.+?\bS\.(?P<season_number>\d+)\s+E\.(?P<episode_number>\d+)\s*-\s*(?P<upload_date>\d{2}/\d{2}/\d{4})', +            webpage) +        if mobj: +            season_number = int(mobj.group('season_number')) +            episode_number = int(mobj.group('episode_number')) +            upload_date = unified_strdate(mobj.group('upload_date')) +        else: +            season_number = episode_number = upload_date = None + +        return { +            'id': info_url.rpartition('/')[-1], +            'display_id': display_id, +            'title': title, +            'description': self._og_search_description(webpage), +            'thumbnail': self._og_search_thumbnail(webpage), +            'series': series, +            'season_number': season_number, +            'episode': episode, +            'episode_number': episode_number, +            'upload_date': upload_date, +            'formats': formats, +        } diff --git a/youtube_dl/extractor/drtv.py b/youtube_dl/extractor/drtv.py index e966d7483..e4917014a 100644 --- a/youtube_dl/extractor/drtv.py +++ b/youtube_dl/extractor/drtv.py @@ -15,6 +15,8 @@ from ..utils import (  class DRTVIE(InfoExtractor):      _VALID_URL = r'https?://(?:www\.)?dr\.dk/(?:tv/se|nyheder|radio/ondemand)/(?:[^/]+/)*(?P<id>[\da-z-]+)(?:[/#?]|$)' +    _GEO_BYPASS = False +    _GEO_COUNTRIES = ['DK']      IE_NAME = 'drtv'      _TESTS = [{          'url': 'https://www.dr.dk/tv/se/boern/ultra/klassen-ultra/klassen-darlig-taber-10', @@ -137,7 +139,7 @@ class DRTVIE(InfoExtractor):          if not formats and restricted_to_denmark:              self.raise_geo_restricted(                  'Unfortunately, DR is not allowed to show this program outside Denmark.', -                expected=True) +                countries=self._GEO_COUNTRIES)          self._sort_formats(formats) @@ -156,6 +158,7 @@ class DRTVIE(InfoExtractor):  class DRTVLiveIE(InfoExtractor):      IE_NAME = 'drtv:live'      _VALID_URL = r'https?://(?:www\.)?dr\.dk/(?:tv|TV)/live/(?P<id>[\da-z-]+)' +    _GEO_COUNTRIES = ['DK']      _TEST = {          'url': 'https://www.dr.dk/tv/live/dr1',          'info_dict': { diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index b1613a9d3..6a7028a4d 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -71,6 +71,7 @@ from .arte import (  )  from .atresplayer import AtresPlayerIE  from .atttechchannel import ATTTechChannelIE +from .atvat import ATVAtIE  from .audimedia import AudiMediaIE  from .audioboom import AudioBoomIE  from .audiomack import AudiomackIE, AudiomackAlbumIE @@ -117,6 +118,7 @@ from .bleacherreport import (  from .blinkx import BlinkxIE  from .bloomberg import BloombergIE  from .bokecc import BokeCCIE +from .bostonglobe import BostonGlobeIE  from .bpb import BpbIE  from .br import BRIE  from .bravotv import BravoTVIE @@ -246,7 +248,10 @@ from .dfb import DFBIE  from .dhm import DHMIE  from .dotsub import DotsubIE  from .douyutv import DouyuTVIE -from .dplay import DPlayIE +from .dplay import ( +    DPlayIE, +    DPlayItIE, +)  from .dramafever import (      DramaFeverIE,      DramaFeverSeriesIE, @@ -262,7 +267,11 @@ from .dvtv import DVTVIE  from .dumpert import DumpertIE  from .defense import DefenseGouvFrIE  from .discovery import DiscoveryIE -from .discoverygo import DiscoveryGoIE +from .discoverygo import ( +    DiscoveryGoIE, +    DiscoveryGoPlaylistIE, +) +from .discoverynetworks import DiscoveryNetworksDeIE  from .disney import DisneyIE  from .dispeak import DigitallySpeakingIE  from .dropbox import DropboxIE @@ -793,6 +802,7 @@ from .rai import (  )  from .rbmaradio import RBMARadioIE  from .rds import RDSIE +from .redbulltv import RedBullTVIE  from .redtube import RedTubeIE  from .regiotv import RegioTVIE  from .rentv import ( @@ -966,7 +976,6 @@ from .thisav import ThisAVIE  from .thisoldhouse import ThisOldHouseIE  from .threeqsdn import ThreeQSDNIE  from .tinypic import TinyPicIE -from .tlc import TlcDeIE  from .tmz import (      TMZIE,      TMZArticleIE, @@ -979,6 +988,7 @@ from .tnaflix import (  )  from .toggle import ToggleIE  from .tonline import TOnlineIE +from .toongoggles import ToonGogglesIE  from .toutv import TouTvIE  from .toypics import ToypicsUserIE, ToypicsIE  from .traileraddict import TrailerAddictIE @@ -999,6 +1009,7 @@ from .tunein import (      TuneInTopicIE,      TuneInShortenerIE,  ) +from .tunepk import TunePkIE  from .turbo import TurboIE  from .tutv import TutvIE  from .tv2 import ( @@ -1165,6 +1176,8 @@ from .voicerepublic import VoiceRepublicIE  from .voxmedia import VoxMediaIE  from .vporn import VpornIE  from .vrt import VRTIE +from .vrak import VrakIE +from .medialaan import MedialaanIE  from .vube import VubeIE  from .vuclip import VuClipIE  from .vvvvid import VVVVIDIE diff --git a/youtube_dl/extractor/eyedotv.py b/youtube_dl/extractor/eyedotv.py index 2f3035147..f62ddebae 100644 --- a/youtube_dl/extractor/eyedotv.py +++ b/youtube_dl/extractor/eyedotv.py @@ -54,7 +54,7 @@ class EyedoTVIE(InfoExtractor):              'id': video_id,              'title': title,              'formats': self._extract_m3u8_formats( -                m3u8_url, video_id, 'mp4', 'm3u8' if is_live else 'm3u8_native'), +                m3u8_url, video_id, 'mp4', 'm3u8_native'),              'description': xpath_text(video_data, _add_ns('Description')),              'duration': parse_duration(xpath_text(video_data, _add_ns('Duration'))),              'uploader': xpath_text(video_data, _add_ns('Createur')), diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 70b8c95c5..b69c1ede0 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -196,6 +196,10 @@ class FacebookIE(InfoExtractor):      }, {          'url': 'https://www.facebookcorewwwi.onion/video.php?v=274175099429670',          'only_matching': True, +    }, { +        # no title +        'url': 'https://www.facebook.com/onlycleverentertainment/videos/1947995502095005/', +        'only_matching': True,      }]      @staticmethod @@ -303,7 +307,7 @@ class FacebookIE(InfoExtractor):          if not video_data:              server_js_data = self._parse_json(                  self._search_regex( -                    r'bigPipe\.onPageletArrive\(({.+?})\)\s*;\s*}\s*\)\s*,\s*["\']onPageletArrive\s+(?:stream_pagelet|pagelet_group_mall)', +                    r'bigPipe\.onPageletArrive\(({.+?})\)\s*;\s*}\s*\)\s*,\s*["\']onPageletArrive\s+(?:stream_pagelet|pagelet_group_mall|permalink_video_pagelet)',                      webpage, 'js data', default='{}'),                  video_id, transform_source=js_to_json, fatal=False)              if server_js_data: @@ -353,15 +357,15 @@ class FacebookIE(InfoExtractor):          self._sort_formats(formats)          video_title = self._html_search_regex( -            r'<h2\s+[^>]*class="uiHeaderTitle"[^>]*>([^<]*)</h2>', webpage, 'title', -            default=None) +            r'<h2\s+[^>]*class="uiHeaderTitle"[^>]*>([^<]*)</h2>', webpage, +            'title', default=None)          if not video_title:              video_title = self._html_search_regex(                  r'(?s)<span class="fbPhotosPhotoCaption".*?id="fbPhotoPageCaption"><span class="hasCaption">(.*?)</span>',                  webpage, 'alternative title', default=None)          if not video_title:              video_title = self._html_search_meta( -                'description', webpage, 'title') +                'description', webpage, 'title', default=None)          if video_title:              video_title = limit_length(video_title, 80)          else: diff --git a/youtube_dl/extractor/fox.py b/youtube_dl/extractor/fox.py index 9f2e5d065..159fdf9c4 100644 --- a/youtube_dl/extractor/fox.py +++ b/youtube_dl/extractor/fox.py @@ -47,9 +47,12 @@ class FOXIE(AdobePassIE):              resource = self._get_mvpd_resource('fbc-fox', None, ap_p['videoGUID'], rating)              query['auth'] = self._extract_mvpd_auth(url, video_id, 'fbc-fox', resource) -        return { +        info = self._search_json_ld(webpage, video_id, fatal=False) +        info.update({              '_type': 'url_transparent',              'ie_key': 'ThePlatform',              'url': smuggle_url(update_url_query(release_url, query), {'force_smil_url': True}),              'id': video_id, -        } +        }) + +        return info diff --git a/youtube_dl/extractor/franceculture.py b/youtube_dl/extractor/franceculture.py index b98da692c..b8fa17588 100644 --- a/youtube_dl/extractor/franceculture.py +++ b/youtube_dl/extractor/franceculture.py @@ -4,7 +4,8 @@ from __future__ import unicode_literals  from .common import InfoExtractor  from ..utils import (      determine_ext, -    unified_strdate, +    extract_attributes, +    int_or_none,  ) @@ -19,6 +20,7 @@ class FranceCultureIE(InfoExtractor):              'title': 'Rendez-vous au pays des geeks',              'thumbnail': r're:^https?://.*\.jpg$',              'upload_date': '20140301', +            'timestamp': 1393642916,              'vcodec': 'none',          }      } @@ -28,30 +30,34 @@ class FranceCultureIE(InfoExtractor):          webpage = self._download_webpage(url, display_id) -        video_url = self._search_regex( -            r'(?s)<div[^>]+class="[^"]*?title-zone-diffusion[^"]*?"[^>]*>.*?<button[^>]+data-asset-source="([^"]+)"', -            webpage, 'video path') +        video_data = extract_attributes(self._search_regex( +            r'(?s)<div[^>]+class="[^"]*?(?:title-zone-diffusion|heading-zone-(?:wrapper|player-button))[^"]*?"[^>]*>.*?(<button[^>]+data-asset-source="[^"]+"[^>]+>)', +            webpage, 'video data')) -        title = self._og_search_title(webpage) +        video_url = video_data['data-asset-source'] +        title = video_data.get('data-asset-title') or self._og_search_title(webpage) -        upload_date = unified_strdate(self._search_regex( -            '(?s)<div[^>]+class="date"[^>]*>.*?<span[^>]+class="inner"[^>]*>([^<]+)<', -            webpage, 'upload date', fatal=False)) +        description = self._html_search_regex( +            r'(?s)<div[^>]+class="intro"[^>]*>.*?<h2>(.+?)</h2>', +            webpage, 'description', default=None)          thumbnail = self._search_regex( -            r'(?s)<figure[^>]+itemtype="https://schema.org/ImageObject"[^>]*>.*?<img[^>]+data-dejavu-src="([^"]+)"', +            r'(?s)<figure[^>]+itemtype="https://schema.org/ImageObject"[^>]*>.*?<img[^>]+(?:data-dejavu-)?src="([^"]+)"',              webpage, 'thumbnail', fatal=False)          uploader = self._html_search_regex( -            r'(?s)<div id="emission".*?<span class="author">(.*?)</span>', +            r'(?s)<span class="author">(.*?)</span>',              webpage, 'uploader', default=None) -        vcodec = 'none' if determine_ext(video_url.lower()) == 'mp3' else None +        ext = determine_ext(video_url.lower())          return {              'id': display_id,              'display_id': display_id,              'url': video_url,              'title': title, +            'description': description,              'thumbnail': thumbnail, -            'vcodec': vcodec, +            'ext': ext, +            'vcodec': 'none' if ext == 'mp3' else None,              'uploader': uploader, -            'upload_date': upload_date, +            'timestamp': int_or_none(video_data.get('data-asset-created-date')), +            'duration': int_or_none(video_data.get('data-duration')),          } diff --git a/youtube_dl/extractor/freshlive.py b/youtube_dl/extractor/freshlive.py index a90f9156c..72a845945 100644 --- a/youtube_dl/extractor/freshlive.py +++ b/youtube_dl/extractor/freshlive.py @@ -56,9 +56,8 @@ class FreshLiveIE(InfoExtractor):          is_live = info.get('liveStreamUrl') is not None          formats = self._extract_m3u8_formats( -            stream_url, video_id, ext='mp4', -            entry_protocol='m3u8' if is_live else 'm3u8_native', -            m3u8_id='hls') +            stream_url, video_id, 'mp4', +            'm3u8_native', m3u8_id='hls')          if is_live:              title = self._live_title(title) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 3fe0237b6..274f81738 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -84,6 +84,7 @@ from .twentymin import TwentyMinutenIE  from .ustream import UstreamIE  from .openload import OpenloadIE  from .videopress import VideoPressIE +from .rutube import RutubeIE  class GenericIE(InfoExtractor): @@ -448,6 +449,23 @@ class GenericIE(InfoExtractor):                  },              }],          }, +        { +            # Brightcove with UUID in videoPlayer +            'url': 'http://www8.hp.com/cn/zh/home.html', +            'info_dict': { +                'id': '5255815316001', +                'ext': 'mp4', +                'title': 'Sprocket Video - China', +                'description': 'Sprocket Video - China', +                'uploader': 'HP-Video Gallery', +                'timestamp': 1482263210, +                'upload_date': '20161220', +                'uploader_id': '1107601872001', +            }, +            'params': { +                'skip_download': True,  # m3u8 download +            }, +        },          # ooyala video          {              'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219', @@ -884,12 +902,13 @@ class GenericIE(InfoExtractor):          },          # LazyYT          { -            'url': 'http://discourse.ubuntu.com/t/unity-8-desktop-mode-windows-on-mir/1986', +            'url': 'https://skiplagged.com/',              'info_dict': { -                'id': '1986', -                'title': 'Unity 8 desktop-mode windows on Mir! - Ubuntu Discourse', +                'id': 'skiplagged', +                'title': 'Skiplagged: The smart way to find cheap flights',              }, -            'playlist_mincount': 2, +            'playlist_mincount': 1, +            'add_ie': ['Youtube'],          },          # Cinchcast embed          { @@ -1517,10 +1536,38 @@ class GenericIE(InfoExtractor):              'add_ie': [VideoPressIE.ie_key()],          },          { +            # Rutube embed +            'url': 'http://magazzino.friday.ru/videos/vipuski/kazan-2', +            'info_dict': { +                'id': '9b3d5bee0a8740bf70dfd29d3ea43541', +                'ext': 'flv', +                'title': 'Магаззино: Казань 2', +                'description': 'md5:99bccdfac2269f0e8fdbc4bbc9db184a', +                'uploader': 'Магаззино', +                'upload_date': '20170228', +                'uploader_id': '996642', +            }, +            'params': { +                'skip_download': True, +            }, +            'add_ie': [RutubeIE.ie_key()], +        }, +        {              # ThePlatform embedded with whitespaces in URLs              'url': 'http://www.golfchannel.com/topics/shows/golftalkcentral.htm',              'only_matching': True,          }, +        { +            # Senate ISVP iframe https +            'url': 'https://www.hsgac.senate.gov/hearings/canadas-fast-track-refugee-plan-unanswered-questions-and-implications-for-us-national-security', +            'md5': 'fb8c70b0b515e5037981a2492099aab8', +            'info_dict': { +                'id': 'govtaff020316', +                'ext': 'mp4', +                'title': 'Integrated Senate Video Player', +            }, +            'add_ie': [SenateISVPIE.ie_key()], +        },          # {          #     # TODO: find another test          #     # http://schema.org/VideoObject @@ -1820,14 +1867,6 @@ class GenericIE(InfoExtractor):          video_description = self._og_search_description(webpage, default=None)          video_thumbnail = self._og_search_thumbnail(webpage, default=None) -        # Helper method -        def _playlist_from_matches(matches, getter=None, ie=None): -            urlrs = orderedSet( -                self.url_result(self._proto_relative_url(getter(m) if getter else m), ie) -                for m in matches) -            return self.playlist_result( -                urlrs, playlist_id=video_id, playlist_title=video_title) -          # Look for Brightcove Legacy Studio embeds          bc_urls = BrightcoveLegacyIE._extract_brightcove_urls(webpage)          if bc_urls: @@ -1848,28 +1887,28 @@ class GenericIE(InfoExtractor):          # Look for Brightcove New Studio embeds          bc_urls = BrightcoveNewIE._extract_urls(webpage)          if bc_urls: -            return _playlist_from_matches(bc_urls, ie='BrightcoveNew') +            return self.playlist_from_matches(bc_urls, video_id, video_title, ie='BrightcoveNew')          # Look for ThePlatform embeds          tp_urls = ThePlatformIE._extract_urls(webpage)          if tp_urls: -            return _playlist_from_matches(tp_urls, ie='ThePlatform') +            return self.playlist_from_matches(tp_urls, video_id, video_title, ie='ThePlatform')          # Look for Vessel embeds          vessel_urls = VesselIE._extract_urls(webpage)          if vessel_urls: -            return _playlist_from_matches(vessel_urls, ie=VesselIE.ie_key()) +            return self.playlist_from_matches(vessel_urls, video_id, video_title, ie=VesselIE.ie_key())          # Look for embedded rtl.nl player          matches = re.findall(              r'<iframe[^>]+?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+(?:video_)?embed[^"]+)"',              webpage)          if matches: -            return _playlist_from_matches(matches, ie='RtlNl') +            return self.playlist_from_matches(matches, video_id, video_title, ie='RtlNl')          vimeo_urls = VimeoIE._extract_urls(url, webpage)          if vimeo_urls: -            return _playlist_from_matches(vimeo_urls, ie=VimeoIE.ie_key()) +            return self.playlist_from_matches(vimeo_urls, video_id, video_title, ie=VimeoIE.ie_key())          vid_me_embed_url = self._search_regex(              r'src=[\'"](https?://vid\.me/[^\'"]+)[\'"]', @@ -1891,25 +1930,25 @@ class GenericIE(InfoExtractor):                  (?:embed|v|p)/.+?)              \1''', webpage)          if matches: -            return _playlist_from_matches( -                matches, lambda m: unescapeHTML(m[1])) +            return self.playlist_from_matches( +                matches, video_id, video_title, lambda m: unescapeHTML(m[1]))          # Look for lazyYT YouTube embed          matches = re.findall(              r'class="lazyYT" data-youtube-id="([^"]+)"', webpage)          if matches: -            return _playlist_from_matches(matches, lambda m: unescapeHTML(m)) +            return self.playlist_from_matches(matches, video_id, video_title, lambda m: unescapeHTML(m))          # Look for Wordpress "YouTube Video Importer" plugin          matches = re.findall(r'''(?x)<div[^>]+              class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+              data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage)          if matches: -            return _playlist_from_matches(matches, lambda m: m[-1]) +            return self.playlist_from_matches(matches, video_id, video_title, lambda m: m[-1])          matches = DailymotionIE._extract_urls(webpage)          if matches: -            return _playlist_from_matches(matches) +            return self.playlist_from_matches(matches, video_id, video_title)          # Look for embedded Dailymotion playlist player (#3822)          m = re.search( @@ -1918,8 +1957,8 @@ class GenericIE(InfoExtractor):              playlists = re.findall(                  r'list\[\]=/playlist/([^/]+)/', unescapeHTML(m.group('url')))              if playlists: -                return _playlist_from_matches( -                    playlists, lambda p: '//dailymotion.com/playlist/%s' % p) +                return self.playlist_from_matches( +                    playlists, video_id, video_title, lambda p: '//dailymotion.com/playlist/%s' % p)          # Look for embedded Wistia player          match = re.search( @@ -2026,8 +2065,9 @@ class GenericIE(InfoExtractor):          if mobj is not None:              embeds = self._parse_json(mobj.group(1), video_id, fatal=False)              if embeds: -                return _playlist_from_matches( -                    embeds, getter=lambda v: OoyalaIE._url_for_embed_code(smuggle_url(v['provider_video_id'], {'domain': url})), ie='Ooyala') +                return self.playlist_from_matches( +                    embeds, video_id, video_title, +                    getter=lambda v: OoyalaIE._url_for_embed_code(smuggle_url(v['provider_video_id'], {'domain': url})), ie='Ooyala')          # Look for Aparat videos          mobj = re.search(r'<iframe .*?src="(http://www\.aparat\.com/video/[^"]+)"', webpage) @@ -2089,13 +2129,13 @@ class GenericIE(InfoExtractor):          # Look for funnyordie embed          matches = re.findall(r'<iframe[^>]+?src="(https?://(?:www\.)?funnyordie\.com/embed/[^"]+)"', webpage)          if matches: -            return _playlist_from_matches( -                matches, getter=unescapeHTML, ie='FunnyOrDie') +            return self.playlist_from_matches( +                matches, video_id, video_title, getter=unescapeHTML, ie='FunnyOrDie')          # Look for BBC iPlayer embed          matches = re.findall(r'setPlaylist\("(https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)', webpage)          if matches: -            return _playlist_from_matches(matches, ie='BBCCoUk') +            return self.playlist_from_matches(matches, video_id, video_title, ie='BBCCoUk')          # Look for embedded RUTV player          rutv_url = RUTVIE._extract_url(webpage) @@ -2110,32 +2150,32 @@ class GenericIE(InfoExtractor):          # Look for embedded SportBox player          sportbox_urls = SportBoxEmbedIE._extract_urls(webpage)          if sportbox_urls: -            return _playlist_from_matches(sportbox_urls, ie='SportBoxEmbed') +            return self.playlist_from_matches(sportbox_urls, video_id, video_title, ie='SportBoxEmbed')          # Look for embedded XHamster player          xhamster_urls = XHamsterEmbedIE._extract_urls(webpage)          if xhamster_urls: -            return _playlist_from_matches(xhamster_urls, ie='XHamsterEmbed') +            return self.playlist_from_matches(xhamster_urls, video_id, video_title, ie='XHamsterEmbed')          # Look for embedded TNAFlixNetwork player          tnaflix_urls = TNAFlixNetworkEmbedIE._extract_urls(webpage)          if tnaflix_urls: -            return _playlist_from_matches(tnaflix_urls, ie=TNAFlixNetworkEmbedIE.ie_key()) +            return self.playlist_from_matches(tnaflix_urls, video_id, video_title, ie=TNAFlixNetworkEmbedIE.ie_key())          # Look for embedded PornHub player          pornhub_urls = PornHubIE._extract_urls(webpage)          if pornhub_urls: -            return _playlist_from_matches(pornhub_urls, ie=PornHubIE.ie_key()) +            return self.playlist_from_matches(pornhub_urls, video_id, video_title, ie=PornHubIE.ie_key())          # Look for embedded DrTuber player          drtuber_urls = DrTuberIE._extract_urls(webpage)          if drtuber_urls: -            return _playlist_from_matches(drtuber_urls, ie=DrTuberIE.ie_key()) +            return self.playlist_from_matches(drtuber_urls, video_id, video_title, ie=DrTuberIE.ie_key())          # Look for embedded RedTube player          redtube_urls = RedTubeIE._extract_urls(webpage)          if redtube_urls: -            return _playlist_from_matches(redtube_urls, ie=RedTubeIE.ie_key()) +            return self.playlist_from_matches(redtube_urls, video_id, video_title, ie=RedTubeIE.ie_key())          # Look for embedded Tvigle player          mobj = re.search( @@ -2181,12 +2221,12 @@ class GenericIE(InfoExtractor):          # Look for embedded soundcloud player          soundcloud_urls = SoundcloudIE._extract_urls(webpage)          if soundcloud_urls: -            return _playlist_from_matches(soundcloud_urls, getter=unescapeHTML, ie=SoundcloudIE.ie_key()) +            return self.playlist_from_matches(soundcloud_urls, video_id, video_title, getter=unescapeHTML, ie=SoundcloudIE.ie_key())          # Look for tunein player          tunein_urls = TuneInBaseIE._extract_urls(webpage)          if tunein_urls: -            return _playlist_from_matches(tunein_urls) +            return self.playlist_from_matches(tunein_urls, video_id, video_title)          # Look for embedded mtvservices player          mtvservices_url = MTVServicesEmbeddedIE._extract_url(webpage) @@ -2469,30 +2509,36 @@ class GenericIE(InfoExtractor):          # Look for DBTV embeds          dbtv_urls = DBTVIE._extract_urls(webpage)          if dbtv_urls: -            return _playlist_from_matches(dbtv_urls, ie=DBTVIE.ie_key()) +            return self.playlist_from_matches(dbtv_urls, video_id, video_title, ie=DBTVIE.ie_key())          # Look for Videa embeds          videa_urls = VideaIE._extract_urls(webpage)          if videa_urls: -            return _playlist_from_matches(videa_urls, ie=VideaIE.ie_key()) +            return self.playlist_from_matches(videa_urls, video_id, video_title, ie=VideaIE.ie_key())          # Look for 20 minuten embeds          twentymin_urls = TwentyMinutenIE._extract_urls(webpage)          if twentymin_urls: -            return _playlist_from_matches( -                twentymin_urls, ie=TwentyMinutenIE.ie_key()) +            return self.playlist_from_matches( +                twentymin_urls, video_id, video_title, ie=TwentyMinutenIE.ie_key())          # Look for Openload embeds          openload_urls = OpenloadIE._extract_urls(webpage)          if openload_urls: -            return _playlist_from_matches( -                openload_urls, ie=OpenloadIE.ie_key()) +            return self.playlist_from_matches( +                openload_urls, video_id, video_title, ie=OpenloadIE.ie_key())          # Look for VideoPress embeds          videopress_urls = VideoPressIE._extract_urls(webpage)          if videopress_urls: -            return _playlist_from_matches( -                videopress_urls, ie=VideoPressIE.ie_key()) +            return self.playlist_from_matches( +                videopress_urls, video_id, video_title, ie=VideoPressIE.ie_key()) + +        # Look for Rutube embeds +        rutube_urls = RutubeIE._extract_urls(webpage) +        if rutube_urls: +            return self.playlist_from_matches( +                rutube_urls, ie=RutubeIE.ie_key())          # Looking for http://schema.org/VideoObject          json_ld = self._search_json_ld( @@ -2521,7 +2567,11 @@ class GenericIE(InfoExtractor):          jwplayer_data = self._find_jwplayer_data(              webpage, video_id, transform_source=js_to_json)          if jwplayer_data: -            return self._parse_jwplayer_data(jwplayer_data, video_id) +            info = self._parse_jwplayer_data( +                jwplayer_data, video_id, require_title=False) +            if not info.get('title'): +                info['title'] = video_title +            return info          def check_video(vurl):              if YoutubeIE.suitable(vurl): @@ -2596,11 +2646,14 @@ class GenericIE(InfoExtractor):                      found = re.search(REDIRECT_REGEX, refresh_header)              if found:                  new_url = compat_urlparse.urljoin(url, unescapeHTML(found.group(1))) -                self.report_following_redirect(new_url) -                return { -                    '_type': 'url', -                    'url': new_url, -                } +                if new_url != url: +                    self.report_following_redirect(new_url) +                    return { +                        '_type': 'url', +                        'url': new_url, +                    } +                else: +                    found = None          if not found:              # twitter:player is a https URL to iframe player that may or may not diff --git a/youtube_dl/extractor/go.py b/youtube_dl/extractor/go.py index 21ed846b2..4c9be47b4 100644 --- a/youtube_dl/extractor/go.py +++ b/youtube_dl/extractor/go.py @@ -36,7 +36,7 @@ class GoIE(AdobePassIE):              'requestor_id': 'DisneyXD',          }      } -    _VALID_URL = r'https?://(?:(?P<sub_domain>%s)\.)?go\.com/(?:[^/]+/)*(?:vdka(?P<id>\w+)|season-\d+/\d+-(?P<display_id>[^/?#]+))' % '|'.join(_SITE_INFO.keys()) +    _VALID_URL = r'https?://(?:(?P<sub_domain>%s)\.)?go\.com/(?:[^/]+/)*(?:vdka(?P<id>\w+)|(?:[^/]+/)*(?P<display_id>[^/?#]+))' % '|'.join(_SITE_INFO.keys())      _TESTS = [{          'url': 'http://abc.go.com/shows/castle/video/most-recent/vdka0_g86w5onx',          'info_dict': { @@ -52,6 +52,12 @@ class GoIE(AdobePassIE):      }, {          'url': 'http://abc.go.com/shows/after-paradise/video/most-recent/vdka3335601',          'only_matching': True, +    }, { +        'url': 'http://abc.go.com/shows/the-catch/episode-guide/season-01/10-the-wedding', +        'only_matching': True, +    }, { +        'url': 'http://abc.go.com/shows/world-news-tonight/episode-guide/2017-02/17-021717-intense-stand-off-between-man-with-rifle-and-police-in-oakland', +        'only_matching': True,      }]      def _real_extract(self, url): diff --git a/youtube_dl/extractor/hbo.py b/youtube_dl/extractor/hbo.py index 8116ad9bd..931f71a5a 100644 --- a/youtube_dl/extractor/hbo.py +++ b/youtube_dl/extractor/hbo.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals  import re  from .common import InfoExtractor +from ..compat import compat_str  from ..utils import (      xpath_text,      xpath_element, @@ -14,14 +15,26 @@ from ..utils import (  class HBOBaseIE(InfoExtractor):      _FORMATS_INFO = { +        'pro7': { +            'width': 1280, +            'height': 720, +        },          '1920': {              'width': 1280,              'height': 720,          }, +        'pro6': { +            'width': 768, +            'height': 432, +        },          '640': {              'width': 768,              'height': 432,          }, +        'pro5': { +            'width': 640, +            'height': 360, +        },          'highwifi': {              'width': 640,              'height': 360, @@ -78,6 +91,17 @@ class HBOBaseIE(InfoExtractor):                      formats.extend(self._extract_m3u8_formats(                          video_url.replace('.tar', '/base_index_w8.m3u8'),                          video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) +                elif source.tag == 'hls': +                    # #EXT-X-BYTERANGE is not supported by native hls downloader +                    # and ffmpeg (#10955) +                    # formats.extend(self._extract_m3u8_formats( +                    #     video_url.replace('.tar', '/base_index.m3u8'), +                    #     video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) +                    continue +                elif source.tag == 'dash': +                    formats.extend(self._extract_mpd_formats( +                        video_url.replace('.tar', '/manifest.mpd'), +                        video_id, mpd_id='dash', fatal=False))                  else:                      format_info = self._FORMATS_INFO.get(source.tag, {})                      formats.append({ @@ -112,10 +136,11 @@ class HBOBaseIE(InfoExtractor):  class HBOIE(HBOBaseIE): +    IE_NAME = 'hbo'      _VALID_URL = r'https?://(?:www\.)?hbo\.com/video/video\.html\?.*vid=(?P<id>[0-9]+)'      _TEST = {          'url': 'http://www.hbo.com/video/video.html?autoplay=true&g=u&vid=1437839', -        'md5': '1c33253f0c7782142c993c0ba62a8753', +        'md5': '2c6a6bc1222c7e91cb3334dad1746e5a',          'info_dict': {              'id': '1437839',              'ext': 'mp4', @@ -131,11 +156,12 @@ class HBOIE(HBOBaseIE):  class HBOEpisodeIE(HBOBaseIE): -    _VALID_URL = r'https?://(?:www\.)?hbo\.com/(?!video)([^/]+/)+video/(?P<id>[0-9a-z-]+)\.html' +    IE_NAME = 'hbo:episode' +    _VALID_URL = r'https?://(?:www\.)?hbo\.com/(?P<path>(?!video)(?:(?:[^/]+/)+video|watch-free-episodes)/(?P<id>[0-9a-z-]+))(?:\.html)?'      _TESTS = [{          'url': 'http://www.hbo.com/girls/episodes/5/52-i-love-you-baby/video/ep-52-inside-the-episode.html?autoplay=true', -        'md5': '689132b253cc0ab7434237fc3a293210', +        'md5': '61ead79b9c0dfa8d3d4b07ef4ac556fb',          'info_dict': {              'id': '1439518',              'display_id': 'ep-52-inside-the-episode', @@ -147,16 +173,19 @@ class HBOEpisodeIE(HBOBaseIE):      }, {          'url': 'http://www.hbo.com/game-of-thrones/about/video/season-5-invitation-to-the-set.html?autoplay=true',          'only_matching': True, +    }, { +        'url': 'http://www.hbo.com/watch-free-episodes/last-week-tonight-with-john-oliver', +        'only_matching': True,      }]      def _real_extract(self, url): -        display_id = self._match_id(url) +        path, display_id = re.match(self._VALID_URL, url).groups() -        webpage = self._download_webpage(url, display_id) +        content = self._download_json( +            'http://www.hbo.com/api/content/' + path, display_id)['content'] -        video_id = self._search_regex( -            r'(?P<q1>[\'"])videoId(?P=q1)\s*:\s*(?P<q2>[\'"])(?P<video_id>\d+)(?P=q2)', -            webpage, 'video ID', group='video_id') +        video_id = compat_str((content.get('parsed', {}).get( +            'common:FullBleedVideo', {}) or content['selectedEpisode'])['videoId'])          info_dict = self._extract_from_id(video_id)          info_dict['display_id'] = display_id diff --git a/youtube_dl/extractor/livestream.py b/youtube_dl/extractor/livestream.py index c863413bf..7f946c6ed 100644 --- a/youtube_dl/extractor/livestream.py +++ b/youtube_dl/extractor/livestream.py @@ -119,7 +119,8 @@ class LivestreamIE(InfoExtractor):          m3u8_url = video_data.get('m3u8_url')          if m3u8_url:              formats.extend(self._extract_m3u8_formats( -                m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) +                m3u8_url, video_id, 'mp4', 'm3u8_native', +                m3u8_id='hls', fatal=False))          f4m_url = video_data.get('f4m_url')          if f4m_url: @@ -158,11 +159,11 @@ class LivestreamIE(InfoExtractor):          if smil_url:              formats.extend(self._extract_smil_formats(smil_url, broadcast_id)) -        entry_protocol = 'm3u8' if is_live else 'm3u8_native'          m3u8_url = stream_info.get('m3u8_url')          if m3u8_url:              formats.extend(self._extract_m3u8_formats( -                m3u8_url, broadcast_id, 'mp4', entry_protocol, m3u8_id='hls', fatal=False)) +                m3u8_url, broadcast_id, 'mp4', 'm3u8_native', +                m3u8_id='hls', fatal=False))          rtsp_url = stream_info.get('rtsp_url')          if rtsp_url: @@ -276,7 +277,7 @@ class LivestreamOriginalIE(InfoExtractor):              'view_count': view_count,          } -    def _extract_video_formats(self, video_data, video_id, entry_protocol): +    def _extract_video_formats(self, video_data, video_id):          formats = []          progressive_url = video_data.get('progressiveUrl') @@ -289,7 +290,8 @@ class LivestreamOriginalIE(InfoExtractor):          m3u8_url = video_data.get('httpUrl')          if m3u8_url:              formats.extend(self._extract_m3u8_formats( -                m3u8_url, video_id, 'mp4', entry_protocol, m3u8_id='hls', fatal=False)) +                m3u8_url, video_id, 'mp4', 'm3u8_native', +                m3u8_id='hls', fatal=False))          rtsp_url = video_data.get('rtspUrl')          if rtsp_url: @@ -340,11 +342,10 @@ class LivestreamOriginalIE(InfoExtractor):                  }              video_data = self._download_json(stream_url, content_id)              is_live = video_data.get('isLive') -            entry_protocol = 'm3u8' if is_live else 'm3u8_native'              info.update({                  'id': content_id,                  'title': self._live_title(info['title']) if is_live else info['title'], -                'formats': self._extract_video_formats(video_data, content_id, entry_protocol), +                'formats': self._extract_video_formats(video_data, content_id),                  'is_live': is_live,              })              return info diff --git a/youtube_dl/extractor/medialaan.py b/youtube_dl/extractor/medialaan.py new file mode 100644 index 000000000..6e067474b --- /dev/null +++ b/youtube_dl/extractor/medialaan.py @@ -0,0 +1,259 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( +    ExtractorError, +    int_or_none, +    parse_duration, +    try_get, +    unified_timestamp, +    urlencode_postdata, +) + + +class MedialaanIE(InfoExtractor): +    _VALID_URL = r'''(?x) +                    https?:// +                        (?:www\.)? +                        (?: +                            (?P<site_id>vtm|q2|vtmkzoom)\.be/ +                            (?: +                                video(?:/[^/]+/id/|/?\?.*?\baid=)| +                                (?:[^/]+/)* +                            ) +                        ) +                        (?P<id>[^/?#&]+) +                    ''' +    _NETRC_MACHINE = 'medialaan' +    _APIKEY = '3_HZ0FtkMW_gOyKlqQzW5_0FHRC7Nd5XpXJZcDdXY4pk5eES2ZWmejRW5egwVm4ug-' +    _SITE_TO_APP_ID = { +        'vtm': 'vtm_watch', +        'q2': 'q2', +        'vtmkzoom': 'vtmkzoom', +    } +    _TESTS = [{ +        # vod +        'url': 'http://vtm.be/video/volledige-afleveringen/id/vtm_20170219_VM0678361_vtmwatch', +        'info_dict': { +            'id': 'vtm_20170219_VM0678361_vtmwatch', +            'ext': 'mp4', +            'title': 'Allemaal Chris afl. 6', +            'description': 'md5:4be86427521e7b07e0adb0c9c554ddb2', +            'timestamp': 1487533280, +            'upload_date': '20170219', +            'duration': 2562, +            'series': 'Allemaal Chris', +            'season': 'Allemaal Chris', +            'season_number': 1, +            'season_id': '256936078124527', +            'episode': 'Allemaal Chris afl. 6', +            'episode_number': 6, +            'episode_id': '256936078591527', +        }, +        'params': { +            'skip_download': True, +        }, +        'skip': 'Requires account credentials', +    }, { +        # clip +        'url': 'http://vtm.be/video?aid=168332', +        'info_dict': { +            'id': '168332', +            'ext': 'mp4', +            'title': '"Veronique liegt!"', +            'description': 'md5:1385e2b743923afe54ba4adc38476155', +            'timestamp': 1489002029, +            'upload_date': '20170308', +            'duration': 96, +        }, +    }, { +        # vod +        'url': 'http://vtm.be/video/volledige-afleveringen/id/257107153551000', +        'only_matching': True, +    }, { +        # vod +        'url': 'http://vtm.be/video?aid=163157', +        'only_matching': True, +    }, { +        # vod +        'url': 'http://www.q2.be/video/volledige-afleveringen/id/2be_20170301_VM0684442_q2', +        'only_matching': True, +    }, { +        # clip +        'url': 'http://vtmkzoom.be/k3-dansstudio/een-nieuw-seizoen-van-k3-dansstudio', +        'only_matching': True, +    }] + +    def _real_initialize(self): +        self._logged_in = False + +    def _login(self): +        username, password = self._get_login_info() +        if username is None: +            self.raise_login_required() + +        auth_data = { +            'APIKey': self._APIKEY, +            'sdk': 'js_6.1', +            'format': 'json', +            'loginID': username, +            'password': password, +        } + +        auth_info = self._download_json( +            'https://accounts.eu1.gigya.com/accounts.login', None, +            note='Logging in', errnote='Unable to log in', +            data=urlencode_postdata(auth_data)) + +        error_message = auth_info.get('errorDetails') or auth_info.get('errorMessage') +        if error_message: +            raise ExtractorError( +                'Unable to login: %s' % error_message, expected=True) + +        self._uid = auth_info['UID'] +        self._uid_signature = auth_info['UIDSignature'] +        self._signature_timestamp = auth_info['signatureTimestamp'] + +        self._logged_in = True + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id, site_id = mobj.group('id', 'site_id') + +        webpage = self._download_webpage(url, video_id) + +        config = self._parse_json( +            self._search_regex( +                r'videoJSConfig\s*=\s*JSON\.parse\(\'({.+?})\'\);', +                webpage, 'config', default='{}'), video_id, +            transform_source=lambda s: s.replace( +                '\\\\', '\\').replace(r'\"', '"').replace(r"\'", "'")) + +        vod_id = config.get('vodId') or self._search_regex( +            (r'\\"vodId\\"\s*:\s*\\"(.+?)\\"', +             r'<[^>]+id=["\']vod-(\d+)'), +            webpage, 'video_id', default=None) + +        # clip, no authentication required +        if not vod_id: +            player = self._parse_json( +                self._search_regex( +                    r'vmmaplayer\(({.+?})\);', webpage, 'vmma player', +                    default=''), +                video_id, transform_source=lambda s: '[%s]' % s, fatal=False) +            if player: +                video = player[-1] +                info = { +                    'id': video_id, +                    'url': video['videoUrl'], +                    'title': video['title'], +                    'thumbnail': video.get('imageUrl'), +                    'timestamp': int_or_none(video.get('createdDate')), +                    'duration': int_or_none(video.get('duration')), +                } +            else: +                info = self._parse_html5_media_entries( +                    url, webpage, video_id, m3u8_id='hls')[0] +                info.update({ +                    'id': video_id, +                    'title': self._html_search_meta('description', webpage), +                    'duration': parse_duration(self._html_search_meta('duration', webpage)), +                }) +        # vod, authentication required +        else: +            if not self._logged_in: +                self._login() + +            settings = self._parse_json( +                self._search_regex( +                    r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);', +                    webpage, 'drupal settings', default='{}'), +                video_id) + +            def get(container, item): +                return try_get( +                    settings, lambda x: x[container][item], +                    compat_str) or self._search_regex( +                    r'"%s"\s*:\s*"([^"]+)' % item, webpage, item, +                    default=None) + +            app_id = get('vod', 'app_id') or self._SITE_TO_APP_ID.get(site_id, 'vtm_watch') +            sso = get('vod', 'gigyaDatabase') or 'vtm-sso' + +            data = self._download_json( +                'http://vod.medialaan.io/api/1.0/item/%s/video' % vod_id, +                video_id, query={ +                    'app_id': app_id, +                    'user_network': sso, +                    'UID': self._uid, +                    'UIDSignature': self._uid_signature, +                    'signatureTimestamp': self._signature_timestamp, +                }) + +            formats = self._extract_m3u8_formats( +                data['response']['uri'], video_id, entry_protocol='m3u8_native', +                ext='mp4', m3u8_id='hls') + +            self._sort_formats(formats) + +            info = { +                'id': vod_id, +                'formats': formats, +            } + +            api_key = get('vod', 'apiKey') +            channel = get('medialaanGigya', 'channel') + +            if api_key: +                videos = self._download_json( +                    'http://vod.medialaan.io/vod/v2/videos', video_id, fatal=False, +                    query={ +                        'channels': channel, +                        'ids': vod_id, +                        'limit': 1, +                        'apikey': api_key, +                    }) +                if videos: +                    video = try_get( +                        videos, lambda x: x['response']['videos'][0], dict) +                    if video: +                        def get(container, item, expected_type=None): +                            return try_get( +                                video, lambda x: x[container][item], expected_type) + +                        def get_string(container, item): +                            return get(container, item, compat_str) + +                        info.update({ +                            'series': get_string('program', 'title'), +                            'season': get_string('season', 'title'), +                            'season_number': int_or_none(get('season', 'number')), +                            'season_id': get_string('season', 'id'), +                            'episode': get_string('episode', 'title'), +                            'episode_number': int_or_none(get('episode', 'number')), +                            'episode_id': get_string('episode', 'id'), +                            'duration': int_or_none( +                                video.get('duration')) or int_or_none( +                                video.get('durationMillis'), scale=1000), +                            'title': get_string('episode', 'title'), +                            'description': get_string('episode', 'text'), +                            'timestamp': unified_timestamp(get_string( +                                'publication', 'begin')), +                        }) + +            if not info.get('title'): +                info['title'] = try_get( +                    config, lambda x: x['videoConfig']['title'], +                    compat_str) or self._html_search_regex( +                    r'\\"title\\"\s*:\s*\\"(.+?)\\"', webpage, 'title', +                    default=None) or self._og_search_title(webpage) + +        if not info.get('description'): +            info['description'] = self._html_search_regex( +                r'<div[^>]+class="field-item\s+even">\s*<p>(.+?)</p>', +                webpage, 'description', default=None) + +        return info diff --git a/youtube_dl/extractor/miomio.py b/youtube_dl/extractor/miomio.py index ec1b4c4fe..40f72d66f 100644 --- a/youtube_dl/extractor/miomio.py +++ b/youtube_dl/extractor/miomio.py @@ -51,6 +51,7 @@ class MioMioIE(InfoExtractor):              'ext': 'mp4',              'title': 'マツコの知らない世界【劇的進化SP!ビニール傘&冷凍食品2016】 1_2 - 16 05 31',          }, +        'skip': 'Unable to load videos',      }]      def _extract_mioplayer(self, webpage, video_id, title, http_headers): @@ -94,9 +95,18 @@ class MioMioIE(InfoExtractor):          return entries +    def _download_chinese_webpage(self, *args, **kwargs): +        # Requests with English locales return garbage +        headers = { +            'Accept-Language': 'zh-TW,en-US;q=0.7,en;q=0.3', +        } +        kwargs.setdefault('headers', {}).update(headers) +        return self._download_webpage(*args, **kwargs) +      def _real_extract(self, url):          video_id = self._match_id(url) -        webpage = self._download_webpage(url, video_id) +        webpage = self._download_chinese_webpage( +            url, video_id)          title = self._html_search_meta(              'description', webpage, 'title', fatal=True) @@ -106,7 +116,7 @@ class MioMioIE(InfoExtractor):          if '_h5' in mioplayer_path:              player_url = compat_urlparse.urljoin(url, mioplayer_path) -            player_webpage = self._download_webpage( +            player_webpage = self._download_chinese_webpage(                  player_url, video_id,                  note='Downloading player webpage', headers={'Referer': url})              entries = self._parse_html5_media_entries(player_url, player_webpage, video_id) diff --git a/youtube_dl/extractor/mitele.py b/youtube_dl/extractor/mitele.py index 79e0b8ada..28b743cca 100644 --- a/youtube_dl/extractor/mitele.py +++ b/youtube_dl/extractor/mitele.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals  import uuid  from .common import InfoExtractor +from .ooyala import OoyalaIE  from ..compat import (      compat_str,      compat_urllib_parse_urlencode, @@ -24,6 +25,9 @@ class MiTeleBaseIE(InfoExtractor):              r'(?s)(<ms-video-player.+?</ms-video-player>)',              webpage, 'ms video player'))          video_id = player_data['data-media-id'] +        if player_data.get('data-cms-id') == 'ooyala': +            return self.url_result( +                'ooyala:%s' % video_id, ie=OoyalaIE.ie_key(), video_id=video_id)          config_url = compat_urlparse.urljoin(url, player_data['data-config'])          config = self._download_json(              config_url, video_id, 'Downloading config JSON') diff --git a/youtube_dl/extractor/ninecninemedia.py b/youtube_dl/extractor/ninecninemedia.py index d9943fc2c..8961309fd 100644 --- a/youtube_dl/extractor/ninecninemedia.py +++ b/youtube_dl/extractor/ninecninemedia.py @@ -34,12 +34,6 @@ class NineCNineMediaStackIE(NineCNineMediaBaseIE):          formats.extend(self._extract_f4m_formats(              stack_base_url + 'f4m', stack_id,              f4m_id='hds', fatal=False)) -        mp4_url = self._download_webpage(stack_base_url + 'pd', stack_id, fatal=False) -        if mp4_url: -            formats.append({ -                'url': mp4_url, -                'format_id': 'mp4', -            })          self._sort_formats(formats)          return { diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index 50473d777..38fefe492 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -3,41 +3,27 @@ from __future__ import unicode_literals  import re  from .common import InfoExtractor -from ..compat import compat_HTTPError +from ..compat import ( +    compat_HTTPError, +    compat_str, +)  from ..utils import ( +    determine_ext, +    ExtractorError,      fix_xml_ampersands,      orderedSet,      parse_duration,      qualities,      strip_jsonp,      unified_strdate, -    ExtractorError,  )  class NPOBaseIE(InfoExtractor):      def _get_token(self, video_id): -        token_page = self._download_webpage( -            'http://ida.omroep.nl/npoplayer/i.js', -            video_id, note='Downloading token') -        token = self._search_regex( -            r'npoplayer\.token = "(.+?)"', token_page, 'token') -        # Decryption algorithm extracted from http://npoplayer.omroep.nl/csjs/npoplayer-min.js -        token_l = list(token) -        first = second = None -        for i in range(5, len(token_l) - 4): -            if token_l[i].isdigit(): -                if first is None: -                    first = i -                elif second is None: -                    second = i -        if first is None or second is None: -            first = 12 -            second = 13 - -        token_l[first], token_l[second] = token_l[second], token_l[first] - -        return ''.join(token_l) +        return self._download_json( +            'http://ida.omroep.nl/app.php/auth', video_id, +            note='Downloading token')['token']  class NPOIE(NPOBaseIE): @@ -58,103 +44,113 @@ class NPOIE(NPOBaseIE):                          (?P<id>[^/?#]+)                  ''' -    _TESTS = [ -        { -            'url': 'http://www.npo.nl/nieuwsuur/22-06-2014/VPWON_1220719', -            'md5': '4b3f9c429157ec4775f2c9cb7b911016', -            'info_dict': { -                'id': 'VPWON_1220719', -                'ext': 'm4v', -                'title': 'Nieuwsuur', -                'description': 'Dagelijks tussen tien en elf: nieuws, sport en achtergronden.', -                'upload_date': '20140622', -            }, +    _TESTS = [{ +        'url': 'http://www.npo.nl/nieuwsuur/22-06-2014/VPWON_1220719', +        'md5': '4b3f9c429157ec4775f2c9cb7b911016', +        'info_dict': { +            'id': 'VPWON_1220719', +            'ext': 'm4v', +            'title': 'Nieuwsuur', +            'description': 'Dagelijks tussen tien en elf: nieuws, sport en achtergronden.', +            'upload_date': '20140622',          }, -        { -            'url': 'http://www.npo.nl/de-mega-mike-mega-thomas-show/27-02-2009/VARA_101191800', -            'md5': 'da50a5787dbfc1603c4ad80f31c5120b', -            'info_dict': { -                'id': 'VARA_101191800', -                'ext': 'm4v', -                'title': 'De Mega Mike & Mega Thomas show: The best of.', -                'description': 'md5:3b74c97fc9d6901d5a665aac0e5400f4', -                'upload_date': '20090227', -                'duration': 2400, -            }, +    }, { +        'url': 'http://www.npo.nl/de-mega-mike-mega-thomas-show/27-02-2009/VARA_101191800', +        'md5': 'da50a5787dbfc1603c4ad80f31c5120b', +        'info_dict': { +            'id': 'VARA_101191800', +            'ext': 'm4v', +            'title': 'De Mega Mike & Mega Thomas show: The best of.', +            'description': 'md5:3b74c97fc9d6901d5a665aac0e5400f4', +            'upload_date': '20090227', +            'duration': 2400,          }, -        { -            'url': 'http://www.npo.nl/tegenlicht/25-02-2013/VPWON_1169289', -            'md5': 'f8065e4e5a7824068ed3c7e783178f2c', -            'info_dict': { -                'id': 'VPWON_1169289', -                'ext': 'm4v', -                'title': 'Tegenlicht: De toekomst komt uit Afrika', -                'description': 'md5:52cf4eefbc96fffcbdc06d024147abea', -                'upload_date': '20130225', -                'duration': 3000, -            }, +    }, { +        'url': 'http://www.npo.nl/tegenlicht/25-02-2013/VPWON_1169289', +        'md5': 'f8065e4e5a7824068ed3c7e783178f2c', +        'info_dict': { +            'id': 'VPWON_1169289', +            'ext': 'm4v', +            'title': 'Tegenlicht: Zwart geld. De toekomst komt uit Afrika', +            'description': 'md5:52cf4eefbc96fffcbdc06d024147abea', +            'upload_date': '20130225', +            'duration': 3000,          }, -        { -            'url': 'http://www.npo.nl/de-nieuwe-mens-deel-1/21-07-2010/WO_VPRO_043706', -            'info_dict': { -                'id': 'WO_VPRO_043706', -                'ext': 'wmv', -                'title': 'De nieuwe mens - Deel 1', -                'description': 'md5:518ae51ba1293ffb80d8d8ce90b74e4b', -                'duration': 4680, -            }, -            'params': { -                # mplayer mms download -                'skip_download': True, -            } +    }, { +        'url': 'http://www.npo.nl/de-nieuwe-mens-deel-1/21-07-2010/WO_VPRO_043706', +        'info_dict': { +            'id': 'WO_VPRO_043706', +            'ext': 'm4v', +            'title': 'De nieuwe mens - Deel 1', +            'description': 'md5:518ae51ba1293ffb80d8d8ce90b74e4b', +            'duration': 4680,          }, +        'params': { +            'skip_download': True, +        } +    }, {          # non asf in streams -        { -            'url': 'http://www.npo.nl/hoe-gaat-europa-verder-na-parijs/10-01-2015/WO_NOS_762771', -            'md5': 'b3da13de374cbe2d5332a7e910bef97f', -            'info_dict': { -                'id': 'WO_NOS_762771', -                'ext': 'mp4', -                'title': 'Hoe gaat Europa verder na Parijs?', -            }, -        }, -        { -            'url': 'http://www.ntr.nl/Aap-Poot-Pies/27/detail/Aap-poot-pies/VPWON_1233944#content', -            'md5': '01c6a2841675995da1f0cf776f03a9c3', -            'info_dict': { -                'id': 'VPWON_1233944', -                'ext': 'm4v', -                'title': 'Aap, poot, pies', -                'description': 'md5:c9c8005d1869ae65b858e82c01a91fde', -                'upload_date': '20150508', -                'duration': 599, -            }, +        'url': 'http://www.npo.nl/hoe-gaat-europa-verder-na-parijs/10-01-2015/WO_NOS_762771', +        'info_dict': { +            'id': 'WO_NOS_762771', +            'ext': 'mp4', +            'title': 'Hoe gaat Europa verder na Parijs?',          }, -        { -            'url': 'http://www.omroepwnl.nl/video/fragment/vandaag-de-dag-verkiezingen__POMS_WNL_853698', -            'md5': 'd30cd8417b8b9bca1fdff27428860d08', -            'info_dict': { -                'id': 'POW_00996502', -                'ext': 'm4v', -                'title': '''"Dit is wel een 'landslide'..."''', -                'description': 'md5:f8d66d537dfb641380226e31ca57b8e8', -                'upload_date': '20150508', -                'duration': 462, -            }, +        'params': { +            'skip_download': True, +        } +    }, { +        'url': 'http://www.ntr.nl/Aap-Poot-Pies/27/detail/Aap-poot-pies/VPWON_1233944#content', +        'info_dict': { +            'id': 'VPWON_1233944', +            'ext': 'm4v', +            'title': 'Aap, poot, pies', +            'description': 'md5:c9c8005d1869ae65b858e82c01a91fde', +            'upload_date': '20150508', +            'duration': 599,          }, -        { -            'url': 'http://www.zapp.nl/de-bzt-show/gemist/KN_1687547', -            'only_matching': True, +        'params': { +            'skip_download': True, +        } +    }, { +        'url': 'http://www.omroepwnl.nl/video/fragment/vandaag-de-dag-verkiezingen__POMS_WNL_853698', +        'info_dict': { +            'id': 'POW_00996502', +            'ext': 'm4v', +            'title': '''"Dit is wel een 'landslide'..."''', +            'description': 'md5:f8d66d537dfb641380226e31ca57b8e8', +            'upload_date': '20150508', +            'duration': 462,          }, -        { -            'url': 'http://www.zapp.nl/de-bzt-show/filmpjes/POMS_KN_7315118', -            'only_matching': True, +        'params': { +            'skip_download': True, +        } +    }, { +        # audio +        'url': 'http://www.npo.nl/jouw-stad-rotterdam/29-01-2017/RBX_FUNX_6683215/RBX_FUNX_7601437', +        'info_dict': { +            'id': 'RBX_FUNX_6683215', +            'ext': 'mp3', +            'title': 'Jouw Stad Rotterdam', +            'description': 'md5:db251505244f097717ec59fabc372d9f',          }, -        { -            'url': 'http://www.zapp.nl/beste-vrienden-quiz/extra-video-s/WO_NTR_1067990', -            'only_matching': True, +        'params': { +            'skip_download': True,          } -    ] +    }, { +        'url': 'http://www.zapp.nl/de-bzt-show/gemist/KN_1687547', +        'only_matching': True, +    }, { +        'url': 'http://www.zapp.nl/de-bzt-show/filmpjes/POMS_KN_7315118', +        'only_matching': True, +    }, { +        'url': 'http://www.zapp.nl/beste-vrienden-quiz/extra-video-s/WO_NTR_1067990', +        'only_matching': True, +    }, { +        # live stream +        'url': 'npo:LI_NL1_4188102', +        'only_matching': True, +    }]      def _real_extract(self, url):          video_id = self._match_id(url) @@ -183,70 +179,115 @@ class NPOIE(NPOBaseIE):          token = self._get_token(video_id)          formats = [] +        urls = set() + +        quality = qualities(['adaptive', 'wmv_sb', 'h264_sb', 'wmv_bb', 'h264_bb', 'wvc1_std', 'h264_std']) +        items = self._download_json( +            'http://ida.omroep.nl/app.php/%s' % video_id, video_id, +            'Downloading formats JSON', query={ +                'adaptive': 'yes', +                'token': token, +            })['items'][0] +        for num, item in enumerate(items): +            item_url = item.get('url') +            if not item_url or item_url in urls: +                continue +            urls.add(item_url) +            format_id = self._search_regex( +                r'video/ida/([^/]+)', item_url, 'format id', +                default=None) + +            def add_format_url(format_url): +                formats.append({ +                    'url': format_url, +                    'format_id': format_id, +                    'quality': quality(format_id), +                }) + +            # Example: http://www.npo.nl/de-nieuwe-mens-deel-1/21-07-2010/WO_VPRO_043706 +            if item.get('contentType') in ('url', 'audio'): +                add_format_url(item_url) +                continue -        pubopties = metadata.get('pubopties') -        if pubopties: -            quality = qualities(['adaptive', 'wmv_sb', 'h264_sb', 'wmv_bb', 'h264_bb', 'wvc1_std', 'h264_std']) -            for format_id in pubopties: -                format_info = self._download_json( -                    'http://ida.omroep.nl/odi/?prid=%s&puboptions=%s&adaptive=yes&token=%s' -                    % (video_id, format_id, token), -                    video_id, 'Downloading %s JSON' % format_id) -                if format_info.get('error_code', 0) or format_info.get('errorcode', 0): +            try: +                stream_info = self._download_json( +                    item_url + '&type=json', video_id, +                    'Downloading %s stream JSON' +                    % item.get('label') or item.get('format') or format_id or num) +            except ExtractorError as ee: +                if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404: +                    error = (self._parse_json( +                        ee.cause.read().decode(), video_id, +                        fatal=False) or {}).get('errorstring') +                    if error: +                        raise ExtractorError(error, expected=True) +                raise +            # Stream URL instead of JSON, example: npo:LI_NL1_4188102 +            if isinstance(stream_info, compat_str): +                if not stream_info.startswith('http'):                      continue -                streams = format_info.get('streams') -                if streams: -                    try: -                        video_info = self._download_json( -                            streams[0] + '&type=json', -                            video_id, 'Downloading %s stream JSON' % format_id) -                    except ExtractorError as ee: -                        if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404: -                            error = (self._parse_json(ee.cause.read().decode(), video_id, fatal=False) or {}).get('errorstring') -                            if error: -                                raise ExtractorError(error, expected=True) -                        raise -                else: -                    video_info = format_info -                video_url = video_info.get('url') -                if not video_url: +                video_url = stream_info +            # JSON +            else: +                video_url = stream_info.get('url') +            if not video_url or video_url in urls: +                continue +            urls.add(item_url) +            if determine_ext(video_url) == 'm3u8': +                formats.extend(self._extract_m3u8_formats( +                    video_url, video_id, ext='mp4', +                    entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) +            else: +                add_format_url(video_url) + +        is_live = metadata.get('medium') == 'live' + +        if not is_live: +            for num, stream in enumerate(metadata.get('streams', [])): +                stream_url = stream.get('url') +                if not stream_url or stream_url in urls:                      continue -                if format_id == 'adaptive': -                    formats.extend(self._extract_m3u8_formats(video_url, video_id, 'mp4')) -                else: +                urls.add(stream_url) +                # smooth streaming is not supported +                stream_type = stream.get('type', '').lower() +                if stream_type in ['ss', 'ms']: +                    continue +                if stream_type == 'hds': +                    f4m_formats = self._extract_f4m_formats( +                        stream_url, video_id, fatal=False) +                    # f4m downloader downloads only piece of live stream +                    for f4m_format in f4m_formats: +                        f4m_format['preference'] = -1 +                    formats.extend(f4m_formats) +                elif stream_type == 'hls': +                    formats.extend(self._extract_m3u8_formats( +                        stream_url, video_id, ext='mp4', fatal=False)) +                # Example: http://www.npo.nl/de-nieuwe-mens-deel-1/21-07-2010/WO_VPRO_043706 +                elif '.asf' in stream_url: +                    asx = self._download_xml( +                        stream_url, video_id, +                        'Downloading stream %d ASX playlist' % num, +                        transform_source=fix_xml_ampersands, fatal=False) +                    if not asx: +                        continue +                    ref = asx.find('./ENTRY/Ref') +                    if ref is None: +                        continue +                    video_url = ref.get('href') +                    if not video_url or video_url in urls: +                        continue +                    urls.add(video_url)                      formats.append({                          'url': video_url, -                        'format_id': format_id, -                        'quality': quality(format_id), +                        'ext': stream.get('formaat', 'asf'), +                        'quality': stream.get('kwaliteit'), +                        'preference': -10,                      }) - -        streams = metadata.get('streams') -        if streams: -            for i, stream in enumerate(streams): -                stream_url = stream.get('url') -                if not stream_url: -                    continue -                if '.asf' not in stream_url: +                else:                      formats.append({                          'url': stream_url,                          'quality': stream.get('kwaliteit'),                      }) -                    continue -                asx = self._download_xml( -                    stream_url, video_id, -                    'Downloading stream %d ASX playlist' % i, -                    transform_source=fix_xml_ampersands) -                ref = asx.find('./ENTRY/Ref') -                if ref is None: -                    continue -                video_url = ref.get('href') -                if not video_url: -                    continue -                formats.append({ -                    'url': video_url, -                    'ext': stream.get('formaat', 'asf'), -                    'quality': stream.get('kwaliteit'), -                })          self._sort_formats(formats) @@ -259,28 +300,28 @@ class NPOIE(NPOBaseIE):          return {              'id': video_id, -            'title': title, +            'title': self._live_title(title) if is_live else title,              'description': metadata.get('info'),              'thumbnail': metadata.get('images', [{'url': None}])[-1]['url'],              'upload_date': unified_strdate(metadata.get('gidsdatum')),              'duration': parse_duration(metadata.get('tijdsduur')),              'formats': formats,              'subtitles': subtitles, +            'is_live': is_live,          }  class NPOLiveIE(NPOBaseIE):      IE_NAME = 'npo.nl:live' -    _VALID_URL = r'https?://(?:www\.)?npo\.nl/live/(?P<id>.+)' +    _VALID_URL = r'https?://(?:www\.)?npo\.nl/live/(?P<id>[^/?#&]+)'      _TEST = {          'url': 'http://www.npo.nl/live/npo-1',          'info_dict': { -            'id': 'LI_NEDERLAND1_136692', +            'id': 'LI_NL1_4188102',              'display_id': 'npo-1',              'ext': 'mp4', -            'title': 're:^Nederland 1 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', -            'description': 'Livestream', +            'title': 're:^NPO 1 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',              'is_live': True,          },          'params': { @@ -296,58 +337,12 @@ class NPOLiveIE(NPOBaseIE):          live_id = self._search_regex(              r'data-prid="([^"]+)"', webpage, 'live id') -        metadata = self._download_json( -            'http://e.omroep.nl/metadata/%s' % live_id, -            display_id, transform_source=strip_jsonp) - -        token = self._get_token(display_id) - -        formats = [] - -        streams = metadata.get('streams') -        if streams: -            for stream in streams: -                stream_type = stream.get('type').lower() -                # smooth streaming is not supported -                if stream_type in ['ss', 'ms']: -                    continue -                stream_info = self._download_json( -                    'http://ida.omroep.nl/aapi/?stream=%s&token=%s&type=jsonp' -                    % (stream.get('url'), token), -                    display_id, 'Downloading %s JSON' % stream_type) -                if stream_info.get('error_code', 0) or stream_info.get('errorcode', 0): -                    continue -                stream_url = self._download_json( -                    stream_info['stream'], display_id, -                    'Downloading %s URL' % stream_type, -                    'Unable to download %s URL' % stream_type, -                    transform_source=strip_jsonp, fatal=False) -                if not stream_url: -                    continue -                if stream_type == 'hds': -                    f4m_formats = self._extract_f4m_formats(stream_url, display_id) -                    # f4m downloader downloads only piece of live stream -                    for f4m_format in f4m_formats: -                        f4m_format['preference'] = -1 -                    formats.extend(f4m_formats) -                elif stream_type == 'hls': -                    formats.extend(self._extract_m3u8_formats(stream_url, display_id, 'mp4')) -                else: -                    formats.append({ -                        'url': stream_url, -                        'preference': -10, -                    }) - -        self._sort_formats(formats) -          return { +            '_type': 'url_transparent', +            'url': 'npo:%s' % live_id, +            'ie_key': NPOIE.ie_key(),              'id': live_id,              'display_id': display_id, -            'title': self._live_title(metadata['titel']), -            'description': metadata['info'], -            'thumbnail': metadata.get('images', [{'url': None}])[-1]['url'], -            'formats': formats, -            'is_live': True,          } diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index fc7ff43a6..58ffde541 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -75,22 +75,51 @@ class OpenloadIE(InfoExtractor):              '<span[^>]+id="[^"]+"[^>]*>([0-9A-Za-z]+)</span>',              webpage, 'openload ID') -        first_char = int(ol_id[0]) -        urlcode = [] -        num = 1 - -        while num < len(ol_id): -            i = ord(ol_id[num]) -            key = 0 -            if i <= 90: -                key = i - 65 -            elif i >= 97: -                key = 25 + i - 97 -            urlcode.append((key, compat_chr(int(ol_id[num + 2:num + 5]) // int(ol_id[num + 1]) - first_char))) -            num += 5 - -        video_url = 'https://openload.co/stream/' + ''.join( -            [value for _, value in sorted(urlcode, key=lambda x: x[0])]) +        video_url_chars = [] + +        first_char = ord(ol_id[0]) +        key = first_char - 55 +        maxKey = max(2, key) +        key = min(maxKey, len(ol_id) - 38) +        t = ol_id[key:key + 36] + +        hashMap = {} +        v = ol_id.replace(t, '') +        h = 0 + +        while h < len(t): +            f = t[h:h + 3] +            i = int(f, 8) +            hashMap[h / 3] = i +            h += 3 + +        h = 0 +        H = 0 +        while h < len(v): +            B = '' +            C = '' +            if len(v) >= h + 2: +                B = v[h:h + 2] +            if len(v) >= h + 3: +                C = v[h:h + 3] +            i = int(B, 16) +            h += 2 +            if H % 3 == 0: +                i = int(C, 8) +                h += 1 +            elif H % 2 == 0 and H != 0 and ord(v[H - 1]) < 60: +                i = int(C, 10) +                h += 1 +            index = H % 7 + +            A = hashMap[index] +            i ^= 213 +            i ^= A +            video_url_chars.append(compat_chr(i)) +            H += 1 + +        video_url = 'https://openload.co/stream/%s?mime=true' +        video_url = video_url % (''.join(video_url_chars))          title = self._og_search_title(webpage, default=None) or self._search_regex(              r'<span[^>]+class=["\']title["\'][^>]*>([^<]+)', webpage, diff --git a/youtube_dl/extractor/pluralsight.py b/youtube_dl/extractor/pluralsight.py index e0cbd045e..e45d9fe55 100644 --- a/youtube_dl/extractor/pluralsight.py +++ b/youtube_dl/extractor/pluralsight.py @@ -40,7 +40,7 @@ class PluralsightIE(PluralsightBaseIE):          'info_dict': {              'id': 'hosting-sql-server-windows-azure-iaas-m7-mgmt-04',              'ext': 'mp4', -            'title': 'Management of SQL Server - Demo Monitoring', +            'title': 'Demo Monitoring',              'duration': 338,          },          'skip': 'Requires pluralsight account credentials', @@ -169,11 +169,10 @@ class PluralsightIE(PluralsightBaseIE):          collection = course['modules'] -        module, clip = None, None +        clip = None          for module_ in collection:              if name in (module_.get('moduleName'), module_.get('name')): -                module = module_                  for clip_ in module_.get('clips', []):                      clip_index = clip_.get('clipIndex')                      if clip_index is None: @@ -187,7 +186,7 @@ class PluralsightIE(PluralsightBaseIE):          if not clip:              raise ExtractorError('Unable to resolve clip') -        title = '%s - %s' % (module['title'], clip['title']) +        title = clip['title']          QUALITIES = {              'low': {'width': 640, 'height': 480}, diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 9b413590a..b25f1f193 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -1,7 +1,9 @@  # coding: utf-8  from __future__ import unicode_literals +import functools  import itertools +import operator  # import os  import re @@ -18,6 +20,7 @@ from ..utils import (      js_to_json,      orderedSet,      # sanitized_Request, +    remove_quotes,      str_to_int,  )  # from ..aes import ( @@ -129,9 +132,32 @@ class PornHubIE(InfoExtractor):          tv_webpage = dl_webpage('tv') -        video_url = self._search_regex( -            r'<video[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//.+?)\1', tv_webpage, -            'video url', group='url') +        assignments = self._search_regex( +            r'(var.+?mediastring.+?)</script>', tv_webpage, +            'encoded url').split(';') + +        js_vars = {} + +        def parse_js_value(inp): +            inp = re.sub(r'/\*(?:(?!\*/).)*?\*/', '', inp) +            if '+' in inp: +                inps = inp.split('+') +                return functools.reduce( +                    operator.concat, map(parse_js_value, inps)) +            inp = inp.strip() +            if inp in js_vars: +                return js_vars[inp] +            return remove_quotes(inp) + +        for assn in assignments: +            assn = assn.strip() +            if not assn: +                continue +            assn = re.sub(r'var\s+', '', assn) +            vname, value = assn.split('=', 1) +            js_vars[vname] = parse_js_value(value) + +        video_url = js_vars['mediastring']          title = self._search_regex(              r'<h1>([^>]+)</h1>', tv_webpage, 'title', default=None) diff --git a/youtube_dl/extractor/prosiebensat1.py b/youtube_dl/extractor/prosiebensat1.py index 1245309a7..d8a4bd244 100644 --- a/youtube_dl/extractor/prosiebensat1.py +++ b/youtube_dl/extractor/prosiebensat1.py @@ -301,6 +301,21 @@ class ProSiebenSat1IE(ProSiebenSat1BaseIE):              },          },          { +            # title in <h2 class="subtitle"> +            'url': 'http://www.prosieben.de/stars/oscar-award/videos/jetzt-erst-enthuellt-das-geheimnis-von-emma-stones-oscar-robe-clip', +            'info_dict': { +                'id': '4895826', +                'ext': 'mp4', +                'title': 'Jetzt erst enthüllt: Das Geheimnis von Emma Stones Oscar-Robe', +                'description': 'md5:e5ace2bc43fadf7b63adc6187e9450b9', +                'upload_date': '20170302', +            }, +            'params': { +                'skip_download': True, +            }, +            'skip': 'geo restricted to Germany', +        }, +        {              # geo restricted to Germany              'url': 'http://www.kabeleinsdoku.de/tv/mayday-alarm-im-cockpit/video/102-notlandung-im-hudson-river-ganze-folge',              'only_matching': True, @@ -338,6 +353,7 @@ class ProSiebenSat1IE(ProSiebenSat1BaseIE):          r'<header class="module_header">\s*<h2>([^<]+)</h2>\s*</header>',          r'<h2 class="video-title" itemprop="name">\s*(.+?)</h2>',          r'<div[^>]+id="veeseoTitle"[^>]*>(.+?)</div>', +        r'<h2[^>]+class="subtitle"[^>]*>([^<]+)</h2>',      ]      _DESCRIPTION_REGEXES = [          r'<p itemprop="description">\s*(.+?)</p>', @@ -369,7 +385,9 @@ class ProSiebenSat1IE(ProSiebenSat1BaseIE):      def _extract_clip(self, url, webpage):          clip_id = self._html_search_regex(              self._CLIPID_REGEXES, webpage, 'clip id') -        title = self._html_search_regex(self._TITLE_REGEXES, webpage, 'title') +        title = self._html_search_regex( +            self._TITLE_REGEXES, webpage, 'title', +            default=None) or self._og_search_title(webpage)          info = self._extract_video_info(url, clip_id)          description = self._html_search_regex(              self._DESCRIPTION_REGEXES, webpage, 'description', default=None) diff --git a/youtube_dl/extractor/redbulltv.py b/youtube_dl/extractor/redbulltv.py new file mode 100644 index 000000000..afab62426 --- /dev/null +++ b/youtube_dl/extractor/redbulltv.py @@ -0,0 +1,122 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_HTTPError +from ..utils import ( +    float_or_none, +    int_or_none, +    try_get, +    # unified_timestamp, +    ExtractorError, +) + + +class RedBullTVIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?redbull\.tv/(?:video|film)/(?P<id>AP-\w+)' +    _TESTS = [{ +        # film +        'url': 'https://www.redbull.tv/video/AP-1Q756YYX51W11/abc-of-wrc', +        'md5': 'fb0445b98aa4394e504b413d98031d1f', +        'info_dict': { +            'id': 'AP-1Q756YYX51W11', +            'ext': 'mp4', +            'title': 'ABC of...WRC', +            'description': 'md5:5c7ed8f4015c8492ecf64b6ab31e7d31', +            'duration': 1582.04, +            # 'timestamp': 1488405786, +            # 'upload_date': '20170301', +        }, +    }, { +        # episode +        'url': 'https://www.redbull.tv/video/AP-1PMT5JCWH1W11/grime?playlist=shows:shows-playall:web', +        'info_dict': { +            'id': 'AP-1PMT5JCWH1W11', +            'ext': 'mp4', +            'title': 'Grime - Hashtags S2 E4', +            'description': 'md5:334b741c8c1ce65be057eab6773c1cf5', +            'duration': 904.6, +            # 'timestamp': 1487290093, +            # 'upload_date': '20170217', +            'series': 'Hashtags', +            'season_number': 2, +            'episode_number': 4, +        }, +    }, { +        'url': 'https://www.redbull.tv/film/AP-1MSKKF5T92111/in-motion', +        'only_matching': True, +    }] + +    def _real_extract(self, url): +        video_id = self._match_id(url) + +        session = self._download_json( +            'https://api-v2.redbull.tv/session', video_id, +            note='Downloading access token', query={ +                'build': '4.370.0', +                'category': 'personal_computer', +                'os_version': '1.0', +                'os_family': 'http', +            }) +        if session.get('code') == 'error': +            raise ExtractorError('%s said: %s' % ( +                self.IE_NAME, session['message'])) +        auth = '%s %s' % (session.get('token_type', 'Bearer'), session['access_token']) + +        try: +            info = self._download_json( +                'https://api-v2.redbull.tv/content/%s' % video_id, +                video_id, note='Downloading video information', +                headers={'Authorization': auth} +            ) +        except ExtractorError as e: +            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404: +                error_message = self._parse_json( +                    e.cause.read().decode(), video_id)['message'] +                raise ExtractorError('%s said: %s' % ( +                    self.IE_NAME, error_message), expected=True) +            raise + +        video = info['video_product'] + +        title = info['title'].strip() + +        formats = self._extract_m3u8_formats( +            video['url'], video_id, 'mp4', 'm3u8_native') +        self._sort_formats(formats) + +        subtitles = {} +        for _, captions in (try_get( +                video, lambda x: x['attachments']['captions'], +                dict) or {}).items(): +            if not captions or not isinstance(captions, list): +                continue +            for caption in captions: +                caption_url = caption.get('url') +                if not caption_url: +                    continue +                ext = caption.get('format') +                if ext == 'xml': +                    ext = 'ttml' +                subtitles.setdefault(caption.get('lang') or 'en', []).append({ +                    'url': caption_url, +                    'ext': ext, +                }) + +        subheading = info.get('subheading') +        if subheading: +            title += ' - %s' % subheading + +        return { +            'id': video_id, +            'title': title, +            'description': info.get('long_description') or info.get( +                'short_description'), +            'duration': float_or_none(video.get('duration'), scale=1000), +            # 'timestamp': unified_timestamp(info.get('published')), +            'series': info.get('show_title'), +            'season_number': int_or_none(info.get('season_number')), +            'episode_number': int_or_none(info.get('episode_number')), +            'formats': formats, +            'subtitles': subtitles, +        } diff --git a/youtube_dl/extractor/rutube.py b/youtube_dl/extractor/rutube.py index fd1df925b..889fa7628 100644 --- a/youtube_dl/extractor/rutube.py +++ b/youtube_dl/extractor/rutube.py @@ -17,7 +17,7 @@ from ..utils import (  class RutubeIE(InfoExtractor):      IE_NAME = 'rutube'      IE_DESC = 'Rutube videos' -    _VALID_URL = r'https?://rutube\.ru/(?:video|play/embed)/(?P<id>[\da-z]{32})' +    _VALID_URL = r'https?://rutube\.ru/(?:video|(?:play/)?embed)/(?P<id>[\da-z]{32})'      _TESTS = [{          'url': 'http://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/', @@ -39,8 +39,17 @@ class RutubeIE(InfoExtractor):      }, {          'url': 'http://rutube.ru/play/embed/a10e53b86e8f349080f718582ce4c661',          'only_matching': True, +    }, { +        'url': 'http://rutube.ru/embed/a10e53b86e8f349080f718582ce4c661', +        'only_matching': True,      }] +    @staticmethod +    def _extract_urls(webpage): +        return [mobj.group('url') for mobj in re.finditer( +            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//rutube\.ru/embed/[\da-z]{32}.*?)\1', +            webpage)] +      def _real_extract(self, url):          video_id = self._match_id(url)          video = self._download_json( diff --git a/youtube_dl/extractor/ruutu.py b/youtube_dl/extractor/ruutu.py index 20d01754a..6c09df25a 100644 --- a/youtube_dl/extractor/ruutu.py +++ b/youtube_dl/extractor/ruutu.py @@ -82,6 +82,9 @@ class RuutuIE(InfoExtractor):                          formats.extend(self._extract_f4m_formats(                              video_url, video_id, f4m_id='hds', fatal=False))                      elif ext == 'mpd': +                        # video-only and audio-only streams are of different +                        # duration resulting in out of sync issue +                        continue                          formats.extend(self._extract_mpd_formats(                              video_url, video_id, mpd_id='dash', fatal=False))                      else: diff --git a/youtube_dl/extractor/senateisvp.py b/youtube_dl/extractor/senateisvp.py index 387a4f7f6..db5ef8b57 100644 --- a/youtube_dl/extractor/senateisvp.py +++ b/youtube_dl/extractor/senateisvp.py @@ -89,7 +89,7 @@ class SenateISVPIE(InfoExtractor):      @staticmethod      def _search_iframe_url(webpage):          mobj = re.search( -            r"<iframe[^>]+src=['\"](?P<url>http://www\.senate\.gov/isvp/?\?[^'\"]+)['\"]", +            r"<iframe[^>]+src=['\"](?P<url>https?://www\.senate\.gov/isvp/?\?[^'\"]+)['\"]",              webpage)          if mobj:              return mobj.group('url') diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index b3aa4ce26..0ee4a8ff8 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -121,7 +121,7 @@ class SoundcloudIE(InfoExtractor):          },      ] -    _CLIENT_ID = 'fDoItMDbsbZz8dY16ZzARCZmzgHBPotA' +    _CLIENT_ID = '2t9loNQH90kzJcsFCODdigxfp325aq4z'      _IPHONE_CLIENT_ID = '376f225bf427445fc4bfb6b99b72e0bf'      @staticmethod diff --git a/youtube_dl/extractor/streamable.py b/youtube_dl/extractor/streamable.py index e973c867c..9f5c237ef 100644 --- a/youtube_dl/extractor/streamable.py +++ b/youtube_dl/extractor/streamable.py @@ -65,7 +65,7 @@ class StreamableIE(InfoExtractor):          # to return video info like the title properly sometimes, and doesn't          # include info like the video duration          video = self._download_json( -            'https://streamable.com/ajax/videos/%s' % video_id, video_id) +            'https://ajax.streamable.com/videos/%s' % video_id, video_id)          # Format IDs:          # 0 The video is being uploaded diff --git a/youtube_dl/extractor/telecinco.py b/youtube_dl/extractor/telecinco.py index d5abfc9e4..fdcc7d573 100644 --- a/youtube_dl/extractor/telecinco.py +++ b/youtube_dl/extractor/telecinco.py @@ -44,6 +44,10 @@ class TelecincoIE(MiTeleBaseIE):      }, {          'url': 'http://www.telecinco.es/espanasinirmaslejos/Espana-gran-destino-turistico_2_1240605043.html',          'only_matching': True, +    }, { +        # ooyala video +        'url': 'http://www.cuatro.com/chesterinlove/a-carta/chester-chester_in_love-chester_edu_2_2331030022.html', +        'only_matching': True,      }]      def _real_extract(self, url): diff --git a/youtube_dl/extractor/telequebec.py b/youtube_dl/extractor/telequebec.py index 82d73c31d..fafaa826f 100644 --- a/youtube_dl/extractor/telequebec.py +++ b/youtube_dl/extractor/telequebec.py @@ -2,15 +2,17 @@  from __future__ import unicode_literals  from .common import InfoExtractor +from ..compat import compat_str  from ..utils import (      int_or_none,      smuggle_url, +    try_get,  )  class TeleQuebecIE(InfoExtractor):      _VALID_URL = r'https?://zonevideo\.telequebec\.tv/media/(?P<id>\d+)' -    _TEST = { +    _TESTS = [{          'url': 'http://zonevideo.telequebec.tv/media/20984/le-couronnement-de-new-york/couronnement-de-new-york',          'md5': 'fe95a0957e5707b1b01f5013e725c90f',          'info_dict': { @@ -18,10 +20,14 @@ class TeleQuebecIE(InfoExtractor):              'ext': 'mp4',              'title': 'Le couronnement de New York',              'description': 'md5:f5b3d27a689ec6c1486132b2d687d432', -            'upload_date': '20160220', -            'timestamp': 1455965438, +            'upload_date': '20170201', +            'timestamp': 1485972222,          } -    } +    }, { +        # no description +        'url': 'http://zonevideo.telequebec.tv/media/30261', +        'only_matching': True, +    }]      def _real_extract(self, url):          media_id = self._match_id(url) @@ -31,9 +37,13 @@ class TeleQuebecIE(InfoExtractor):          return {              '_type': 'url_transparent',              'id': media_id, -            'url': smuggle_url('limelight:media:' + media_data['streamInfo']['sourceId'], {'geo_countries': ['CA']}), +            'url': smuggle_url( +                'limelight:media:' + media_data['streamInfo']['sourceId'], +                {'geo_countries': ['CA']}),              'title': media_data['title'], -            'description': media_data.get('descriptions', [{'text': None}])[0].get('text'), -            'duration': int_or_none(media_data.get('durationInMilliseconds'), 1000), +            'description': try_get( +                media_data, lambda x: x['descriptions'][0]['text'], compat_str), +            'duration': int_or_none( +                media_data.get('durationInMilliseconds'), 1000),              'ie_key': 'LimelightMedia',          } diff --git a/youtube_dl/extractor/toongoggles.py b/youtube_dl/extractor/toongoggles.py new file mode 100644 index 000000000..b5ba1c01d --- /dev/null +++ b/youtube_dl/extractor/toongoggles.py @@ -0,0 +1,81 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( +    int_or_none, +    parse_duration, +) + + +class ToonGogglesIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?toongoggles\.com/shows/(?P<show_id>\d+)(?:/[^/]+/episodes/(?P<episode_id>\d+))?' +    _TESTS = [{ +        'url': 'http://www.toongoggles.com/shows/217143/bernard-season-2/episodes/217147/football', +        'md5': '18289fc2b951eff6b953a9d8f01e6831', +        'info_dict': { +            'id': '217147', +            'ext': 'mp4', +            'title': 'Football', +            'uploader_id': '1', +            'description': 'Bernard decides to play football in order to be better than Lloyd and tries to beat him no matter how, he even cheats.', +            'upload_date': '20160718', +            'timestamp': 1468879330, +        } +    }, { +        'url': 'http://www.toongoggles.com/shows/227759/om-nom-stories-around-the-world', +        'info_dict': { +            'id': '227759', +            'title': 'Om Nom Stories Around The World', +        }, +        'playlist_mincount': 11, +    }] + +    def _call_api(self, action, page_id, query): +        query.update({ +            'for_ng': 1, +            'for_web': 1, +            'show_meta': 1, +            'version': 7.0, +        }) +        return self._download_json('http://api.toongoggles.com/' + action, page_id, query=query) + +    def _parse_episode_data(self, episode_data): +        title = episode_data['episode_name'] + +        return { +            '_type': 'url_transparent', +            'id': episode_data['episode_id'], +            'title': title, +            'url': 'kaltura:513551:' + episode_data['entry_id'], +            'thumbnail': episode_data.get('thumbnail_url'), +            'description': episode_data.get('description'), +            'duration': parse_duration(episode_data.get('hms')), +            'series': episode_data.get('show_name'), +            'season_number': int_or_none(episode_data.get('season_num')), +            'episode_id': episode_data.get('episode_id'), +            'episode': title, +            'episode_number': int_or_none(episode_data.get('episode_num')), +            'categories': episode_data.get('categories'), +            'ie_key': 'Kaltura', +        } + +    def _real_extract(self, url): +        show_id, episode_id = re.match(self._VALID_URL, url).groups() +        if episode_id: +            episode_data = self._call_api('search', episode_id, { +                'filter': 'episode', +                'id': episode_id, +            })['objects'][0] +            return self._parse_episode_data(episode_data) +        else: +            show_data = self._call_api('getepisodesbyshow', show_id, { +                'max': 1000000000, +                'showid': show_id, +            }) +            entries = [] +            for episode_data in show_data.get('objects', []): +                entries.append(self._parse_episode_data(episode_data)) +            return self.playlist_result(entries, show_id, show_data.get('show_name')) diff --git a/youtube_dl/extractor/tunepk.py b/youtube_dl/extractor/tunepk.py new file mode 100644 index 000000000..9d42651ce --- /dev/null +++ b/youtube_dl/extractor/tunepk.py @@ -0,0 +1,90 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( +    int_or_none, +    try_get, +    unified_timestamp, +) + + +class TunePkIE(InfoExtractor): +    _VALID_URL = r'''(?x) +                    https?:// +                        (?: +                            (?:www\.)?tune\.pk/(?:video/|player/embed_player.php?.*?\bvid=)| +                            embed\.tune\.pk/play/ +                        ) +                        (?P<id>\d+) +                    ''' +    _TESTS = [{ +        'url': 'https://tune.pk/video/6919541/maudie-2017-international-trailer-1-ft-ethan-hawke-sally-hawkins', +        'md5': '0c537163b7f6f97da3c5dd1e3ef6dd55', +        'info_dict': { +            'id': '6919541', +            'ext': 'mp4', +            'title': 'Maudie (2017) | International Trailer # 1 ft Ethan Hawke, Sally Hawkins', +            'description': 'md5:eb5a04114fafef5cec90799a93a2d09c', +            'thumbnail': r're:^https?://.*\.jpg$', +            'timestamp': 1487327564, +            'upload_date': '20170217', +            'uploader': 'Movie Trailers', +            'duration': 107, +            'view_count': int, +        } +    }, { +        'url': 'https://tune.pk/player/embed_player.php?vid=6919541&folder=2017/02/17/&width=600&height=350&autoplay=no', +        'only_matching': True, +    }, { +        'url': 'https://embed.tune.pk/play/6919541?autoplay=no&ssl=yes&inline=true', +        'only_matching': True, +    }] + +    def _real_extract(self, url): +        video_id = self._match_id(url) + +        webpage = self._download_webpage( +            'https://tune.pk/video/%s' % video_id, video_id) + +        details = self._parse_json( +            self._search_regex( +                r'new\s+TunePlayer\(({.+?})\)\s*;\s*\n', webpage, 'tune player'), +            video_id)['details'] + +        video = details['video'] +        title = video.get('title') or self._og_search_title( +            webpage, default=None) or self._html_search_meta( +            'title', webpage, 'title', fatal=True) + +        formats = self._parse_jwplayer_formats( +            details['player']['sources'], video_id) +        self._sort_formats(formats) + +        description = self._og_search_description( +            webpage, default=None) or self._html_search_meta( +            'description', webpage, 'description') + +        thumbnail = video.get('thumb') or self._og_search_thumbnail( +            webpage, default=None) or self._html_search_meta( +            'thumbnail', webpage, 'thumbnail') + +        timestamp = unified_timestamp(video.get('date_added')) +        uploader = try_get( +            video, lambda x: x['uploader']['name'], +            compat_str) or self._html_search_meta('author', webpage, 'author') + +        duration = int_or_none(video.get('duration')) +        view_count = int_or_none(video.get('views')) + +        return { +            'id': video_id, +            'title': title, +            'description': description, +            'thumbnail': thumbnail, +            'timestamp': timestamp, +            'uploader': uploader, +            'duration': duration, +            'view_count': view_count, +            'formats': formats, +        } diff --git a/youtube_dl/extractor/twentyfourvideo.py b/youtube_dl/extractor/twentyfourvideo.py index f3541b654..7af11659f 100644 --- a/youtube_dl/extractor/twentyfourvideo.py +++ b/youtube_dl/extractor/twentyfourvideo.py @@ -1,6 +1,8 @@  # coding: utf-8  from __future__ import unicode_literals +import re +  from .common import InfoExtractor  from ..utils import (      parse_iso8601, @@ -12,7 +14,7 @@ from ..utils import (  class TwentyFourVideoIE(InfoExtractor):      IE_NAME = '24video' -    _VALID_URL = r'https?://(?:www\.)?24video\.(?:net|me|xxx|sex|tube)/(?:video/(?:view|xml)/|player/new24_play\.swf\?id=)(?P<id>\d+)' +    _VALID_URL = r'https?://(?P<host>(?:www\.)?24video\.(?:net|me|xxx|sex|tube))/(?:video/(?:view|xml)/|player/new24_play\.swf\?id=)(?P<id>\d+)'      _TESTS = [{          'url': 'http://www.24video.net/video/view/1044982', @@ -43,10 +45,12 @@ class TwentyFourVideoIE(InfoExtractor):      }]      def _real_extract(self, url): -        video_id = self._match_id(url) +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('id') +        host = mobj.group('host')          webpage = self._download_webpage( -            'http://www.24video.sex/video/view/%s' % video_id, video_id) +            'http://%s/video/view/%s' % (host, video_id), video_id)          title = self._og_search_title(webpage)          description = self._html_search_regex( @@ -72,11 +76,11 @@ class TwentyFourVideoIE(InfoExtractor):          # Sets some cookies          self._download_xml( -            r'http://www.24video.sex/video/xml/%s?mode=init' % video_id, +            r'http://%s/video/xml/%s?mode=init' % (host, video_id),              video_id, 'Downloading init XML')          video_xml = self._download_xml( -            'http://www.24video.sex/video/xml/%s?mode=play' % video_id, +            'http://%s/video/xml/%s?mode=play' % (host, video_id),              video_id, 'Downloading video XML')          video = xpath_element(video_xml, './/video', 'video', fatal=True) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index bbba394b0..2daf9dfac 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -12,7 +12,6 @@ from ..compat import (      compat_str,      compat_urllib_parse_urlencode,      compat_urllib_parse_urlparse, -    compat_urlparse,  )  from ..utils import (      clean_html, @@ -24,6 +23,7 @@ from ..utils import (      parse_iso8601,      update_url_query,      urlencode_postdata, +    urljoin,  ) @@ -32,7 +32,7 @@ class TwitchBaseIE(InfoExtractor):      _API_BASE = 'https://api.twitch.tv'      _USHER_BASE = 'https://usher.ttvnw.net' -    _LOGIN_URL = 'http://www.twitch.tv/login' +    _LOGIN_URL = 'https://www.twitch.tv/login'      _CLIENT_ID = 'jzkbprff40iqj646a697cyrvl0zt2m6'      _NETRC_MACHINE = 'twitch' @@ -64,6 +64,35 @@ class TwitchBaseIE(InfoExtractor):              raise ExtractorError(                  'Unable to login. Twitch said: %s' % message, expected=True) +        def login_step(page, urlh, note, data): +            form = self._hidden_inputs(page) +            form.update(data) + +            page_url = urlh.geturl() +            post_url = self._search_regex( +                r'<form[^>]+action=(["\'])(?P<url>.+?)\1', page, +                'post url', default=page_url, group='url') +            post_url = urljoin(page_url, post_url) + +            headers = {'Referer': page_url} + +            try: +                response = self._download_json( +                    post_url, None, note, +                    data=urlencode_postdata(form), +                    headers=headers) +            except ExtractorError as e: +                if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: +                    response = self._parse_json( +                        e.cause.read().decode('utf-8'), None) +                    fail(response['message']) +                raise + +            redirect_url = urljoin(post_url, response['redirect']) +            return self._download_webpage_handle( +                redirect_url, None, 'Downloading login redirect page', +                headers=headers) +          login_page, handle = self._download_webpage_handle(              self._LOGIN_URL, None, 'Downloading login page') @@ -71,40 +100,19 @@ class TwitchBaseIE(InfoExtractor):          if 'blacklist_message' in login_page:              fail(clean_html(login_page)) -        login_form = self._hidden_inputs(login_page) - -        login_form.update({ -            'username': username, -            'password': password, -        }) - -        redirect_url = handle.geturl() - -        post_url = self._search_regex( -            r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page, -            'post url', default=redirect_url, group='url') - -        if not post_url.startswith('http'): -            post_url = compat_urlparse.urljoin(redirect_url, post_url) - -        headers = {'Referer': redirect_url} +        redirect_page, handle = login_step( +            login_page, handle, 'Logging in as %s' % username, { +                'username': username, +                'password': password, +            }) -        try: -            response = self._download_json( -                post_url, None, 'Logging in as %s' % username, -                data=urlencode_postdata(login_form), -                headers=headers) -        except ExtractorError as e: -            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: -                response = self._parse_json( -                    e.cause.read().decode('utf-8'), None) -                fail(response['message']) -            raise - -        if response.get('redirect'): -            self._download_webpage( -                response['redirect'], None, 'Downloading login redirect page', -                headers=headers) +        if re.search(r'(?i)<form[^>]+id="two-factor-submit"', redirect_page) is not None: +            # TODO: Add mechanism to request an SMS or phone call +            tfa_token = self._get_tfa_info('two-factor authentication token') +            login_step(redirect_page, handle, 'Submitting TFA token', { +                'authy_token': tfa_token, +                'remember_2fa': 'true', +            })      def _prefer_source(self, formats):          try: diff --git a/youtube_dl/extractor/vier.py b/youtube_dl/extractor/vier.py index d26fb49b3..5086f591e 100644 --- a/youtube_dl/extractor/vier.py +++ b/youtube_dl/extractor/vier.py @@ -9,7 +9,7 @@ from .common import InfoExtractor  class VierIE(InfoExtractor):      IE_NAME = 'vier' -    _VALID_URL = r'https?://(?:www\.)?vier\.be/(?:[^/]+/videos/(?P<display_id>[^/]+)(?:/(?P<id>\d+))?|video/v3/embed/(?P<embed_id>\d+))' +    _VALID_URL = r'https?://(?:www\.)?(?P<site>vier|vijf)\.be/(?:[^/]+/videos/(?P<display_id>[^/]+)(?:/(?P<id>\d+))?|video/v3/embed/(?P<embed_id>\d+))'      _TESTS = [{          'url': 'http://www.vier.be/planb/videos/het-wordt-warm-de-moestuin/16129',          'info_dict': { @@ -24,6 +24,19 @@ class VierIE(InfoExtractor):              'skip_download': True,          },      }, { +        'url': 'http://www.vijf.be/temptationisland/videos/zo-grappig-temptation-island-hosts-moeten-kiezen-tussen-onmogelijke-dilemmas/2561614', +        'info_dict': { +            'id': '2561614', +            'display_id': 'zo-grappig-temptation-island-hosts-moeten-kiezen-tussen-onmogelijke-dilemmas', +            'ext': 'mp4', +            'title': 'ZO grappig: Temptation Island hosts moeten kiezen tussen onmogelijke dilemma\'s', +            'description': 'Het spel is simpel: Annelien Coorevits en Rick Brandsteder krijgen telkens 2 dilemma\'s voorgeschoteld en ze MOETEN een keuze maken.', +        }, +        'params': { +            # m3u8 download +            'skip_download': True, +        }, +    }, {          'url': 'http://www.vier.be/planb/videos/mieren-herders-van-de-bladluizen',          'only_matching': True,      }, { @@ -35,6 +48,7 @@ class VierIE(InfoExtractor):          mobj = re.match(self._VALID_URL, url)          embed_id = mobj.group('embed_id')          display_id = mobj.group('display_id') or embed_id +        site = mobj.group('site')          webpage = self._download_webpage(url, display_id) @@ -43,7 +57,7 @@ class VierIE(InfoExtractor):              webpage, 'video id')          application = self._search_regex(              [r'data-application="([^"]+)"', r'"application"\s*:\s*"([^"]+)"'], -            webpage, 'application', default='vier_vod') +            webpage, 'application', default=site + '_vod')          filename = self._search_regex(              [r'data-filename="([^"]+)"', r'"filename"\s*:\s*"([^"]+)"'],              webpage, 'filename') @@ -68,7 +82,7 @@ class VierIE(InfoExtractor):  class VierVideosIE(InfoExtractor):      IE_NAME = 'vier:videos' -    _VALID_URL = r'https?://(?:www\.)?vier\.be/(?P<program>[^/]+)/videos(?:\?.*\bpage=(?P<page>\d+)|$)' +    _VALID_URL = r'https?://(?:www\.)?(?P<site>vier|vijf)\.be/(?P<program>[^/]+)/videos(?:\?.*\bpage=(?P<page>\d+)|$)'      _TESTS = [{          'url': 'http://www.vier.be/demoestuin/videos',          'info_dict': { @@ -76,6 +90,12 @@ class VierVideosIE(InfoExtractor):          },          'playlist_mincount': 153,      }, { +        'url': 'http://www.vijf.be/temptationisland/videos', +        'info_dict': { +            'id': 'temptationisland', +        }, +        'playlist_mincount': 159, +    }, {          'url': 'http://www.vier.be/demoestuin/videos?page=6',          'info_dict': {              'id': 'demoestuin-page6', @@ -92,6 +112,7 @@ class VierVideosIE(InfoExtractor):      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url)          program = mobj.group('program') +        site = mobj.group('site')          page_id = mobj.group('page')          if page_id: @@ -105,13 +126,13 @@ class VierVideosIE(InfoExtractor):          entries = []          for current_page_id in itertools.count(start_page):              current_page = self._download_webpage( -                'http://www.vier.be/%s/videos?page=%d' % (program, current_page_id), +                'http://www.%s.be/%s/videos?page=%d' % (site, program, current_page_id),                  program,                  'Downloading page %d' % (current_page_id + 1))              page_entries = [ -                self.url_result('http://www.vier.be' + video_url, 'Vier') +                self.url_result('http://www.' + site + '.be' + video_url, 'Vier')                  for video_url in re.findall( -                    r'<h3><a href="(/[^/]+/videos/[^/]+(?:/\d+)?)">', current_page)] +                    r'<h[23]><a href="(/[^/]+/videos/[^/]+(?:/\d+)?)">', current_page)]              entries.extend(page_entries)              if page_id or '>Meer<' not in current_page:                  break diff --git a/youtube_dl/extractor/viu.py b/youtube_dl/extractor/viu.py index 3fd889c8e..db6a65d2e 100644 --- a/youtube_dl/extractor/viu.py +++ b/youtube_dl/extractor/viu.py @@ -44,7 +44,7 @@ class ViuBaseIE(InfoExtractor):  class ViuIE(ViuBaseIE): -    _VALID_URL = r'(?:viu:|https?://www\.viu\.com/[a-z]{2}/media/)(?P<id>\d+)' +    _VALID_URL = r'(?:viu:|https?://[^/]+\.viu\.com/[a-z]{2}/media/)(?P<id>\d+)'      _TESTS = [{          'url': 'https://www.viu.com/en/media/1116705532?containerId=playlist-22168059',          'info_dict': { @@ -69,6 +69,9 @@ class ViuIE(ViuBaseIE):              'skip_download': 'm3u8 download',          },          'skip': 'Geo-restricted to Indonesia', +    }, { +        'url': 'https://india.viu.com/en/media/1126286865', +        'only_matching': True,      }]      def _real_extract(self, url): diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index 7c42a4f54..dc2719cf9 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -432,8 +432,7 @@ class VKIE(VKBaseIE):                  })              elif format_id == 'hls':                  formats.extend(self._extract_m3u8_formats( -                    format_url, video_id, 'mp4', -                    entry_protocol='m3u8' if is_live else 'm3u8_native', +                    format_url, video_id, 'mp4', 'm3u8_native',                      m3u8_id=format_id, fatal=False, live=is_live))              elif format_id == 'rtmp':                  formats.append({ diff --git a/youtube_dl/extractor/vrak.py b/youtube_dl/extractor/vrak.py new file mode 100644 index 000000000..daa247cce --- /dev/null +++ b/youtube_dl/extractor/vrak.py @@ -0,0 +1,80 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from .brightcove import BrightcoveNewIE +from ..utils import ( +    int_or_none, +    parse_age_limit, +    smuggle_url, +    unescapeHTML, +) + + +class VrakIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?vrak\.tv/videos\?.*?\btarget=(?P<id>[\d.]+)' +    _TEST = { +        'url': 'http://www.vrak.tv/videos?target=1.2306782&filtre=emission&id=1.1806721', +        'info_dict': { +            'id': '5345661243001', +            'ext': 'mp4', +            'title': 'Obésité, film de hockey et Roseline Filion', +            'timestamp': 1488492126, +            'upload_date': '20170302', +            'uploader_id': '2890187628001', +            'creator': 'VRAK.TV', +            'age_limit': 8, +            'series': 'ALT (Actualité Légèrement Tordue)', +            'episode': 'Obésité, film de hockey et Roseline Filion', +            'tags': list, +        }, +        'params': { +            'skip_download': True, +        }, +    } +    BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/2890187628001/default_default/index.html?videoId=%s' + +    def _real_extract(self, url): +        video_id = self._match_id(url) + +        webpage = self._download_webpage(url, video_id) + +        title = self._html_search_regex( +            r'<h\d\b[^>]+\bclass=["\']videoTitle["\'][^>]*>([^<]+)', +            webpage, 'title', default=None) or self._og_search_title(webpage) + +        content = self._parse_json( +            self._search_regex( +                r'data-player-options-content=(["\'])(?P<content>{.+?})\1', +                webpage, 'content', default='{}', group='content'), +            video_id, transform_source=unescapeHTML) + +        ref_id = content.get('refId') or self._search_regex( +            r'refId":"([^&]+)"', webpage, 'ref id') + +        brightcove_id = self._search_regex( +            r'''(?x) +                java\.lang\.String\s+value\s*=\s*["']brightcove\.article\.\d+\.%s +                [^>]* +                java\.lang\.String\s+value\s*=\s*["'](\d+) +            ''' % re.escape(ref_id), webpage, 'brightcove id') + +        return { +            '_type': 'url_transparent', +            'ie_key': BrightcoveNewIE.ie_key(), +            'url': smuggle_url( +                self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, +                {'geo_countries': ['CA']}), +            'id': brightcove_id, +            'description': content.get('description'), +            'creator': content.get('brand'), +            'age_limit': parse_age_limit(content.get('rating')), +            'series': content.get('showName') or content.get( +                'episodeName'),  # this is intentional +            'season_number': int_or_none(content.get('seasonNumber')), +            'episode': title, +            'episode_number': int_or_none(content.get('episodeNumber')), +            'tags': content.get('tags', []), +        } diff --git a/youtube_dl/extractor/wdr.py b/youtube_dl/extractor/wdr.py index f7e6360a3..8bb7362bb 100644 --- a/youtube_dl/extractor/wdr.py +++ b/youtube_dl/extractor/wdr.py @@ -19,9 +19,10 @@ class WDRBaseIE(InfoExtractor):      def _extract_wdr_video(self, webpage, display_id):          # for wdr.de the data-extension is in a tag with the class "mediaLink"          # for wdr.de radio players, in a tag with the class "wdrrPlayerPlayBtn" -        # for wdrmaus its in a link to the page in a multiline "videoLink"-tag +        # for wdrmaus, in a tag with the class "videoButton" (previously a link +        # to the page in a multiline "videoLink"-tag)          json_metadata = self._html_search_regex( -            r'class=(?:"(?:mediaLink|wdrrPlayerPlayBtn)\b[^"]*"[^>]+|"videoLink\b[^"]*"[\s]*>\n[^\n]*)data-extension="([^"]+)"', +            r'class=(?:"(?:mediaLink|wdrrPlayerPlayBtn|videoButton)\b[^"]*"[^>]+|"videoLink\b[^"]*"[\s]*>\n[^\n]*)data-extension="([^"]+)"',              webpage, 'media link', default=None, flags=re.MULTILINE)          if not json_metadata: @@ -32,7 +33,7 @@ class WDRBaseIE(InfoExtractor):          jsonp_url = media_link_obj['mediaObj']['url']          metadata = self._download_json( -            jsonp_url, 'metadata', transform_source=strip_jsonp) +            jsonp_url, display_id, transform_source=strip_jsonp)          metadata_tracker_data = metadata['trackerData']          metadata_media_resource = metadata['mediaResource'] @@ -161,23 +162,23 @@ class WDRIE(WDRBaseIE):          {              'url': 'http://www.wdrmaus.de/aktuelle-sendung/index.php5',              'info_dict': { -                'id': 'mdb-1096487', -                'ext': 'flv', +                'id': 'mdb-1323501', +                'ext': 'mp4',                  'upload_date': 're:^[0-9]{8}$',                  'title': 're:^Die Sendung mit der Maus vom [0-9.]{10}$', -                'description': '- Die Sendung mit der Maus -', +                'description': 'Die Seite mit der Maus -',              },              'skip': 'The id changes from week to week because of the new episode'          },          { -            'url': 'http://www.wdrmaus.de/sachgeschichten/sachgeschichten/achterbahn.php5', +            'url': 'http://www.wdrmaus.de/filme/sachgeschichten/achterbahn.php5',              'md5': '803138901f6368ee497b4d195bb164f2',              'info_dict': {                  'id': 'mdb-186083',                  'ext': 'mp4',                  'upload_date': '20130919',                  'title': 'Sachgeschichte - Achterbahn ', -                'description': '- Die Sendung mit der Maus -', +                'description': 'Die Seite mit der Maus -',              },          },          { @@ -186,7 +187,7 @@ class WDRIE(WDRBaseIE):              'info_dict': {                  'id': 'mdb-869971',                  'ext': 'flv', -                'title': 'Funkhaus Europa Livestream', +                'title': 'COSMO Livestream',                  'description': 'md5:2309992a6716c347891c045be50992e4',                  'upload_date': '20160101',              }, diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 81c793921..ca40de522 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -59,6 +59,8 @@ class YoutubeBaseInfoExtractor(InfoExtractor):      # If True it will raise an error if no login info is provided      _LOGIN_REQUIRED = False +    _PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL)[0-9A-Za-z-_]{10,}' +      def _set_language(self):          self._set_cookie(              '.youtube.com', 'PREF', 'f1=50000000&hl=en', @@ -265,9 +267,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                           )                       )?                                                       # all until now is optional -> you can pass the naked ID                       ([0-9A-Za-z_-]{11})                                      # here is it! the YouTube video ID -                     (?!.*?\blist=)                                            # combined list/video URLs are handled by the playlist IE +                     (?!.*?\blist= +                        (?: +                            %(playlist_id)s|                                  # combined list/video URLs are handled by the playlist IE +                            WL                                                # WL are handled by the watch later IE +                        ) +                     )                       (?(1).+)?                                                # if we found the ID, everything can follow -                     $""" +                     $""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}      _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'      _formats = {          '5': {'ext': 'flv', 'width': 400, 'height': 240, 'acodec': 'mp3', 'abr': 64, 'vcodec': 'h263'}, @@ -924,6 +931,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):              'url': 'sJL6WA-aGkQ',              'only_matching': True,          }, +        { +            'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM', +            'only_matching': True, +        },      ]      def __init__(self, *args, **kwargs): @@ -1454,7 +1465,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):          # Check for "rental" videos          if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info: -            raise ExtractorError('"rental" videos not supported') +            raise ExtractorError('"rental" videos not supported. See https://github.com/rg3/youtube-dl/issues/359 for more information.', expected=True)          # Start extracting information          self.report_information_extraction(video_id) @@ -1864,8 +1875,8 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):                          )                          .*                       | -                        ((?:PL|LL|EC|UU|FL|RD|UL|TL)[0-9A-Za-z-_]{10,}) -                     )""" +                        (%(playlist_id)s) +                     )""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}      _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s&disable_polymer=true'      _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&[^"]*?index=(?P<index>\d+)(?:[^>]+>(?P<title>[^<]+))?'      IE_NAME = 'youtube:playlist' diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 8b51d3c6f..6b811535f 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -773,7 +773,7 @@ def parseOpts(overrideArguments=None):          help='Convert video files to audio-only files (requires ffmpeg or avconv and ffprobe or avprobe)')      postproc.add_option(          '--audio-format', metavar='FORMAT', dest='audioformat', default='best', -        help='Specify audio format: "best", "aac", "vorbis", "mp3", "m4a", "opus", or "wav"; "%default" by default; No effect without -x') +        help='Specify audio format: "best", "aac", "flac", "mp3", "m4a", "opus", "vorbis", or "wav"; "%default" by default; No effect without -x')      postproc.add_option(          '--audio-quality', metavar='QUALITY',          dest='audioquality', default='5', diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index 96ddb3b36..7c162d92a 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -26,15 +26,25 @@ from ..utils import (  EXT_TO_OUT_FORMATS = { -    "aac": "adts", -    "m4a": "ipod", -    "mka": "matroska", -    "mkv": "matroska", -    "mpg": "mpeg", -    "ogv": "ogg", -    "ts": "mpegts", -    "wma": "asf", -    "wmv": "asf", +    'aac': 'adts', +    'flac': 'flac', +    'm4a': 'ipod', +    'mka': 'matroska', +    'mkv': 'matroska', +    'mpg': 'mpeg', +    'ogv': 'ogg', +    'ts': 'mpegts', +    'wma': 'asf', +    'wmv': 'asf', +} +ACODECS = { +    'mp3': 'libmp3lame', +    'aac': 'aac', +    'flac': 'flac', +    'm4a': 'aac', +    'opus': 'opus', +    'vorbis': 'libvorbis', +    'wav': None,  } @@ -237,7 +247,7 @@ class FFmpegExtractAudioPP(FFmpegPostProcessor):                  acodec = 'copy'                  extension = 'm4a'                  more_opts = ['-bsf:a', 'aac_adtstoasc'] -            elif filecodec in ['aac', 'mp3', 'vorbis', 'opus']: +            elif filecodec in ['aac', 'flac', 'mp3', 'vorbis', 'opus']:                  # Lossless if possible                  acodec = 'copy'                  extension = filecodec @@ -256,8 +266,8 @@ class FFmpegExtractAudioPP(FFmpegPostProcessor):                      else:                          more_opts += ['-b:a', self._preferredquality + 'k']          else: -            # We convert the audio (lossy) -            acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'm4a': 'aac', 'opus': 'opus', 'vorbis': 'libvorbis', 'wav': None}[self._preferredcodec] +            # We convert the audio (lossy if codec is lossy) +            acodec = ACODECS[self._preferredcodec]              extension = self._preferredcodec              more_opts = []              if self._preferredquality is not None: diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 807183f4a..2340bc306 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -39,6 +39,7 @@ from .compat import (      compat_basestring,      compat_chr,      compat_etree_fromstring, +    compat_expanduser,      compat_html_entities,      compat_html_entities_html5,      compat_http_client, @@ -473,7 +474,8 @@ def timeconvert(timestr):  def sanitize_filename(s, restricted=False, is_id=False):      """Sanitizes a string so it could be used as part of a filename.      If restricted is set, use a stricter subset of allowed characters. -    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible +    Set is_id if this is not an arbitrary string, but an ID that should be kept +    if possible.      """      def replace_insane(char):          if restricted and char in ACCENT_CHARS: @@ -538,6 +540,11 @@ def sanitized_Request(url, *args, **kwargs):      return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs) +def expand_path(s): +    """Expand shell variables and ~""" +    return os.path.expandvars(compat_expanduser(s)) + +  def orderedSet(iterable):      """ Remove all duplicates from the input iterable """      res = [] @@ -1747,11 +1754,16 @@ def base_url(url):  def urljoin(base, path): +    if isinstance(path, bytes): +        path = path.decode('utf-8')      if not isinstance(path, compat_str) or not path:          return None      if re.match(r'^(?:https?:)?//', path):          return path -    if not isinstance(base, compat_str) or not re.match(r'^(?:https?:)?//', base): +    if isinstance(base, bytes): +        base = base.decode('utf-8') +    if not isinstance(base, compat_str) or not re.match( +            r'^(?:https?:)?//', base):          return None      return compat_urlparse.urljoin(base, path) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 261218b80..13904c724 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@  from __future__ import unicode_literals -__version__ = '2017.02.27' +__version__ = '2017.03.24' | 
