diff options
53 files changed, 1022 insertions, 272 deletions
| diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 42333c450..f8ab29631 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -125,7 +125,7 @@ If you want to add support for a new site, you can follow this quick list (assum      ```  5. Add an import in [`youtube_dl/extractor/__init__.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/__init__.py).  6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will be then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. -7. Have a look at [`youtube_dl/common/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L62-L200). Add tests and code for as many as you want. +7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L62-L200). Add tests and code for as many as you want.  8. If you can, check the code with [flake8](https://pypi.python.org/pypi/flake8).  9. When the tests pass, [add](http://git-scm.com/docs/git-add) the new files and [commit](http://git-scm.com/docs/git-commit) them and [push](http://git-scm.com/docs/git-push) the result, like this: @@ -34,7 +34,7 @@ You can also use pip:      sudo pip install youtube-dl -Alternatively, refer to the developer instructions below for how to check out and work with the git repository. For further options, including PGP signatures, see https://rg3.github.io/youtube-dl/download.html . +Alternatively, refer to the [developer instructions](#developer-instructions) for how to check out and work with the git repository. For further options, including PGP signatures, see https://rg3.github.io/youtube-dl/download.html .  # DESCRIPTION  **youtube-dl** is a small command-line program to download videos from @@ -207,7 +207,7 @@ which means you can modify it, redistribute it or use it however you like.      -p, --password PASSWORD          Account password. If this option is left out, youtube-dl will ask interactively.      -2, --twofactor TWOFACTOR        Two-factor auth code      -n, --netrc                      Use .netrc authentication data -    --video-password PASSWORD        Video password (vimeo, smotri) +    --video-password PASSWORD        Video password (vimeo, smotri, youku)  ## Post-processing Options:      -x, --extract-audio              Convert video files to audio-only files (requires ffmpeg or avconv and ffprobe or avprobe) @@ -552,7 +552,7 @@ If you want to add support for a new site, you can follow this quick list (assum      ```  5. Add an import in [`youtube_dl/extractor/__init__.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/__init__.py).  6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will be then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. -7. Have a look at [`youtube_dl/common/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L62-L200). Add tests and code for as many as you want. +7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L62-L200). Add tests and code for as many as you want.  8. If you can, check the code with [flake8](https://pypi.python.org/pypi/flake8).  9. When the tests pass, [add](http://git-scm.com/docs/git-add) the new files and [commit](http://git-scm.com/docs/git-commit) them and [push](http://git-scm.com/docs/git-push) the result, like this: diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 9099e2da4..04b9959ac 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -166,7 +166,7 @@   - **Folketinget**: Folketinget (ft.dk; Danish parliament)   - **FootyRoom**   - **Foxgay** - - **FoxNews** + - **FoxNews**: Fox News and Fox Business Video   - **FoxSports**   - **france2.fr:generation-quoi**   - **FranceCulture** @@ -220,6 +220,7 @@   - **imdb**: Internet Movie Database trailers   - **imdb:list**: Internet Movie Database lists   - **Imgur** + - **ImgurAlbum**   - **Ina**   - **Indavideo**   - **IndavideoEmbed** @@ -301,13 +302,16 @@   - **Moviezine**   - **movshare**: MovShare   - **MPORA** + - **MSNBC**   - **MTV** + - **mtv.de**   - **mtviggy.com**   - **mtvservices:embedded**   - **MuenchenTV**: münchen.tv   - **MusicPlayOn**   - **MusicVault**   - **muzu.tv** + - **Mwave**   - **MySpace**   - **MySpace:album**   - **MySpass** @@ -392,6 +396,8 @@   - **Playtvak**: Playtvak.cz, iDNES.cz and Lidovky.cz   - **Playvid**   - **Playwire** + - **pluralsight** + - **pluralsight:course**   - **plus.google**: Google Plus   - **pluzz.francetv.fr**   - **podomatic** @@ -461,7 +467,7 @@   - **Sexu**   - **SexyKarma**: Sexy Karma and Watch Indian Porn   - **Shahid** - - **Shared** + - **Shared**: shared.sx and vivo.sx   - **ShareSix**   - **Sina**   - **Slideshare** @@ -534,6 +540,7 @@   - **TF1**   - **TheOnion**   - **ThePlatform** + - **ThePlatformFeed**   - **TheSixtyOne**   - **ThisAmericanLife**   - **ThisAV** @@ -599,7 +606,6 @@   - **Viddler**   - **video.google:search**: Google Video search   - **video.mit.edu** - - **VideoBam**   - **VideoDetective**   - **videofy.me**   - **videolectures.net** diff --git a/test/test_subtitles.py b/test/test_subtitles.py index c4e3adb67..0343967d9 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -25,6 +25,7 @@ from youtube_dl.extractor import (      RaiIE,      VikiIE,      ThePlatformIE, +    ThePlatformFeedIE,      RTVEALaCartaIE,      FunnyOrDieIE,  ) @@ -307,6 +308,18 @@ class TestThePlatformSubtitles(BaseTestSubtitles):          self.assertEqual(md5(subtitles['en']), '97e7670cbae3c4d26ae8bcc7fdd78d4b') +class TestThePlatformFeedSubtitles(BaseTestSubtitles): +    url = 'http://feed.theplatform.com/f/7wvmTC/msnbc_video-p-test?form=json&pretty=true&range=-40&byGuid=n_hardball_5biden_140207' +    IE = ThePlatformFeedIE + +    def test_allsubtitles(self): +        self.DL.params['writesubtitles'] = True +        self.DL.params['allsubtitles'] = True +        subtitles = self.getSubtitles() +        self.assertEqual(set(subtitles.keys()), set(['en'])) +        self.assertEqual(md5(subtitles['en']), '48649a22e82b2da21c9a67a395eedade') + +  class TestRtveSubtitles(BaseTestSubtitles):      url = 'http://www.rtve.es/alacarta/videos/los-misterios-de-laura/misterios-laura-capitulo-32-misterio-del-numero-17-2-parte/2428621/'      IE = RTVEALaCartaIE diff --git a/test/test_utils.py b/test/test_utils.py index a759b2da9..a5f164c49 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -57,11 +57,16 @@ from youtube_dl.utils import (      urlencode_postdata,      version_tuple,      xpath_with_ns, +    xpath_element,      xpath_text, +    xpath_attr,      render_table,      match_str,      parse_dfxp_time_expr,      dfxp2srt, +    cli_option, +    cli_valueless_option, +    cli_bool_option,  ) @@ -264,6 +269,16 @@ class TestUtil(unittest.TestCase):          self.assertEqual(find('media:song/media:author').text, 'The Author')          self.assertEqual(find('media:song/url').text, 'http://server.com/download.mp3') +    def test_xpath_element(self): +        doc = xml.etree.ElementTree.Element('root') +        div = xml.etree.ElementTree.SubElement(doc, 'div') +        p = xml.etree.ElementTree.SubElement(div, 'p') +        p.text = 'Foo' +        self.assertEqual(xpath_element(doc, 'div/p'), p) +        self.assertEqual(xpath_element(doc, 'div/bar', default='default'), 'default') +        self.assertTrue(xpath_element(doc, 'div/bar') is None) +        self.assertRaises(ExtractorError, xpath_element, doc, 'div/bar', fatal=True) +      def test_xpath_text(self):          testxml = '''<root>              <div> @@ -272,9 +287,25 @@ class TestUtil(unittest.TestCase):          </root>'''          doc = xml.etree.ElementTree.fromstring(testxml)          self.assertEqual(xpath_text(doc, 'div/p'), 'Foo') +        self.assertEqual(xpath_text(doc, 'div/bar', default='default'), 'default')          self.assertTrue(xpath_text(doc, 'div/bar') is None)          self.assertRaises(ExtractorError, xpath_text, doc, 'div/bar', fatal=True) +    def test_xpath_attr(self): +        testxml = '''<root> +            <div> +                <p x="a">Foo</p> +            </div> +        </root>''' +        doc = xml.etree.ElementTree.fromstring(testxml) +        self.assertEqual(xpath_attr(doc, 'div/p', 'x'), 'a') +        self.assertEqual(xpath_attr(doc, 'div/bar', 'x'), None) +        self.assertEqual(xpath_attr(doc, 'div/p', 'y'), None) +        self.assertEqual(xpath_attr(doc, 'div/bar', 'x', default='default'), 'default') +        self.assertEqual(xpath_attr(doc, 'div/p', 'y', default='default'), 'default') +        self.assertRaises(ExtractorError, xpath_attr, doc, 'div/bar', 'x', fatal=True) +        self.assertRaises(ExtractorError, xpath_attr, doc, 'div/p', 'y', fatal=True) +      def test_smuggle_url(self):          data = {"ö": "ö", "abc": [3]}          url = 'https://foo.bar/baz?x=y#a' @@ -646,6 +677,51 @@ The first line  '''          self.assertEqual(dfxp2srt(dfxp_data_no_default_namespace), srt_data) +    def test_cli_option(self): +        self.assertEqual(cli_option({'proxy': '127.0.0.1:3128'}, '--proxy', 'proxy'), ['--proxy', '127.0.0.1:3128']) +        self.assertEqual(cli_option({'proxy': None}, '--proxy', 'proxy'), []) +        self.assertEqual(cli_option({}, '--proxy', 'proxy'), []) + +    def test_cli_valueless_option(self): +        self.assertEqual(cli_valueless_option( +            {'downloader': 'external'}, '--external-downloader', 'downloader', 'external'), ['--external-downloader']) +        self.assertEqual(cli_valueless_option( +            {'downloader': 'internal'}, '--external-downloader', 'downloader', 'external'), []) +        self.assertEqual(cli_valueless_option( +            {'nocheckcertificate': True}, '--no-check-certificate', 'nocheckcertificate'), ['--no-check-certificate']) +        self.assertEqual(cli_valueless_option( +            {'nocheckcertificate': False}, '--no-check-certificate', 'nocheckcertificate'), []) +        self.assertEqual(cli_valueless_option( +            {'checkcertificate': True}, '--no-check-certificate', 'checkcertificate', False), []) +        self.assertEqual(cli_valueless_option( +            {'checkcertificate': False}, '--no-check-certificate', 'checkcertificate', False), ['--no-check-certificate']) + +    def test_cli_bool_option(self): +        self.assertEqual( +            cli_bool_option( +                {'nocheckcertificate': True}, '--no-check-certificate', 'nocheckcertificate'), +            ['--no-check-certificate', 'true']) +        self.assertEqual( +            cli_bool_option( +                {'nocheckcertificate': True}, '--no-check-certificate', 'nocheckcertificate', separator='='), +            ['--no-check-certificate=true']) +        self.assertEqual( +            cli_bool_option( +                {'nocheckcertificate': True}, '--check-certificate', 'nocheckcertificate', 'false', 'true'), +            ['--check-certificate', 'false']) +        self.assertEqual( +            cli_bool_option( +                {'nocheckcertificate': True}, '--check-certificate', 'nocheckcertificate', 'false', 'true', '='), +            ['--check-certificate=false']) +        self.assertEqual( +            cli_bool_option( +                {'nocheckcertificate': False}, '--check-certificate', 'nocheckcertificate', 'false', 'true'), +            ['--check-certificate', 'true']) +        self.assertEqual( +            cli_bool_option( +                {'nocheckcertificate': False}, '--check-certificate', 'nocheckcertificate', 'false', 'true', '='), +            ['--check-certificate=true']) +  if __name__ == '__main__':      unittest.main() diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index cad6b026e..982e658ce 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -2009,7 +2009,7 @@ class YoutubeDL(object):                                 (info_dict['extractor'], info_dict['id'], thumb_display_id))                  try:                      uf = self.urlopen(t['url']) -                    with open(thumb_filename, 'wb') as thumbf: +                    with open(encodeFilename(thumb_filename), 'wb') as thumbf:                          shutil.copyfileobj(uf, thumbf)                      self.to_screen('[%s] %s: Writing thumbnail %sto: %s' %                                     (info_dict['extractor'], info_dict['id'], thumb_display_id, thumb_filename)) diff --git a/youtube_dl/downloader/external.py b/youtube_dl/downloader/external.py index 6c310346c..2bc011266 100644 --- a/youtube_dl/downloader/external.py +++ b/youtube_dl/downloader/external.py @@ -5,6 +5,10 @@ import subprocess  from .common import FileDownloader  from ..utils import ( +    cli_option, +    cli_valueless_option, +    cli_bool_option, +    cli_configuration_args,      encodeFilename,      encodeArgument,  ) @@ -46,19 +50,16 @@ class ExternalFD(FileDownloader):          return info_dict['protocol'] in ('http', 'https', 'ftp', 'ftps')      def _option(self, command_option, param): -        param = self.params.get(param) -        if param is None: -            return [] -        if isinstance(param, bool): -            return [command_option] -        return [command_option, param] +        return cli_option(self.params, command_option, param) + +    def _bool_option(self, command_option, param, true_value='true', false_value='false', separator=None): +        return cli_bool_option(self.params, command_option, param, true_value, false_value, separator) + +    def _valueless_option(self, command_option, param, expected_value=True): +        return cli_valueless_option(self.params, command_option, param, expected_value)      def _configuration_args(self, default=[]): -        ex_args = self.params.get('external_downloader_args') -        if ex_args is None: -            return default -        assert isinstance(ex_args, list) -        return ex_args +        return cli_configuration_args(self.params, 'external_downloader_args', default)      def _call_downloader(self, tmpfilename, info_dict):          """ Either overwrite this or implement _make_cmd """ @@ -80,6 +81,8 @@ class CurlFD(ExternalFD):          for key, val in info_dict['http_headers'].items():              cmd += ['--header', '%s: %s' % (key, val)]          cmd += self._option('--interface', 'source_address') +        cmd += self._option('--proxy', 'proxy') +        cmd += self._valueless_option('--insecure', 'nocheckcertificate')          cmd += self._configuration_args()          cmd += ['--', info_dict['url']]          return cmd @@ -102,7 +105,7 @@ class WgetFD(ExternalFD):              cmd += ['--header', '%s: %s' % (key, val)]          cmd += self._option('--bind-address', 'source_address')          cmd += self._option('--proxy', 'proxy') -        cmd += self._option('--no-check-certificate', 'nocheckcertificate') +        cmd += self._valueless_option('--no-check-certificate', 'nocheckcertificate')          cmd += self._configuration_args()          cmd += ['--', info_dict['url']]          return cmd @@ -121,6 +124,7 @@ class Aria2cFD(ExternalFD):              cmd += ['--header', '%s: %s' % (key, val)]          cmd += self._option('--interface', 'source_address')          cmd += self._option('--all-proxy', 'proxy') +        cmd += self._bool_option('--check-certificate', 'nocheckcertificate', 'false', 'true', '=')          cmd += ['--', info_dict['url']]          return cmd diff --git a/youtube_dl/downloader/f4m.py b/youtube_dl/downloader/f4m.py index 275564b59..174180db5 100644 --- a/youtube_dl/downloader/f4m.py +++ b/youtube_dl/downloader/f4m.py @@ -13,6 +13,8 @@ from ..compat import (      compat_urllib_error,  )  from ..utils import ( +    encodeFilename, +    sanitize_open,      struct_pack,      struct_unpack,      xpath_text, @@ -343,18 +345,19 @@ class F4mFD(FragmentFD):                  success = ctx['dl'].download(frag_filename, {'url': url})                  if not success:                      return False -                with open(frag_filename, 'rb') as down: -                    down_data = down.read() -                    reader = FlvReader(down_data) -                    while True: -                        _, box_type, box_data = reader.read_box_info() -                        if box_type == b'mdat': -                            dest_stream.write(box_data) -                            break +                (down, frag_sanitized) = sanitize_open(frag_filename, 'rb') +                down_data = down.read() +                down.close() +                reader = FlvReader(down_data) +                while True: +                    _, box_type, box_data = reader.read_box_info() +                    if box_type == b'mdat': +                        dest_stream.write(box_data) +                        break                  if live: -                    os.remove(frag_filename) +                    os.remove(encodeFilename(frag_sanitized))                  else: -                    frags_filenames.append(frag_filename) +                    frags_filenames.append(frag_sanitized)              except (compat_urllib_error.HTTPError, ) as err:                  if live and (err.code == 404 or err.code == 410):                      # We didn't keep up with the live window. Continue @@ -375,6 +378,6 @@ class F4mFD(FragmentFD):          self._finish_frag_download(ctx)          for frag_file in frags_filenames: -            os.remove(frag_file) +            os.remove(encodeFilename(frag_file))          return True diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py index 2b6c3370f..71aafdc73 100644 --- a/youtube_dl/downloader/hls.py +++ b/youtube_dl/downloader/hls.py @@ -12,6 +12,7 @@ from ..postprocessor.ffmpeg import FFmpegPostProcessor  from ..utils import (      encodeArgument,      encodeFilename, +    sanitize_open,  ) @@ -89,13 +90,13 @@ class NativeHlsFD(FragmentFD):              success = ctx['dl'].download(frag_filename, {'url': frag_url})              if not success:                  return False -            with open(frag_filename, 'rb') as down: -                ctx['dest_stream'].write(down.read()) -            frags_filenames.append(frag_filename) +            down, frag_sanitized = sanitize_open(frag_filename, 'rb') +            ctx['dest_stream'].write(down.read()) +            frags_filenames.append(frag_sanitized)          self._finish_frag_download(ctx)          for frag_file in frags_filenames: -            os.remove(frag_file) +            os.remove(encodeFilename(frag_file))          return True diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 6bee5b63c..5d2ea39d0 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -241,7 +241,10 @@ from .imdb import (      ImdbIE,      ImdbListIE  ) -from .imgur import ImgurIE +from .imgur import ( +    ImgurIE, +    ImgurAlbumIE, +)  from .ina import InaIE  from .indavideo import (      IndavideoIE, @@ -340,6 +343,7 @@ from .mtv import (      MTVIE,      MTVServicesEmbeddedIE,      MTVIggyIE, +    MTVDEIE,  )  from .muenchentv import MuenchenTVIE  from .musicplayon import MusicPlayOnIE @@ -454,6 +458,10 @@ from .playfm import PlayFMIE  from .playtvak import PlaytvakIE  from .playvid import PlayvidIE  from .playwire import PlaywireIE +from .pluralsight import ( +    PluralsightIE, +    PluralsightCourseIE, +)  from .podomatic import PodomaticIE  from .porn91 import Porn91IE  from .pornhd import PornHdIE diff --git a/youtube_dl/extractor/abc.py b/youtube_dl/extractor/abc.py index dc0fb85d6..f9a389f67 100644 --- a/youtube_dl/extractor/abc.py +++ b/youtube_dl/extractor/abc.py @@ -1,16 +1,20 @@  from __future__ import unicode_literals  import re -import json  from .common import InfoExtractor +from ..utils import ( +    ExtractorError, +    js_to_json, +    int_or_none, +)  class ABCIE(InfoExtractor):      IE_NAME = 'abc.net.au'      _VALID_URL = r'http://www\.abc\.net\.au/news/[^/]+/[^/]+/(?P<id>\d+)' -    _TEST = { +    _TESTS = [{          'url': 'http://www.abc.net.au/news/2014-11-05/australia-to-staff-ebola-treatment-centre-in-sierra-leone/5868334',          'md5': 'cb3dd03b18455a661071ee1e28344d9f',          'info_dict': { @@ -19,22 +23,47 @@ class ABCIE(InfoExtractor):              'title': 'Australia to help staff Ebola treatment centre in Sierra Leone',              'description': 'md5:809ad29c67a05f54eb41f2a105693a67',          }, -    } +    }, { +        'url': 'http://www.abc.net.au/news/2015-08-17/warren-entsch-introduces-same-sex-marriage-bill/6702326', +        'md5': 'db2a5369238b51f9811ad815b69dc086', +        'info_dict': { +            'id': 'NvqvPeNZsHU', +            'ext': 'mp4', +            'upload_date': '20150816', +            'uploader': 'ABC News (Australia)', +            'description': 'Government backbencher Warren Entsch introduces a cross-party sponsored bill to legalise same-sex marriage, saying the bill is designed to promote "an inclusive Australia, not a divided one.". Read more here: http://ab.co/1Mwc6ef', +            'uploader_id': 'NewsOnABC', +            'title': 'Marriage Equality: Warren Entsch introduces same sex marriage bill', +        }, +        'add_ie': ['Youtube'], +    }]      def _real_extract(self, url):          video_id = self._match_id(url)          webpage = self._download_webpage(url, video_id) -        urls_info_json = self._search_regex( -            r'inlineVideoData\.push\((.*?)\);', webpage, 'video urls', -            flags=re.DOTALL) -        urls_info = json.loads(urls_info_json.replace('\'', '"')) +        mobj = re.search( +            r'inline(?P<type>Video|YouTube)Data\.push\((?P<json_data>[^)]+)\);', +            webpage) +        if mobj is None: +            raise ExtractorError('Unable to extract video urls') + +        urls_info = self._parse_json( +            mobj.group('json_data'), video_id, transform_source=js_to_json) + +        if not isinstance(urls_info, list): +            urls_info = [urls_info] + +        if mobj.group('type') == 'YouTube': +            return self.playlist_result([ +                self.url_result(url_info['url']) for url_info in urls_info]) +          formats = [{              'url': url_info['url'], -            'width': int(url_info['width']), -            'height': int(url_info['height']), -            'tbr': int(url_info['bitrate']), -            'filesize': int(url_info['filesize']), +            'width': int_or_none(url_info.get('width')), +            'height': int_or_none(url_info.get('height')), +            'tbr': int_or_none(url_info.get('bitrate')), +            'filesize': int_or_none(url_info.get('filesize')),          } for url_info in urls_info]          self._sort_formats(formats) diff --git a/youtube_dl/extractor/cinemassacre.py b/youtube_dl/extractor/cinemassacre.py index c949a4814..fd1770dac 100644 --- a/youtube_dl/extractor/cinemassacre.py +++ b/youtube_dl/extractor/cinemassacre.py @@ -6,6 +6,7 @@ import re  from .common import InfoExtractor  from ..utils import ExtractorError  from .bliptv import BlipTVIE +from .screenwavemedia import ScreenwaveMediaIE  class CinemassacreIE(InfoExtractor): @@ -83,10 +84,10 @@ class CinemassacreIE(InfoExtractor):          playerdata_url = self._search_regex(              [ -                r'src="(http://(?:player2\.screenwavemedia\.com|player\.screenwavemedia\.com/play)/[a-zA-Z]+\.php\?[^"]*\bid=.+?)"', -                r'<iframe[^>]+src="((?:https?:)?//(?:[^.]+\.)?youtube\.com/.+?)"', +                ScreenwaveMediaIE.EMBED_PATTERN, +                r'<iframe[^>]+src="(?P<url>(?:https?:)?//(?:[^.]+\.)?youtube\.com/.+?)"',              ], -            webpage, 'player data URL', default=None) +            webpage, 'player data URL', default=None, group='url')          if not playerdata_url:              playerdata_url = BlipTVIE._extract_url(webpage)          if not playerdata_url: diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index ce2030d28..39cef9c5b 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -510,6 +510,12 @@ class InfoExtractor(object):          """Report attempt to log in."""          self.to_screen('Logging in') +    @staticmethod +    def raise_login_required(msg='This video is only available for registered users'): +        raise ExtractorError( +            '%s. Use --username and --password or --netrc to provide account credentials.' % msg, +            expected=True) +      # Methods for following #608      @staticmethod      def url_result(url, ie=None, video_id=None, video_title=None): @@ -1151,7 +1157,7 @@ class InfoExtractor(object):                  }                  if type_ in SUBTITLES_TYPES:                      ext = SUBTITLES_TYPES[type_] -            lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or subtitles_lang +            lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang              subtitles.setdefault(lang, []).append({                  'url': src,                  'ext': ext, @@ -1279,6 +1285,23 @@ class InfoExtractor(object):      def _get_subtitles(self, *args, **kwargs):          raise NotImplementedError("This method must be implemented by subclasses") +    @staticmethod +    def _merge_subtitle_items(subtitle_list1, subtitle_list2): +        """ Merge subtitle items for one language. Items with duplicated URLs +        will be dropped. """ +        list1_urls = set([item['url'] for item in subtitle_list1]) +        ret = list(subtitle_list1) +        ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls]) +        return ret + +    @classmethod +    def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2): +        """ Merge two subtitle dictionaries, language by language. """ +        ret = dict(subtitle_dict1) +        for lang in subtitle_dict2: +            ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang]) +        return ret +      def extract_automatic_captions(self, *args, **kwargs):          if (self._downloader.params.get('writeautomaticsub', False) or                  self._downloader.params.get('listsubtitles')): diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 33a033a7f..ce123482e 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -20,9 +20,11 @@ from ..utils import (      ExtractorError,      bytes_to_intlist,      intlist_to_bytes, +    int_or_none,      remove_end,      unified_strdate,      urlencode_postdata, +    xpath_text,  )  from ..aes import (      aes_cbc_decrypt, @@ -237,7 +239,9 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text              webpage_url = 'http://www.' + mobj.group('url')          webpage = self._download_webpage(webpage_url, video_id, 'Downloading webpage') -        note_m = self._html_search_regex(r'<div class="showmedia-trailer-notice">(.+?)</div>', webpage, 'trailer-notice', default='') +        note_m = self._html_search_regex( +            r'<div class="showmedia-trailer-notice">(.+?)</div>', +            webpage, 'trailer-notice', default='')          if note_m:              raise ExtractorError(note_m) @@ -247,6 +251,9 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text              if msg.get('type') == 'error':                  raise ExtractorError('crunchyroll returned error: %s' % msg['message_body'], expected=True) +        if 'To view this, please log in to verify you are 18 or older.' in webpage: +            self.raise_login_required() +          video_title = self._html_search_regex(r'<h1[^>]*>(.+?)</h1>', webpage, 'video_title', flags=re.DOTALL)          video_title = re.sub(r' {2,}', ' ', video_title)          video_description = self._html_search_regex(r'"description":"([^"]+)', webpage, 'video_description', default='') @@ -281,6 +288,13 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text              stream_info = streamdata.find('./{default}preload/stream_info')              video_url = stream_info.find('./host').text              video_play_path = stream_info.find('./file').text +            metadata = stream_info.find('./metadata') +            format_info = { +                'format': video_format, +                'format_id': video_format, +                'height': int_or_none(xpath_text(metadata, './height')), +                'width': int_or_none(xpath_text(metadata, './width')), +            }              if '.fplive.net/' in video_url:                  video_url = re.sub(r'^rtmpe?://', 'http://', video_url.strip()) @@ -289,19 +303,18 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text                      netloc='v.lvlt.crcdn.net',                      path='%s/%s' % (remove_end(parsed_video_url.path, '/'), video_play_path.split(':')[-1])))                  if self._is_valid_url(direct_video_url, video_id, video_format): -                    formats.append({ +                    format_info.update({                          'url': direct_video_url, -                        'format_id': video_format,                      }) +                    formats.append(format_info)                      continue -            formats.append({ +            format_info.update({                  'url': video_url,                  'play_path': video_play_path,                  'ext': 'flv', -                'format': video_format, -                'format_id': video_format,              }) +            formats.append(format_info)          subtitles = self.extract_subtitles(video_id, webpage) diff --git a/youtube_dl/extractor/dumpert.py b/youtube_dl/extractor/dumpert.py index 999fb5620..1f00386fe 100644 --- a/youtube_dl/extractor/dumpert.py +++ b/youtube_dl/extractor/dumpert.py @@ -9,8 +9,8 @@ from ..utils import qualities  class DumpertIE(InfoExtractor): -    _VALID_URL = r'https?://(?:www\.)?dumpert\.nl/mediabase/(?P<id>[0-9]+/[0-9a-zA-Z]+)' -    _TEST = { +    _VALID_URL = r'https?://(?:www\.)?dumpert\.nl/(?:mediabase|embed)/(?P<id>[0-9]+/[0-9a-zA-Z]+)' +    _TESTS = [{          'url': 'http://www.dumpert.nl/mediabase/6646981/951bc60f/',          'md5': '1b9318d7d5054e7dcb9dc7654f21d643',          'info_dict': { @@ -20,11 +20,15 @@ class DumpertIE(InfoExtractor):              'description': 'Niet schrikken hoor',              'thumbnail': 're:^https?://.*\.jpg$',          } -    } +    }, { +        'url': 'http://www.dumpert.nl/embed/6675421/dc440fe7/', +        'only_matching': True, +    }]      def _real_extract(self, url):          video_id = self._match_id(url) +        url = 'https://www.dumpert.nl/mediabase/' + video_id          req = compat_urllib_request.Request(url)          req.add_header('Cookie', 'nsfw=1; cpc=10')          webpage = self._download_webpage(req, video_id) diff --git a/youtube_dl/extractor/eroprofile.py b/youtube_dl/extractor/eroprofile.py index 316033cf1..7fcd0151d 100644 --- a/youtube_dl/extractor/eroprofile.py +++ b/youtube_dl/extractor/eroprofile.py @@ -71,8 +71,7 @@ class EroProfileIE(InfoExtractor):          m = re.search(r'You must be logged in to view this video\.', webpage)          if m: -            raise ExtractorError( -                'This video requires login. Please specify a username and password and try again.', expected=True) +            self.raise_login_required('This video requires login')          video_id = self._search_regex(              [r"glbUpdViews\s*\('\d*','(\d+)'", r'p/report/video/(\d+)'], diff --git a/youtube_dl/extractor/fc2.py b/youtube_dl/extractor/fc2.py index e4f7195a8..5c1137e94 100644 --- a/youtube_dl/extractor/fc2.py +++ b/youtube_dl/extractor/fc2.py @@ -15,7 +15,7 @@ from ..utils import (  class FC2IE(InfoExtractor): -    _VALID_URL = r'^http://video\.fc2\.com/(?:[^/]+/)?content/(?P<id>[^/]+)' +    _VALID_URL = r'^http://video\.fc2\.com/(?:[^/]+/)*content/(?P<id>[^/]+)'      IE_NAME = 'fc2'      _NETRC_MACHINE = 'fc2'      _TESTS = [{ @@ -37,6 +37,9 @@ class FC2IE(InfoExtractor):              'password': '(snip)',              'skip': 'requires actual password'          } +    }, { +        'url': 'http://video.fc2.com/en/a/content/20130926eZpARwsF', +        'only_matching': True,      }]      def _login(self): @@ -80,7 +83,7 @@ class FC2IE(InfoExtractor):          title = self._og_search_title(webpage)          thumbnail = self._og_search_thumbnail(webpage) -        refer = url.replace('/content/', '/a/content/') +        refer = url.replace('/content/', '/a/content/') if '/a/content/' not in url else url          mimi = hashlib.md5((video_id + '_gGddgPfeaf_gzyr').encode('utf-8')).hexdigest() diff --git a/youtube_dl/extractor/folketinget.py b/youtube_dl/extractor/folketinget.py index 0fb29de75..75399fa7d 100644 --- a/youtube_dl/extractor/folketinget.py +++ b/youtube_dl/extractor/folketinget.py @@ -30,6 +30,10 @@ class FolketingetIE(InfoExtractor):              'upload_date': '20141120',              'duration': 3960,          }, +        'params': { +            # rtmp download +            'skip_download': True, +        },      }      def _real_extract(self, url): diff --git a/youtube_dl/extractor/foxnews.py b/youtube_dl/extractor/foxnews.py index 917f76b1e..3a4a59135 100644 --- a/youtube_dl/extractor/foxnews.py +++ b/youtube_dl/extractor/foxnews.py @@ -1,5 +1,7 @@  from __future__ import unicode_literals +import re +  from .common import InfoExtractor  from ..utils import (      parse_iso8601, @@ -8,7 +10,8 @@ from ..utils import (  class FoxNewsIE(InfoExtractor): -    _VALID_URL = r'https?://video\.foxnews\.com/v/(?:video-embed\.html\?video_id=)?(?P<id>\d+)' +    IE_DESC = 'Fox News and Fox Business Video' +    _VALID_URL = r'https?://(?P<host>video\.fox(?:news|business)\.com)/v/(?:video-embed\.html\?video_id=)?(?P<id>\d+)'      _TESTS = [          {              'url': 'http://video.foxnews.com/v/3937480/frozen-in-time/#sp=show-clips', @@ -42,13 +45,19 @@ class FoxNewsIE(InfoExtractor):              'url': 'http://video.foxnews.com/v/video-embed.html?video_id=3937480&d=video.foxnews.com',              'only_matching': True,          }, +        { +            'url': 'http://video.foxbusiness.com/v/4442309889001', +            'only_matching': True, +        },      ]      def _real_extract(self, url): -        video_id = self._match_id(url) +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('id') +        host = mobj.group('host')          video = self._download_json( -            'http://video.foxnews.com/v/feed/video/%s.js?template=fox' % video_id, video_id) +            'http://%s/v/feed/video/%s.js?template=fox' % (host, video_id), video_id)          item = video['channel']['item']          title = item['title'] diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index 75723c00d..129984a5f 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -78,9 +78,14 @@ class FranceTVBaseInfoExtractor(InfoExtractor):                  })          self._sort_formats(formats) +        title = info['titre'] +        subtitle = info.get('sous_titre') +        if subtitle: +            title += ' - %s' % subtitle +          return {              'id': video_id, -            'title': info['titre'], +            'title': title,              'description': clean_html(info['synopsis']),              'thumbnail': compat_urlparse.urljoin('http://pluzz.francetv.fr', info['image']),              'duration': int_or_none(info.get('real_duration')) or parse_duration(info['duree']), @@ -214,15 +219,15 @@ class FranceTVIE(FranceTVBaseInfoExtractor):          },          # france5          { -            'url': 'http://www.france5.fr/emissions/c-a-dire/videos/92837968', -            'md5': '78f0f4064f9074438e660785bbf2c5d9', +            'url': 'http://www.france5.fr/emissions/c-a-dire/videos/quels_sont_les_enjeux_de_cette_rentree_politique__31-08-2015_908948?onglet=tous&page=1', +            'md5': 'f6c577df3806e26471b3d21631241fd0',              'info_dict': { -                'id': '108961659', +                'id': '123327454',                  'ext': 'flv', -                'title': 'C à dire ?!', -                'description': 'md5:1a4aeab476eb657bf57c4ff122129f81', -                'upload_date': '20140915', -                'timestamp': 1410795000, +                'title': 'C à dire ?! - Quels sont les enjeux de cette rentrée politique ?', +                'description': 'md5:4a0d5cb5dce89d353522a84462bae5a4', +                'upload_date': '20150831', +                'timestamp': 1441035120,              },          },          # franceo diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 376feecae..953ec32c3 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -48,6 +48,7 @@ from .vimeo import VimeoIE  from .dailymotion import DailymotionCloudIE  from .onionstudios import OnionStudiosIE  from .snagfilms import SnagFilmsEmbedIE +from .screenwavemedia import ScreenwaveMediaIE  class GenericIE(InfoExtractor): @@ -1001,6 +1002,16 @@ class GenericIE(InfoExtractor):                  'description': 'New experience with Acrobat DC',                  'duration': 248.667,              }, +        }, +        # ScreenwaveMedia embed +        { +            'url': 'http://www.thecinemasnob.com/the-cinema-snob/a-nightmare-on-elm-street-2-freddys-revenge1', +            'md5': '24ace5baba0d35d55c6810b51f34e9e0', +            'info_dict': { +                'id': 'cinemasnob-55d26273809dd', +                'ext': 'mp4', +                'title': 'cinemasnob', +            },          }      ] @@ -1718,6 +1729,11 @@ class GenericIE(InfoExtractor):          if snagfilms_url:              return self.url_result(snagfilms_url) +        # Look for ScreenwaveMedia embeds +        mobj = re.search(ScreenwaveMediaIE.EMBED_PATTERN, webpage) +        if mobj is not None: +            return self.url_result(unescapeHTML(mobj.group('url')), 'ScreenwaveMedia') +          # Look for AdobeTVVideo embeds          mobj = re.search(              r'<iframe[^>]+src=[\'"]((?:https?:)?//video\.tv\.adobe\.com/v/\d+[^"]+)[\'"]', diff --git a/youtube_dl/extractor/globo.py b/youtube_dl/extractor/globo.py index 8a95793ca..33d6432a6 100644 --- a/youtube_dl/extractor/globo.py +++ b/youtube_dl/extractor/globo.py @@ -13,6 +13,7 @@ from ..compat import (  from ..utils import (      ExtractorError,      float_or_none, +    int_or_none,  ) @@ -359,13 +360,8 @@ class GloboIE(InfoExtractor):              self._API_URL_TEMPLATE % video_id, video_id)['videos'][0]          title = video['title'] -        duration = float_or_none(video['duration'], 1000) -        like_count = video['likes'] -        uploader = video['channel'] -        uploader_id = video['channel_id']          formats = [] -          for resource in video['resources']:              resource_id = resource.get('_id')              if not resource_id: @@ -407,6 +403,11 @@ class GloboIE(InfoExtractor):          self._sort_formats(formats) +        duration = float_or_none(video.get('duration'), 1000) +        like_count = int_or_none(video.get('likes')) +        uploader = video.get('channel') +        uploader_id = video.get('channel_id') +          return {              'id': video_id,              'title': title, diff --git a/youtube_dl/extractor/imgur.py b/youtube_dl/extractor/imgur.py index d692ea79a..70c8ca64e 100644 --- a/youtube_dl/extractor/imgur.py +++ b/youtube_dl/extractor/imgur.py @@ -13,7 +13,7 @@ from ..utils import (  class ImgurIE(InfoExtractor): -    _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?P<id>[a-zA-Z0-9]+)' +    _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?!gallery)(?P<id>[a-zA-Z0-9]+)'      _TESTS = [{          'url': 'https://i.imgur.com/A61SaA1.gifv', @@ -97,3 +97,28 @@ class ImgurIE(InfoExtractor):              'description': self._og_search_description(webpage),              'title': self._og_search_title(webpage),          } + + +class ImgurAlbumIE(InfoExtractor): +    _VALID_URL = r'https?://(?:i\.)?imgur\.com/gallery/(?P<id>[a-zA-Z0-9]+)' + +    _TEST = { +        'url': 'http://imgur.com/gallery/Q95ko', +        'info_dict': { +            'id': 'Q95ko', +        }, +        'playlist_count': 25, +    } + +    def _real_extract(self, url): +        album_id = self._match_id(url) + +        album_images = self._download_json( +            'http://imgur.com/gallery/%s/album_images/hit.json?all=true' % album_id, +            album_id)['data']['images'] + +        entries = [ +            self.url_result('http://imgur.com/%s' % image['hash']) +            for image in album_images if image.get('hash')] + +        return self.playlist_result(entries, album_id) diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py index d28730492..3dca0e566 100644 --- a/youtube_dl/extractor/kaltura.py +++ b/youtube_dl/extractor/kaltura.py @@ -13,12 +13,24 @@ from ..utils import (  class KalturaIE(InfoExtractor):      _VALID_URL = r'''(?x) -    (?:kaltura:| -       https?://(:?(?:www|cdnapisec)\.)?kaltura\.com/index\.php/kwidget/(?:[^/]+/)*?wid/_ -    )(?P<partner_id>\d+) -    (?::| -       /(?:[^/]+/)*?entry_id/ -    )(?P<id>[0-9a-z_]+)''' +                (?: +                    kaltura:(?P<partner_id_s>\d+):(?P<id_s>[0-9a-z_]+)| +                    https?:// +                        (:?(?:www|cdnapisec)\.)?kaltura\.com/ +                        (?: +                            (?: +                                # flash player +                                index\.php/kwidget/ +                                (?:[^/]+/)*?wid/_(?P<partner_id>\d+)/ +                                (?:[^/]+/)*?entry_id/(?P<id>[0-9a-z_]+)| +                                # html5 player +                                html5/html5lib/ +                                (?:[^/]+/)*?entry_id/(?P<id_html5>[0-9a-z_]+) +                                .*\?.*\bwid=_(?P<partner_id_html5>\d+) +                            ) +                        ) +                ) +                '''      _API_BASE = 'http://cdnapi.kaltura.com/api_v3/index.php?'      _TESTS = [          { @@ -43,6 +55,10 @@ class KalturaIE(InfoExtractor):              'url': 'https://cdnapisec.kaltura.com/index.php/kwidget/wid/_557781/uiconf_id/22845202/entry_id/1_plr1syf3',              'only_matching': True,          }, +        { +            'url': 'https://cdnapisec.kaltura.com/html5/html5lib/v2.30.2/mwEmbedFrame.php/p/1337/uiconf_id/20540612/entry_id/1_sf5ovm7u?wid=_243342', +            'only_matching': True, +        }      ]      def _kaltura_api_call(self, video_id, actions, *args, **kwargs): @@ -105,9 +121,9 @@ class KalturaIE(InfoExtractor):              video_id, actions, note='Downloading video info JSON')      def _real_extract(self, url): -        video_id = self._match_id(url)          mobj = re.match(self._VALID_URL, url) -        partner_id, entry_id = mobj.group('partner_id'), mobj.group('id') +        partner_id = mobj.group('partner_id_s') or mobj.group('partner_id') or mobj.group('partner_id_html5') +        entry_id = mobj.group('id_s') or mobj.group('id') or mobj.group('id_html5')          info, source_data = self._get_video_info(entry_id, partner_id) @@ -126,7 +142,7 @@ class KalturaIE(InfoExtractor):          self._sort_formats(formats)          return { -            'id': video_id, +            'id': entry_id,              'title': info['name'],              'formats': formats,              'description': info.get('description'), diff --git a/youtube_dl/extractor/kontrtube.py b/youtube_dl/extractor/kontrtube.py index 720bc939b..a59c529f4 100644 --- a/youtube_dl/extractor/kontrtube.py +++ b/youtube_dl/extractor/kontrtube.py @@ -4,7 +4,10 @@ from __future__ import unicode_literals  import re  from .common import InfoExtractor -from ..utils import int_or_none +from ..utils import ( +    int_or_none, +    parse_duration, +)  class KontrTubeIE(InfoExtractor): @@ -34,33 +37,28 @@ class KontrTubeIE(InfoExtractor):          webpage = self._download_webpage(              url, display_id, 'Downloading page') -        video_url = self._html_search_regex( +        video_url = self._search_regex(              r"video_url\s*:\s*'(.+?)/?',", webpage, 'video URL') -        thumbnail = self._html_search_regex( -            r"preview_url\s*:\s*'(.+?)/?',", webpage, 'video thumbnail', fatal=False) +        thumbnail = self._search_regex( +            r"preview_url\s*:\s*'(.+?)/?',", webpage, 'thumbnail', fatal=False)          title = self._html_search_regex( -            r'<title>(.+?)</title>', webpage, 'video title') +            r'(?s)<h2>(.+?)</h2>', webpage, 'title')          description = self._html_search_meta( -            'description', webpage, 'video description') +            'description', webpage, 'description') -        mobj = re.search( -            r'<div class="col_2">Длительность: <span>(?P<minutes>\d+)м:(?P<seconds>\d+)с</span></div>', -            webpage) -        duration = int(mobj.group('minutes')) * 60 + int(mobj.group('seconds')) if mobj else None +        duration = self._search_regex( +            r'Длительность: <em>([^<]+)</em>', webpage, 'duration', fatal=False) +        if duration: +            duration = parse_duration(duration.replace('мин', 'min').replace('сек', 'sec')) -        view_count = self._html_search_regex( -            r'<div class="col_2">Просмотров: <span>(\d+)</span></div>', +        view_count = self._search_regex( +            r'Просмотров: <em>([^<]+)</em>',              webpage, 'view count', fatal=False) +        if view_count: +            view_count = int_or_none(view_count.replace(' ', '')) -        comment_count = None -        comment_str = self._html_search_regex( -            r'Комментарии: <span>([^<]+)</span>', webpage, 'comment count', fatal=False) -        if comment_str.startswith('комментариев нет'): -            comment_count = 0 -        else: -            mobj = re.search(r'\d+ из (?P<total>\d+) комментариев', comment_str) -            if mobj: -                comment_count = mobj.group('total') +        comment_count = int_or_none(self._search_regex( +            r'Комментарии \((\d+)\)<', webpage, ' comment count', fatal=False))          return {              'id': video_id, diff --git a/youtube_dl/extractor/krasview.py b/youtube_dl/extractor/krasview.py index 96f95979a..0ae8ebd68 100644 --- a/youtube_dl/extractor/krasview.py +++ b/youtube_dl/extractor/krasview.py @@ -25,6 +25,9 @@ class KrasViewIE(InfoExtractor):              'duration': 27,              'thumbnail': 're:^https?://.*\.jpg',          }, +        'params': { +            'skip_download': 'Not accessible from Travis CI server', +        },      }      def _real_extract(self, url): diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py index 5b9157ed4..378117270 100644 --- a/youtube_dl/extractor/lynda.py +++ b/youtube_dl/extractor/lynda.py @@ -118,9 +118,7 @@ class LyndaIE(LyndaBaseIE):                  'lynda returned error: %s' % video_json['Message'], expected=True)          if video_json['HasAccess'] is False: -            raise ExtractorError( -                'Video %s is only available for members. ' -                % video_id + self._ACCOUNT_CREDENTIALS_HINT, expected=True) +            self.raise_login_required('Video %s is only available for members' % video_id)          video_id = compat_str(video_json['ID'])          duration = video_json['DurationInSeconds'] diff --git a/youtube_dl/extractor/mailru.py b/youtube_dl/extractor/mailru.py index 54a14cb94..ab1300185 100644 --- a/youtube_dl/extractor/mailru.py +++ b/youtube_dl/extractor/mailru.py @@ -25,6 +25,7 @@ class MailRuIE(InfoExtractor):                  'uploader_id': 'sonypicturesrus@mail.ru',                  'duration': 184,              }, +            'skip': 'Not accessible from Travis CI server',          },          {              'url': 'http://my.mail.ru/corp/hitech/video/news_hi-tech_mail_ru/1263.html', @@ -39,6 +40,7 @@ class MailRuIE(InfoExtractor):                  'uploader_id': 'hitech@corp.mail.ru',                  'duration': 245,              }, +            'skip': 'Not accessible from Travis CI server',          },      ] diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index b48fac5e3..a597714e9 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -67,7 +67,7 @@ class MTVServicesInfoExtractor(InfoExtractor):          return [{'url': url, 'ext': 'mp4'}]      def _extract_video_formats(self, mdoc, mtvn_id): -        if re.match(r'.*/(error_country_block\.swf|geoblock\.mp4)$', mdoc.find('.//src').text) is not None: +        if re.match(r'.*/(error_country_block\.swf|geoblock\.mp4|copyright_error\.flv(?:\?geo\b.+?)?)$', mdoc.find('.//src').text) is not None:              if mtvn_id is not None and self._MOBILE_TEMPLATE is not None:                  self.to_screen('The normal version is not available from your '                                 'country, trying with the mobile version') @@ -114,7 +114,8 @@ class MTVServicesInfoExtractor(InfoExtractor):          # Remove the templates, like &device={device}          mediagen_url = re.sub(r'&[^=]*?={.*?}(?=(&|$))', '', mediagen_url)          if 'acceptMethods' not in mediagen_url: -            mediagen_url += '&acceptMethods=fms' +            mediagen_url += '&' if '?' in mediagen_url else '?' +            mediagen_url += 'acceptMethods=fms'          mediagen_doc = self._download_xml(mediagen_url, video_id,                                            'Downloading video urls') @@ -141,7 +142,7 @@ class MTVServicesInfoExtractor(InfoExtractor):          if title_el is None:              title_el = itemdoc.find('.//{http://search.yahoo.com/mrss/}title')          if title_el is None: -            title_el = itemdoc.find('.//title') +            title_el = itemdoc.find('.//title') or itemdoc.find('./title')              if title_el.text is None:                  title_el = None @@ -174,8 +175,11 @@ class MTVServicesInfoExtractor(InfoExtractor):          if self._LANG:              info_url += 'lang=%s&' % self._LANG          info_url += data +        return self._get_videos_info_from_url(info_url, video_id) + +    def _get_videos_info_from_url(self, url, video_id):          idoc = self._download_xml( -            info_url, video_id, +            url, video_id,              'Downloading info', transform_source=fix_xml_ampersands)          return self.playlist_result(              [self._get_video_info(item) for item in idoc.findall('.//item')]) @@ -288,3 +292,65 @@ class MTVIggyIE(MTVServicesInfoExtractor):          }      }      _FEED_URL = 'http://all.mtvworldverticals.com/feed-xml/' + + +class MTVDEIE(MTVServicesInfoExtractor): +    IE_NAME = 'mtv.de' +    _VALID_URL = r'https?://(?:www\.)?mtv\.de/(?:artists|shows|news)/(?:[^/]+/)*(?P<id>\d+)-[^/#?]+/*(?:[#?].*)?$' +    _TESTS = [{ +        'url': 'http://www.mtv.de/artists/10571-cro/videos/61131-traum', +        'info_dict': { +            'id': 'music_video-a50bc5f0b3aa4b3190aa', +            'ext': 'mp4', +            'title': 'MusicVideo_cro-traum', +            'description': 'Cro - Traum', +        }, +        'params': { +            # rtmp download +            'skip_download': True, +        }, +    }, { +        # mediagen URL without query (e.g. http://videos.mtvnn.com/mediagen/e865da714c166d18d6f80893195fcb97) +        'url': 'http://www.mtv.de/shows/933-teen-mom-2/staffeln/5353/folgen/63565-enthullungen', +        'info_dict': { +            'id': 'local_playlist-f5ae778b9832cc837189', +            'ext': 'mp4', +            'title': 'Episode_teen-mom-2_shows_season-5_episode-1_full-episode_part1', +        }, +        'params': { +            # rtmp download +            'skip_download': True, +        }, +    }, { +        # single video in pagePlaylist with different id +        'url': 'http://www.mtv.de/news/77491-mtv-movies-spotlight-pixels-teil-3', +        'info_dict': { +            'id': 'local_playlist-4e760566473c4c8c5344', +            'ext': 'mp4', +            'title': 'Article_mtv-movies-spotlight-pixels-teil-3_short-clips_part1', +            'description': 'MTV Movies Supercut', +        }, +        'params': { +            # rtmp download +            'skip_download': True, +        }, +    }] + +    def _real_extract(self, url): +        video_id = self._match_id(url) + +        webpage = self._download_webpage(url, video_id) + +        playlist = self._parse_json( +            self._search_regex( +                r'window\.pagePlaylist\s*=\s*(\[.+?\]);\n', webpage, 'page playlist'), +            video_id) + +        # news pages contain single video in playlist with different id +        if len(playlist) == 1: +            return self._get_videos_info_from_url(playlist[0]['mrss'], video_id) + +        for item in playlist: +            item_id = item.get('id') +            if item_id and compat_str(item_id) == video_id: +                return self._get_videos_info_from_url(item['mrss'], video_id) diff --git a/youtube_dl/extractor/nowtv.py b/youtube_dl/extractor/nowtv.py index 66c627bec..c8257719f 100644 --- a/youtube_dl/extractor/nowtv.py +++ b/youtube_dl/extractor/nowtv.py @@ -130,10 +130,16 @@ class NowTVIE(InfoExtractor):      }, {          'url': 'http://www.nowtv.at/rtl/bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit/preview?return=/rtl/bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit',          'only_matching': True, +    }, { +        'url': 'http://www.nowtv.de/rtl2/echtzeit/list/aktuell/schnelles-geld-am-ende-der-welt/player', +        'only_matching': True,      }]      def _real_extract(self, url):          display_id = self._match_id(url) +        display_id_split = display_id.split('/') +        if len(display_id) > 2: +            display_id = '/'.join((display_id_split[0], display_id_split[-1]))          info = self._download_json(              'https://api.nowtv.de/v3/movies/%s?fields=id,title,free,geoblocked,articleLong,articleShort,broadcastStartDate,seoUrl,duration,format,files' % display_id, diff --git a/youtube_dl/extractor/nowvideo.py b/youtube_dl/extractor/nowvideo.py index dec09cdfe..17baa9679 100644 --- a/youtube_dl/extractor/nowvideo.py +++ b/youtube_dl/extractor/nowvideo.py @@ -7,7 +7,7 @@ class NowVideoIE(NovaMovIE):      IE_NAME = 'nowvideo'      IE_DESC = 'NowVideo' -    _VALID_URL = NovaMovIE._VALID_URL_TEMPLATE % {'host': 'nowvideo\.(?:ch|sx|eu|at|ag|co|li)'} +    _VALID_URL = NovaMovIE._VALID_URL_TEMPLATE % {'host': 'nowvideo\.(?:ch|ec|sx|eu|at|ag|co|li)'}      _HOST = 'www.nowvideo.ch' diff --git a/youtube_dl/extractor/odnoklassniki.py b/youtube_dl/extractor/odnoklassniki.py index 003d27de7..66520c2c5 100644 --- a/youtube_dl/extractor/odnoklassniki.py +++ b/youtube_dl/extractor/odnoklassniki.py @@ -12,7 +12,7 @@ from ..utils import (  class OdnoklassnikiIE(InfoExtractor): -    _VALID_URL = r'https?://(?:odnoklassniki|ok)\.ru/(?:video|web-api/video/moviePlayer)/(?P<id>[\d-]+)' +    _VALID_URL = r'https?://(?:www\.)?(?:odnoklassniki|ok)\.ru/(?:video|web-api/video/moviePlayer)/(?P<id>[\d-]+)'      _TESTS = [{          # metadata in JSON          'url': 'http://ok.ru/video/20079905452', @@ -44,8 +44,26 @@ class OdnoklassnikiIE(InfoExtractor):              'age_limit': 0,          },      }, { +        # YouTube embed (metadataUrl, provider == USER_YOUTUBE) +        'url': 'http://ok.ru/video/64211978996595-1', +        'md5': '5d7475d428845cd2e13bae6f1a992278', +        'info_dict': { +            'id': '64211978996595-1', +            'ext': 'mp4', +            'title': 'Космическая среда от 26 августа 2015', +            'description': 'md5:848eb8b85e5e3471a3a803dae1343ed0', +            'duration': 440, +            'upload_date': '20150826', +            'uploader_id': '750099571', +            'uploader': 'Алина П', +            'age_limit': 0, +        }, +    }, {          'url': 'http://ok.ru/web-api/video/moviePlayer/20079905452',          'only_matching': True, +    }, { +        'url': 'http://www.ok.ru/video/20648036891', +        'only_matching': True,      }]      def _real_extract(self, url): @@ -56,7 +74,8 @@ class OdnoklassnikiIE(InfoExtractor):          player = self._parse_json(              unescapeHTML(self._search_regex( -                r'data-attributes="([^"]+)"', webpage, 'player')), +                r'data-options=(?P<quote>["\'])(?P<player>{.+?%s.+?})(?P=quote)' % video_id, +                webpage, 'player', group='player')),              video_id)          flashvars = player['flashvars'] @@ -89,16 +108,7 @@ class OdnoklassnikiIE(InfoExtractor):          like_count = int_or_none(metadata.get('likeCount')) -        quality = qualities(('mobile', 'lowest', 'low', 'sd', 'hd')) - -        formats = [{ -            'url': f['url'], -            'ext': 'mp4', -            'format_id': f['name'], -            'quality': quality(f['name']), -        } for f in metadata['videos']] - -        return { +        info = {              'id': video_id,              'title': title,              'thumbnail': thumbnail, @@ -108,5 +118,24 @@ class OdnoklassnikiIE(InfoExtractor):              'uploader_id': uploader_id,              'like_count': like_count,              'age_limit': age_limit, -            'formats': formats,          } + +        if metadata.get('provider') == 'USER_YOUTUBE': +            info.update({ +                '_type': 'url_transparent', +                'url': movie['contentId'], +            }) +            return info + +        quality = qualities(('mobile', 'lowest', 'low', 'sd', 'hd')) + +        formats = [{ +            'url': f['url'], +            'ext': 'mp4', +            'format_id': f['name'], +            'quality': quality(f['name']), +        } for f in metadata['videos']] +        self._sort_formats(formats) + +        info['formats'] = formats +        return info diff --git a/youtube_dl/extractor/pluralsight.py b/youtube_dl/extractor/pluralsight.py new file mode 100644 index 000000000..fd32836cc --- /dev/null +++ b/youtube_dl/extractor/pluralsight.py @@ -0,0 +1,207 @@ +from __future__ import unicode_literals + +import re +import json + +from .common import InfoExtractor +from ..compat import ( +    compat_str, +    compat_urllib_parse, +    compat_urllib_request, +    compat_urlparse, +) +from ..utils import ( +    ExtractorError, +    int_or_none, +    parse_duration, +) + + +class PluralsightIE(InfoExtractor): +    IE_NAME = 'pluralsight' +    _VALID_URL = r'https?://(?:www\.)?pluralsight\.com/training/player\?author=(?P<author>[^&]+)&name=(?P<name>[^&]+)(?:&mode=live)?&clip=(?P<clip>\d+)&course=(?P<course>[^&]+)' +    _LOGIN_URL = 'https://www.pluralsight.com/id/' +    _NETRC_MACHINE = 'pluralsight' + +    _TEST = { +        'url': 'http://www.pluralsight.com/training/player?author=mike-mckeown&name=hosting-sql-server-windows-azure-iaas-m7-mgmt&mode=live&clip=3&course=hosting-sql-server-windows-azure-iaas', +        'md5': '4d458cf5cf4c593788672419a8dd4cf8', +        'info_dict': { +            'id': 'hosting-sql-server-windows-azure-iaas-m7-mgmt-04', +            'ext': 'mp4', +            'title': 'Management of SQL Server - Demo Monitoring', +            'duration': 338, +        }, +        'skip': 'Requires pluralsight account credentials', +    } + +    def _real_initialize(self): +        self._login() + +    def _login(self): +        (username, password) = self._get_login_info() +        if username is None: +            self.raise_login_required('Pluralsight account is required') + +        login_page = self._download_webpage( +            self._LOGIN_URL, None, 'Downloading login page') + +        login_form = self._hidden_inputs(login_page) + +        login_form.update({ +            'Username': username.encode('utf-8'), +            'Password': password.encode('utf-8'), +        }) + +        post_url = self._search_regex( +            r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page, +            'post url', default=self._LOGIN_URL, group='url') + +        if not post_url.startswith('http'): +            post_url = compat_urlparse.urljoin(self._LOGIN_URL, post_url) + +        request = compat_urllib_request.Request( +            post_url, compat_urllib_parse.urlencode(login_form).encode('utf-8')) +        request.add_header('Content-Type', 'application/x-www-form-urlencoded') + +        response = self._download_webpage( +            request, None, 'Logging in as %s' % username) + +        error = self._search_regex( +            r'<span[^>]+class="field-validation-error"[^>]*>([^<]+)</span>', +            response, 'error message', default=None) +        if error: +            raise ExtractorError('Unable to login: %s' % error, expected=True) + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        author = mobj.group('author') +        name = mobj.group('name') +        clip_id = mobj.group('clip') +        course = mobj.group('course') + +        display_id = '%s-%s' % (name, clip_id) + +        webpage = self._download_webpage(url, display_id) + +        collection = self._parse_json( +            self._search_regex( +                r'moduleCollection\s*:\s*new\s+ModuleCollection\((\[.+?\])\s*,\s*\$rootScope\)', +                webpage, 'modules'), +            display_id) + +        module, clip = None, None + +        for module_ in collection: +            if module_.get('moduleName') == name: +                module = module_ +                for clip_ in module_.get('clips', []): +                    clip_index = clip_.get('clipIndex') +                    if clip_index is None: +                        continue +                    if compat_str(clip_index) == clip_id: +                        clip = clip_ +                        break + +        if not clip: +            raise ExtractorError('Unable to resolve clip') + +        QUALITIES = { +            'low': {'width': 640, 'height': 480}, +            'medium': {'width': 848, 'height': 640}, +            'high': {'width': 1024, 'height': 768}, +        } + +        ALLOWED_QUALITIES = ( +            ('webm', ('high',)), +            ('mp4', ('low', 'medium', 'high',)), +        ) + +        formats = [] +        for ext, qualities in ALLOWED_QUALITIES: +            for quality in qualities: +                f = QUALITIES[quality].copy() +                clip_post = { +                    'a': author, +                    'cap': 'false', +                    'cn': clip_id, +                    'course': course, +                    'lc': 'en', +                    'm': name, +                    'mt': ext, +                    'q': '%dx%d' % (f['width'], f['height']), +                } +                request = compat_urllib_request.Request( +                    'http://www.pluralsight.com/training/Player/ViewClip', +                    json.dumps(clip_post).encode('utf-8')) +                request.add_header('Content-Type', 'application/json;charset=utf-8') +                format_id = '%s-%s' % (ext, quality) +                clip_url = self._download_webpage( +                    request, display_id, 'Downloading %s URL' % format_id, fatal=False) +                if not clip_url: +                    continue +                f.update({ +                    'url': clip_url, +                    'ext': ext, +                    'format_id': format_id, +                }) +                formats.append(f) +        self._sort_formats(formats) + +        # TODO: captions +        # http://www.pluralsight.com/training/Player/ViewClip + cap = true +        # or +        # http://www.pluralsight.com/training/Player/Captions +        # { a = author, cn = clip_id, lc = end, m = name } + +        return { +            'id': clip['clipName'], +            'title': '%s - %s' % (module['title'], clip['title']), +            'duration': int_or_none(clip.get('duration')) or parse_duration(clip.get('formattedDuration')), +            'creator': author, +            'formats': formats +        } + + +class PluralsightCourseIE(InfoExtractor): +    IE_NAME = 'pluralsight:course' +    _VALID_URL = r'https?://(?:www\.)?pluralsight\.com/courses/(?P<id>[^/]+)' +    _TEST = { +        # Free course from Pluralsight Starter Subscription for Microsoft TechNet +        # https://offers.pluralsight.com/technet?loc=zTS3z&prod=zOTprodz&tech=zOttechz&prog=zOTprogz&type=zSOz&media=zOTmediaz&country=zUSz +        'url': 'http://www.pluralsight.com/courses/hosting-sql-server-windows-azure-iaas', +        'info_dict': { +            'id': 'hosting-sql-server-windows-azure-iaas', +            'title': 'Hosting SQL Server in Microsoft Azure IaaS Fundamentals', +            'description': 'md5:61b37e60f21c4b2f91dc621a977d0986', +        }, +        'playlist_count': 31, +    } + +    def _real_extract(self, url): +        course_id = self._match_id(url) + +        # TODO: PSM cookie + +        course = self._download_json( +            'http://www.pluralsight.com/data/course/%s' % course_id, +            course_id, 'Downloading course JSON') + +        title = course['title'] +        description = course.get('description') or course.get('shortDescription') + +        course_data = self._download_json( +            'http://www.pluralsight.com/data/course/content/%s' % course_id, +            course_id, 'Downloading course data JSON') + +        entries = [] +        for module in course_data: +            for clip in module.get('clips', []): +                player_parameters = clip.get('playerParameters') +                if not player_parameters: +                    continue +                entries.append(self.url_result( +                    'http://www.pluralsight.com/training/player?%s' % player_parameters, +                    'Pluralsight')) + +        return self.playlist_result(entries, course_id, title, description) diff --git a/youtube_dl/extractor/rtl2.py b/youtube_dl/extractor/rtl2.py index 72cd80498..25f7faf76 100644 --- a/youtube_dl/extractor/rtl2.py +++ b/youtube_dl/extractor/rtl2.py @@ -1,6 +1,7 @@  # encoding: utf-8  from __future__ import unicode_literals +import re  from .common import InfoExtractor @@ -8,22 +9,28 @@ class RTL2IE(InfoExtractor):      _VALID_URL = r'http?://(?:www\.)?rtl2\.de/[^?#]*?/(?P<id>[^?#/]*?)(?:$|/(?:$|[?#]))'      _TESTS = [{          'url': 'http://www.rtl2.de/sendung/grip-das-motormagazin/folge/folge-203-0', -        'md5': 'bfcc179030535b08dc2b36b469b5adc7',          'info_dict': {              'id': 'folge-203-0',              'ext': 'f4v',              'title': 'GRIP sucht den Sommerkönig',              'description': 'Matthias, Det und Helge treten gegeneinander an.'          }, +        'params': { +            # rtmp download +            'skip_download': True, +        },      }, {          'url': 'http://www.rtl2.de/sendung/koeln-50667/video/5512-anna/21040-anna-erwischt-alex/', -        'md5': 'ffcd517d2805b57ce11a58a2980c2b02',          'info_dict': {              'id': '21040-anna-erwischt-alex',              'ext': 'mp4',              'title': 'Anna erwischt Alex!',              'description': 'Anna ist Alex\' Tochter bei Köln 50667.'          }, +        'params': { +            # rtmp download +            'skip_download': True, +        },      }]      def _real_extract(self, url): @@ -34,12 +41,18 @@ class RTL2IE(InfoExtractor):          video_id = self._match_id(url)          webpage = self._download_webpage(url, video_id) -        vico_id = self._html_search_regex( -            r'vico_id\s*:\s*([0-9]+)', webpage, 'vico_id') -        vivi_id = self._html_search_regex( -            r'vivi_id\s*:\s*([0-9]+)', webpage, 'vivi_id') +        mobj = re.search( +            r'<div[^>]+data-collection="(?P<vico_id>\d+)"[^>]+data-video="(?P<vivi_id>\d+)"', +            webpage) +        if mobj: +            vico_id = mobj.group('vico_id') +            vivi_id = mobj.group('vivi_id') +        else: +            vico_id = self._html_search_regex( +                r'vico_id\s*:\s*([0-9]+)', webpage, 'vico_id') +            vivi_id = self._html_search_regex( +                r'vivi_id\s*:\s*([0-9]+)', webpage, 'vivi_id')          info_url = 'http://www.rtl2.de/video/php/get_video.php?vico_id=' + vico_id + '&vivi_id=' + vivi_id -        webpage = self._download_webpage(info_url, '')          info = self._download_json(info_url, video_id)          video_info = info['video'] diff --git a/youtube_dl/extractor/rtp.py b/youtube_dl/extractor/rtp.py index ecf4939cd..82b323cdd 100644 --- a/youtube_dl/extractor/rtp.py +++ b/youtube_dl/extractor/rtp.py @@ -18,6 +18,10 @@ class RTPIE(InfoExtractor):              'description': 'As paixões musicais de António Cartaxo e António Macedo',              'thumbnail': 're:^https?://.*\.jpg',          }, +        'params': { +            # rtmp download +            'skip_download': True, +        },      }, {          'url': 'http://www.rtp.pt/play/p831/a-quimica-das-coisas',          'only_matching': True, diff --git a/youtube_dl/extractor/rtve.py b/youtube_dl/extractor/rtve.py index 82cd98ac7..5b97d33ca 100644 --- a/youtube_dl/extractor/rtve.py +++ b/youtube_dl/extractor/rtve.py @@ -6,7 +6,7 @@ import re  import time  from .common import InfoExtractor -from ..compat import compat_urlparse +from ..compat import compat_urllib_request, compat_urlparse  from ..utils import (      ExtractorError,      float_or_none, @@ -102,7 +102,9 @@ class RTVEALaCartaIE(InfoExtractor):          if info['state'] == 'DESPU':              raise ExtractorError('The video is no longer available', expected=True)          png_url = 'http://www.rtve.es/ztnr/movil/thumbnail/%s/videos/%s.png' % (self._manager, video_id) -        png = self._download_webpage(png_url, video_id, 'Downloading url information') +        png_request = compat_urllib_request.Request(png_url) +        png_request.add_header('Referer', url) +        png = self._download_webpage(png_request, video_id, 'Downloading url information')          video_url = _decrypt_url(png)          if not video_url.endswith('.f4m'):              auth_url = video_url.replace( diff --git a/youtube_dl/extractor/ruutu.py b/youtube_dl/extractor/ruutu.py index 4e22628d0..c67ad25ce 100644 --- a/youtube_dl/extractor/ruutu.py +++ b/youtube_dl/extractor/ruutu.py @@ -6,19 +6,19 @@ from ..compat import compat_urllib_parse_urlparse  from ..utils import (      determine_ext,      int_or_none, +    xpath_attr,      xpath_text,  )  class RuutuIE(InfoExtractor): -    _VALID_URL = r'http://(?:www\.)?ruutu\.fi/ohjelmat/(?:[^/?#]+/)*(?P<id>[^/?#]+)' +    _VALID_URL = r'https?://(?:www\.)?ruutu\.fi/video/(?P<id>\d+)'      _TESTS = [          { -            'url': 'http://www.ruutu.fi/ohjelmat/oletko-aina-halunnut-tietaa-mita-tapahtuu-vain-hetki-ennen-lahetysta-nyt-se-selvisi', +            'url': 'http://www.ruutu.fi/video/2058907',              'md5': 'ab2093f39be1ca8581963451b3c0234f',              'info_dict': {                  'id': '2058907', -                'display_id': 'oletko-aina-halunnut-tietaa-mita-tapahtuu-vain-hetki-ennen-lahetysta-nyt-se-selvisi',                  'ext': 'mp4',                  'title': 'Oletko aina halunnut tietää mitä tapahtuu vain hetki ennen lähetystä? - Nyt se selvisi!',                  'description': 'md5:cfc6ccf0e57a814360df464a91ff67d6', @@ -28,14 +28,13 @@ class RuutuIE(InfoExtractor):              },          },          { -            'url': 'http://www.ruutu.fi/ohjelmat/superpesis/superpesis-katso-koko-kausi-ruudussa', +            'url': 'http://www.ruutu.fi/video/2057306',              'md5': '065a10ae4d5b8cfd9d0c3d332465e3d9',              'info_dict': {                  'id': '2057306', -                'display_id': 'superpesis-katso-koko-kausi-ruudussa',                  'ext': 'mp4',                  'title': 'Superpesis: katso koko kausi Ruudussa', -                'description': 'md5:44c44a99fdbe5b380ab74ebd75f0af77', +                'description': 'md5:da2736052fef3b2bd5e0005e63c25eac',                  'thumbnail': 're:^https?://.*\.jpg$',                  'duration': 40,                  'age_limit': 0, @@ -44,29 +43,10 @@ class RuutuIE(InfoExtractor):      ]      def _real_extract(self, url): -        display_id = self._match_id(url) +        video_id = self._match_id(url) -        webpage = self._download_webpage(url, display_id) - -        video_id = self._search_regex( -            r'data-media-id="(\d+)"', webpage, 'media id') - -        video_xml_url = None - -        media_data = self._search_regex( -            r'jQuery\.extend\([^,]+,\s*(.+?)\);', webpage, -            'media data', default=None) -        if media_data: -            media_json = self._parse_json(media_data, display_id, fatal=False) -            if media_json: -                xml_url = media_json.get('ruutuplayer', {}).get('xmlUrl') -                if xml_url: -                    video_xml_url = xml_url.replace('{ID}', video_id) - -        if not video_xml_url: -            video_xml_url = 'http://gatling.ruutu.fi/media-xml-cache?id=%s' % video_id - -        video_xml = self._download_xml(video_xml_url, video_id) +        video_xml = self._download_xml( +            'http://gatling.ruutu.fi/media-xml-cache?id=%s' % video_id, video_id)          formats = []          processed_urls = [] @@ -109,10 +89,9 @@ class RuutuIE(InfoExtractor):          return {              'id': video_id, -            'display_id': display_id, -            'title': self._og_search_title(webpage), -            'description': self._og_search_description(webpage), -            'thumbnail': self._og_search_thumbnail(webpage), +            'title': xpath_attr(video_xml, './/Behavior/Program', 'program_name', 'title', fatal=True), +            'description': xpath_attr(video_xml, './/Behavior/Program', 'description', 'description'), +            'thumbnail': xpath_attr(video_xml, './/Behavior/Startpicture', 'href', 'thumbnail'),              'duration': int_or_none(xpath_text(video_xml, './/Runtime', 'duration')),              'age_limit': int_or_none(xpath_text(video_xml, './/AgeLimit', 'age limit')),              'formats': formats, diff --git a/youtube_dl/extractor/safari.py b/youtube_dl/extractor/safari.py index f3c80708c..a602af692 100644 --- a/youtube_dl/extractor/safari.py +++ b/youtube_dl/extractor/safari.py @@ -20,7 +20,6 @@ from ..utils import (  class SafariBaseIE(InfoExtractor):      _LOGIN_URL = 'https://www.safaribooksonline.com/accounts/login/'      _SUCCESSFUL_LOGIN_REGEX = r'<a href="/accounts/logout/"[^>]*>Sign Out</a>' -    _ACCOUNT_CREDENTIALS_HINT = 'Use --username and --password options to supply credentials for safaribooksonline.com'      _NETRC_MACHINE = 'safari'      _API_BASE = 'https://www.safaribooksonline.com/api/v1/book' @@ -37,9 +36,7 @@ class SafariBaseIE(InfoExtractor):      def _login(self):          (username, password) = self._get_login_info()          if username is None: -            raise ExtractorError( -                self._ACCOUNT_CREDENTIALS_HINT, -                expected=True) +            self.raise_login_required('safaribooksonline.com account is required')          headers = std_headers          if 'Referer' not in headers: diff --git a/youtube_dl/extractor/screenwavemedia.py b/youtube_dl/extractor/screenwavemedia.py index 220d39078..05f93904c 100644 --- a/youtube_dl/extractor/screenwavemedia.py +++ b/youtube_dl/extractor/screenwavemedia.py @@ -12,8 +12,8 @@ from ..utils import (  class ScreenwaveMediaIE(InfoExtractor): -    _VALID_URL = r'http://player\d?\.screenwavemedia\.com/(?:play/)?[a-zA-Z]+\.php\?[^"]*\bid=(?P<id>.+)' - +    _VALID_URL = r'https?://player\d?\.screenwavemedia\.com/(?:play/)?[a-zA-Z]+\.php\?.*\bid=(?P<id>[A-Za-z0-9-]+)' +    EMBED_PATTERN = r'src=(["\'])(?P<url>(?:https?:)?//player\d?\.screenwavemedia\.com/(?:play/)?[a-zA-Z]+\.php\?.*\bid=.+?)\1'      _TESTS = [{          'url': 'http://player.screenwavemedia.com/play/play.php?playerdiv=videoarea&companiondiv=squareAd&id=Cinemassacre-19911',          'only_matching': True, @@ -33,7 +33,7 @@ class ScreenwaveMediaIE(InfoExtractor):              'http://player.screenwavemedia.com/player.js',              video_id, 'Downloading playerconfig webpage') -        videoserver = self._search_regex(r"\[ipaddress\]\s*=>\s*([\d\.]+)", playerdata, 'videoserver') +        videoserver = self._search_regex(r'SWMServer\s*=\s*"([\d\.]+)"', playerdata, 'videoserver')          sources = self._parse_json(              js_to_json( @@ -56,6 +56,7 @@ class ScreenwaveMediaIE(InfoExtractor):          # Fallback to hardcoded sources if JS changes again          if not sources: +            self.report_warning('Falling back to a hardcoded list of streams')              sources = [{                  'file': 'http://%s/vod/%s_%s.mp4' % (videoserver, video_id, format_id),                  'type': 'mp4', diff --git a/youtube_dl/extractor/shared.py b/youtube_dl/extractor/shared.py index a07677686..c5636e8e9 100644 --- a/youtube_dl/extractor/shared.py +++ b/youtube_dl/extractor/shared.py @@ -14,17 +14,28 @@ from ..utils import (  class SharedIE(InfoExtractor): -    _VALID_URL = r'http://shared\.sx/(?P<id>[\da-z]{10})' +    IE_DESC = 'shared.sx and vivo.sx' +    _VALID_URL = r'http://(?:shared|vivo)\.sx/(?P<id>[\da-z]{10})' -    _TEST = { +    _TESTS = [{          'url': 'http://shared.sx/0060718775',          'md5': '106fefed92a8a2adb8c98e6a0652f49b',          'info_dict': {              'id': '0060718775',              'ext': 'mp4',              'title': 'Bmp4', +            'filesize': 1720110,          }, -    } +    }, { +        'url': 'http://vivo.sx/d7ddda0e78', +        'md5': '15b3af41be0b4fe01f4df075c2678b2c', +        'info_dict': { +            'id': 'd7ddda0e78', +            'ext': 'mp4', +            'title': 'Chicken', +            'filesize': 528031, +        }, +    }]      def _real_extract(self, url):          video_id = self._match_id(url) diff --git a/youtube_dl/extractor/smotri.py b/youtube_dl/extractor/smotri.py index 93a7cfe15..35a81ee87 100644 --- a/youtube_dl/extractor/smotri.py +++ b/youtube_dl/extractor/smotri.py @@ -330,10 +330,7 @@ class SmotriBroadcastIE(InfoExtractor):              (username, password) = self._get_login_info()              if username is None: -                raise ExtractorError( -                    'Erotic broadcasts allowed only for registered users, ' -                    'use --username and --password options to provide account credentials.', -                    expected=True) +                self.raise_login_required('Erotic broadcasts allowed only for registered users')              login_form = {                  'login-hint53': '1', diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 6ce86cbcd..ed5dcc0d3 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -309,7 +309,7 @@ class SoundcloudUserIE(SoundcloudIE):              'id': '114582580',              'title': 'The Akashic Chronicler (All)',          }, -        'playlist_mincount': 112, +        'playlist_mincount': 111,      }, {          'url': 'https://soundcloud.com/the-akashic-chronicler/tracks',          'info_dict': { @@ -330,14 +330,14 @@ class SoundcloudUserIE(SoundcloudIE):              'id': '114582580',              'title': 'The Akashic Chronicler (Reposts)',          }, -        'playlist_mincount': 9, +        'playlist_mincount': 7,      }, {          'url': 'https://soundcloud.com/the-akashic-chronicler/likes',          'info_dict': {              'id': '114582580',              'title': 'The Akashic Chronicler (Likes)',          }, -        'playlist_mincount': 333, +        'playlist_mincount': 321,      }, {          'url': 'https://soundcloud.com/grynpyret/spotlight',          'info_dict': { diff --git a/youtube_dl/extractor/spankwire.py b/youtube_dl/extractor/spankwire.py index 5fa6faf18..9e8fb35b2 100644 --- a/youtube_dl/extractor/spankwire.py +++ b/youtube_dl/extractor/spankwire.py @@ -16,8 +16,9 @@ from ..aes import aes_decrypt_text  class SpankwireIE(InfoExtractor): -    _VALID_URL = r'https?://(?:www\.)?(?P<url>spankwire\.com/[^/]*/video(?P<videoid>[0-9]+)/?)' -    _TEST = { +    _VALID_URL = r'https?://(?:www\.)?(?P<url>spankwire\.com/[^/]*/video(?P<id>[0-9]+)/?)' +    _TESTS = [{ +        # download URL pattern: */<height>P_<tbr>K_<video_id>.mp4          'url': 'http://www.spankwire.com/Buckcherry-s-X-Rated-Music-Video-Crazy-Bitch/video103545/',          'md5': '8bbfde12b101204b39e4b9fe7eb67095',          'info_dict': { @@ -30,14 +31,27 @@ class SpankwireIE(InfoExtractor):              'upload_date': '20070507',              'age_limit': 18,          } -    } +    }, { +        # download URL pattern: */mp4_<format_id>_<video_id>.mp4 +        'url': 'http://www.spankwire.com/Titcums-Compiloation-I/video1921551/', +        'md5': '09b3c20833308b736ae8902db2f8d7e6', +        'info_dict': { +            'id': '1921551', +            'ext': 'mp4', +            'title': 'Titcums Compiloation I', +            'description': 'cum on tits', +            'uploader': 'dannyh78999', +            'uploader_id': '3056053', +            'upload_date': '20150822', +            'age_limit': 18, +        }, +    }]      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url) -        video_id = mobj.group('videoid') -        url = 'http://www.' + mobj.group('url') +        video_id = mobj.group('id') -        req = compat_urllib_request.Request(url) +        req = compat_urllib_request.Request('http://www.' + mobj.group('url'))          req.add_header('Cookie', 'age_verified=1')          webpage = self._download_webpage(req, video_id) @@ -54,7 +68,7 @@ class SpankwireIE(InfoExtractor):              r'by:\s*<a [^>]*>(.+?)</a>',              webpage, 'uploader', fatal=False)          uploader_id = self._html_search_regex( -            r'by:\s*<a href="/Profile\.aspx\?.*?UserId=(\d+).*?"', +            r'by:\s*<a href="/(?:user/viewProfile|Profile\.aspx)\?.*?UserId=(\d+).*?"',              webpage, 'uploader id', fatal=False)          upload_date = unified_strdate(self._html_search_regex(              r'</a> on (.+?) at \d+:\d+', @@ -67,9 +81,10 @@ class SpankwireIE(InfoExtractor):              r'<span\s+id="spCommentCount"[^>]*>([\d,\.]+)</span>',              webpage, 'comment count', fatal=False)) -        video_urls = list(map( -            compat_urllib_parse_unquote, -            re.findall(r'playerData\.cdnPath[0-9]{3,}\s*=\s*(?:encodeURIComponent\()?["\']([^"\']+)["\']', webpage))) +        videos = re.findall( +            r'playerData\.cdnPath([0-9]{3,})\s*=\s*(?:encodeURIComponent\()?["\']([^"\']+)["\']', webpage) +        heights = [int(video[0]) for video in videos] +        video_urls = list(map(compat_urllib_parse_unquote, [video[1] for video in videos]))          if webpage.find('flashvars\.encrypted = "true"') != -1:              password = self._search_regex(                  r'flashvars\.video_title = "([^"]+)', @@ -79,21 +94,22 @@ class SpankwireIE(InfoExtractor):                  video_urls))          formats = [] -        for video_url in video_urls: +        for height, video_url in zip(heights, video_urls):              path = compat_urllib_parse_urlparse(video_url).path -            format = path.split('/')[4].split('_')[:2] -            resolution, bitrate_str = format -            format = "-".join(format) -            height = int(resolution.rstrip('Pp')) -            tbr = int(bitrate_str.rstrip('Kk')) -            formats.append({ +            _, quality = path.split('/')[4].split('_')[:2] +            f = {                  'url': video_url, -                'resolution': resolution, -                'format': format, -                'tbr': tbr,                  'height': height, -                'format_id': format, -            }) +            } +            tbr = self._search_regex(r'^(\d+)[Kk]$', quality, 'tbr', default=None) +            if tbr: +                f.update({ +                    'tbr': int(tbr), +                    'format_id': '%dp' % height, +                }) +            else: +                f['format_id'] = quality +            formats.append(f)          self._sort_formats(formats)          age_limit = self._rta_search(webpage) diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index adaec3375..25edc3100 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -2,7 +2,6 @@  from __future__ import unicode_literals  import re -import json  import time  import hmac  import binascii @@ -29,7 +28,7 @@ _x = lambda p: xpath_with_ns(p, {'smil': default_ns})  class ThePlatformBaseIE(InfoExtractor): -    def _extract_theplatform_smil_formats(self, smil_url, video_id, note='Downloading SMIL data'): +    def _extract_theplatform_smil(self, smil_url, video_id, note='Downloading SMIL data'):          meta = self._download_xml(smil_url, video_id, note=note)          try:              error_msg = next( @@ -55,12 +54,13 @@ class ThePlatformBaseIE(InfoExtractor):          self._sort_formats(formats) -        return formats +        subtitles = self._parse_smil_subtitles(meta, default_ns) + +        return formats, subtitles      def get_metadata(self, path, video_id):          info_url = 'http://link.theplatform.com/s/%s?format=preview' % path -        info_json = self._download_webpage(info_url, video_id) -        info = json.loads(info_json) +        info = self._download_json(info_url, video_id)          subtitles = {}          captions = info.get('captions') @@ -210,12 +210,14 @@ class ThePlatformIE(ThePlatformBaseIE):          if sig:              smil_url = self._sign_url(smil_url, sig['key'], sig['secret']) -        formats = self._extract_theplatform_smil_formats(smil_url, video_id) +        formats, subtitles = self._extract_theplatform_smil(smil_url, video_id)          ret = self.get_metadata(path, video_id) +        combined_subtitles = self._merge_subtitles(ret.get('subtitles', {}), subtitles)          ret.update({              'id': video_id,              'formats': formats, +            'subtitles': combined_subtitles,          })          return ret @@ -253,6 +255,7 @@ class ThePlatformFeedIE(ThePlatformBaseIE):          entry = feed['entries'][0]          formats = [] +        subtitles = {}          first_video_id = None          duration = None          for item in entry['media$content']: @@ -261,7 +264,9 @@ class ThePlatformFeedIE(ThePlatformBaseIE):              if first_video_id is None:                  first_video_id = cur_video_id                  duration = float_or_none(item.get('plfile$duration')) -            formats.extend(self._extract_theplatform_smil_formats(smil_url, video_id, 'Downloading SMIL data for %s' % cur_video_id)) +            cur_formats, cur_subtitles = self._extract_theplatform_smil(smil_url, video_id, 'Downloading SMIL data for %s' % cur_video_id) +            formats.extend(cur_formats) +            subtitles = self._merge_subtitles(subtitles, cur_subtitles)          self._sort_formats(formats) @@ -275,9 +280,11 @@ class ThePlatformFeedIE(ThePlatformBaseIE):          categories = [item['media$name'] for item in entry.get('media$categories', [])]          ret = self.get_metadata('%s/%s' % (provider_id, first_video_id), video_id) +        subtitles = self._merge_subtitles(subtitles, ret['subtitles'])          ret.update({              'id': video_id,              'formats': formats, +            'subtitles': subtitles,              'thumbnails': thumbnails,              'duration': duration,              'timestamp': timestamp, diff --git a/youtube_dl/extractor/tubitv.py b/youtube_dl/extractor/tubitv.py index 2c4b21807..4f86b3ee9 100644 --- a/youtube_dl/extractor/tubitv.py +++ b/youtube_dl/extractor/tubitv.py @@ -60,9 +60,7 @@ class TubiTvIE(InfoExtractor):          webpage = self._download_webpage(url, video_id)          if re.search(r"<(?:DIV|div) class='login-required-screen'>", webpage): -            raise ExtractorError( -                'This video requires login, use --username and --password ' -                'options to provide account credentials.', expected=True) +            self.raise_login_required('This video requires login')          title = self._og_search_title(webpage)          description = self._og_search_description(webpage) diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py index 4a0eaf65f..365d8b4bf 100644 --- a/youtube_dl/extractor/udemy.py +++ b/youtube_dl/extractor/udemy.py @@ -70,9 +70,7 @@ class UdemyIE(InfoExtractor):      def _login(self):          (username, password) = self._get_login_info()          if username is None: -            raise ExtractorError( -                'Udemy account is required, use --username and --password options to provide account credentials.', -                expected=True) +            self.raise_login_required('Udemy account is required')          login_popup = self._download_webpage(              self._LOGIN_URL, None, 'Downloading login popup') diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py index f4c0f5702..4098e4629 100644 --- a/youtube_dl/extractor/yandexmusic.py +++ b/youtube_dl/extractor/yandexmusic.py @@ -1,18 +1,38 @@ -# coding=utf-8 +# coding: utf-8  from __future__ import unicode_literals  import re  import hashlib  from .common import InfoExtractor -from ..compat import compat_str +from ..compat import ( +    compat_str, +    compat_urllib_parse, +    compat_urllib_request, +)  from ..utils import (      int_or_none,      float_or_none,  ) -class YandexMusicBaseIE(InfoExtractor): +class YandexMusicTrackIE(InfoExtractor): +    IE_NAME = 'yandexmusic:track' +    IE_DESC = 'Яндекс.Музыка - Трек' +    _VALID_URL = r'https?://music\.yandex\.(?:ru|kz|ua|by)/album/(?P<album_id>\d+)/track/(?P<id>\d+)' + +    _TEST = { +        'url': 'http://music.yandex.ru/album/540508/track/4878838', +        'md5': 'f496818aa2f60b6c0062980d2e00dc20', +        'info_dict': { +            'id': '4878838', +            'ext': 'mp3', +            'title': 'Carlo Ambrosio - Gypsy Eyes 1', +            'filesize': 4628061, +            'duration': 193.04, +        } +    } +      def _get_track_url(self, storage_dir, track_id):          data = self._download_json(              'http://music.yandex.ru/api/v1.5/handlers/api-jsonp.jsx?action=getTrackSrc&p=download-info/%s' @@ -35,24 +55,6 @@ class YandexMusicBaseIE(InfoExtractor):              'duration': float_or_none(track.get('durationMs'), 1000),          } - -class YandexMusicTrackIE(YandexMusicBaseIE): -    IE_NAME = 'yandexmusic:track' -    IE_DESC = 'Яндекс.Музыка - Трек' -    _VALID_URL = r'https?://music\.yandex\.(?:ru|kz|ua|by)/album/(?P<album_id>\d+)/track/(?P<id>\d+)' - -    _TEST = { -        'url': 'http://music.yandex.ru/album/540508/track/4878838', -        'md5': 'f496818aa2f60b6c0062980d2e00dc20', -        'info_dict': { -            'id': '4878838', -            'ext': 'mp3', -            'title': 'Carlo Ambrosio - Gypsy Eyes 1', -            'filesize': 4628061, -            'duration': 193.04, -        } -    } -      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url)          album_id, track_id = mobj.group('album_id'), mobj.group('id') @@ -64,7 +66,15 @@ class YandexMusicTrackIE(YandexMusicBaseIE):          return self._get_track_info(track) -class YandexMusicAlbumIE(YandexMusicBaseIE): +class YandexMusicPlaylistBaseIE(InfoExtractor): +    def _build_playlist(self, tracks): +        return [ +            self.url_result( +                'http://music.yandex.ru/album/%s/track/%s' % (track['albums'][0]['id'], track['id'])) +            for track in tracks if track.get('albums') and isinstance(track.get('albums'), list)] + + +class YandexMusicAlbumIE(YandexMusicPlaylistBaseIE):      IE_NAME = 'yandexmusic:album'      IE_DESC = 'Яндекс.Музыка - Альбом'      _VALID_URL = r'https?://music\.yandex\.(?:ru|kz|ua|by)/album/(?P<id>\d+)/?(\?|$)' @@ -85,7 +95,7 @@ class YandexMusicAlbumIE(YandexMusicBaseIE):              'http://music.yandex.ru/handlers/album.jsx?album=%s' % album_id,              album_id, 'Downloading album JSON') -        entries = [self._get_track_info(track) for track in album['volumes'][0]] +        entries = self._build_playlist(album['volumes'][0])          title = '%s - %s' % (album['artists'][0]['name'], album['title'])          year = album.get('year') @@ -95,12 +105,12 @@ class YandexMusicAlbumIE(YandexMusicBaseIE):          return self.playlist_result(entries, compat_str(album['id']), title) -class YandexMusicPlaylistIE(YandexMusicBaseIE): +class YandexMusicPlaylistIE(YandexMusicPlaylistBaseIE):      IE_NAME = 'yandexmusic:playlist'      IE_DESC = 'Яндекс.Музыка - Плейлист'      _VALID_URL = r'https?://music\.yandex\.(?:ru|kz|ua|by)/users/[^/]+/playlists/(?P<id>\d+)' -    _TEST = { +    _TESTS = [{          'url': 'http://music.yandex.ru/users/music.partners/playlists/1245',          'info_dict': {              'id': '1245', @@ -108,20 +118,54 @@ class YandexMusicPlaylistIE(YandexMusicBaseIE):              'description': 'md5:3b9f27b0efbe53f2ee1e844d07155cc9',          },          'playlist_count': 6, -    } +    }, { +        # playlist exceeding the limit of 150 tracks shipped with webpage (see +        # https://github.com/rg3/youtube-dl/issues/6666) +        'url': 'https://music.yandex.ru/users/ya.playlist/playlists/1036', +        'info_dict': { +            'id': '1036', +            'title': 'Музыка 90-х', +        }, +        'playlist_count': 310, +    }]      def _real_extract(self, url):          playlist_id = self._match_id(url)          webpage = self._download_webpage(url, playlist_id) -        playlist = self._parse_json( +        mu = self._parse_json(              self._search_regex(                  r'var\s+Mu\s*=\s*({.+?});\s*</script>', webpage, 'player'), -            playlist_id)['pageData']['playlist'] - -        entries = [self._get_track_info(track) for track in playlist['tracks']] +            playlist_id) + +        playlist = mu['pageData']['playlist'] +        tracks, track_ids = playlist['tracks'], playlist['trackIds'] + +        # tracks dictionary shipped with webpage is limited to 150 tracks, +        # missing tracks should be retrieved manually. +        if len(tracks) < len(track_ids): +            present_track_ids = set([compat_str(track['id']) for track in tracks if track.get('id')]) +            missing_track_ids = set(map(compat_str, track_ids)) - set(present_track_ids) +            request = compat_urllib_request.Request( +                'https://music.yandex.ru/handlers/track-entries.jsx', +                compat_urllib_parse.urlencode({ +                    'entries': ','.join(missing_track_ids), +                    'lang': mu.get('settings', {}).get('lang', 'en'), +                    'external-domain': 'music.yandex.ru', +                    'overembed': 'false', +                    'sign': mu.get('authData', {}).get('user', {}).get('sign'), +                    'strict': 'true', +                }).encode('utf-8')) +            request.add_header('Referer', url) +            request.add_header('X-Requested-With', 'XMLHttpRequest') + +            missing_tracks = self._download_json( +                request, playlist_id, 'Downloading missing tracks JSON', fatal=False) +            if missing_tracks: +                tracks.extend(missing_tracks)          return self.playlist_result( -            entries, compat_str(playlist_id), +            self._build_playlist(tracks), +            compat_str(playlist_id),              playlist['title'], playlist.get('description')) diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py index 78caeb8b3..2e81d9223 100644 --- a/youtube_dl/extractor/youku.py +++ b/youtube_dl/extractor/youku.py @@ -49,6 +49,17 @@ class YoukuIE(InfoExtractor):          },          'playlist_count': 13,          'skip': 'Available in China only', +    }, { +        'url': 'http://v.youku.com/v_show/id_XNjA1NzA2Njgw.html', +        'note': 'Video protected with password', +        'info_dict': { +            'id': 'XNjA1NzA2Njgw', +            'title': '邢義田复旦讲座之想象中的胡人—从“左衽孔子”说起', +        }, +        'playlist_count': 19, +        'params': { +            'videopassword': '100600', +        },      }]      def construct_video_urls(self, data1, data2): @@ -185,9 +196,15 @@ class YoukuIE(InfoExtractor):              raw_data = self._download_json(req, video_id, note=note)              return raw_data['data'][0] +        video_password = self._downloader.params.get('videopassword', None) +          # request basic data +        basic_data_url = 'http://v.youku.com/player/getPlayList/VideoIDS/%s' % video_id +        if video_password: +            basic_data_url += '?password=%s' % video_password +          data1 = retrieve_data( -            'http://v.youku.com/player/getPlayList/VideoIDS/%s' % video_id, +            basic_data_url,              'Downloading JSON metadata 1')          data2 = retrieve_data(              'http://v.youku.com/player/getPlayList/VideoIDS/%s/Pf/4/ctype/12/ev/1' % video_id, diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 8e2da46e3..030ec70ca 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -660,7 +660,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):      def _extract_signature_function(self, video_id, player_url, example_sig):          id_m = re.match( -            r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P<ext>[a-z]+)$', +            r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?)?\.(?P<ext>[a-z]+)$',              player_url)          if not id_m:              raise ExtractorError('Cannot identify player %r' % player_url) @@ -1243,7 +1243,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):              encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]              if 'rtmpe%3Dyes' in encoded_url_map:                  raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True) -            url_map = {} +            formats = []              for url_data_str in encoded_url_map.split(','):                  url_data = compat_parse_qs(url_data_str)                  if 'itag' not in url_data or 'url' not in url_data: @@ -1289,7 +1289,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                                  player_desc = 'flash player %s' % player_version                              else:                                  player_version = self._search_regex( -                                    r'html5player-([^/]+?)(?:/html5player)?\.js', +                                    r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js',                                      player_url,                                      'html5 player', fatal=False)                                  player_desc = 'html5 player %s' % player_version @@ -1303,8 +1303,50 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                      url += '&signature=' + signature                  if 'ratebypass' not in url:                      url += '&ratebypass=yes' -                url_map[format_id] = url -            formats = _map_to_format_list(url_map) + +                # Some itags are not included in DASH manifest thus corresponding formats will +                # lack metadata (see https://github.com/rg3/youtube-dl/pull/5993). +                # Trying to extract metadata from url_encoded_fmt_stream_map entry. +                mobj = re.search(r'^(?P<width>\d+)[xX](?P<height>\d+)$', url_data.get('size', [''])[0]) +                width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None) +                dct = { +                    'format_id': format_id, +                    'url': url, +                    'player_url': player_url, +                    'filesize': int_or_none(url_data.get('clen', [None])[0]), +                    'tbr': float_or_none(url_data.get('bitrate', [None])[0], 1000), +                    'width': width, +                    'height': height, +                    'fps': int_or_none(url_data.get('fps', [None])[0]), +                    'format_note': url_data.get('quality_label', [None])[0] or url_data.get('quality', [None])[0], +                } +                type_ = url_data.get('type', [None])[0] +                if type_: +                    type_split = type_.split(';') +                    kind_ext = type_split[0].split('/') +                    if len(kind_ext) == 2: +                        kind, ext = kind_ext +                        dct['ext'] = ext +                        if kind in ('audio', 'video'): +                            codecs = None +                            for mobj in re.finditer( +                                    r'(?P<key>[a-zA-Z_-]+)=(?P<quote>["\']?)(?P<val>.+?)(?P=quote)(?:;|$)', type_): +                                if mobj.group('key') == 'codecs': +                                    codecs = mobj.group('val') +                                    break +                            if codecs: +                                codecs = codecs.split(',') +                                if len(codecs) == 2: +                                    acodec, vcodec = codecs[0], codecs[1] +                                else: +                                    acodec, vcodec = (codecs[0], 'none') if kind == 'audio' else ('none', codecs[0]) +                                dct.update({ +                                    'acodec': acodec, +                                    'vcodec': vcodec, +                                }) +                if format_id in self._formats: +                    dct.update(self._formats[format_id]) +                formats.append(dct)          elif video_info.get('hlsvp'):              manifest_url = video_info['hlsvp'][0]              url_map = self._extract_from_m3u8(manifest_url, video_id) diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 9016e3498..8c4ff12bd 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -320,7 +320,7 @@ def parseOpts(overrideArguments=None):      authentication.add_option(          '--video-password',          dest='videopassword', metavar='PASSWORD', -        help='Video password (vimeo, smotri)') +        help='Video password (vimeo, smotri, youku)')      video_format = optparse.OptionGroup(parser, 'Video Format Options')      video_format.add_option( diff --git a/youtube_dl/postprocessor/common.py b/youtube_dl/postprocessor/common.py index 4191d040b..150ef9173 100644 --- a/youtube_dl/postprocessor/common.py +++ b/youtube_dl/postprocessor/common.py @@ -4,6 +4,7 @@ import os  from ..utils import (      PostProcessingError, +    cli_configuration_args,      encodeFilename,  ) @@ -61,11 +62,7 @@ class PostProcessor(object):              self._downloader.report_warning(errnote)      def _configuration_args(self, default=[]): -        pp_args = self._downloader.params.get('postprocessor_args') -        if pp_args is None: -            return default -        assert isinstance(pp_args, list) -        return pp_args +        return cli_configuration_args(self.params, 'postprocessor_args', default)  class AudioConversionError(PostProcessingError): diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index e265c7574..79381b380 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -141,7 +141,7 @@ def write_json_file(obj, fn):  if sys.version_info >= (2, 7):      def find_xpath_attr(node, xpath, key, val=None):          """ Find the xpath xpath[@key=val] """ -        assert re.match(r'^[a-zA-Z-]+$', key) +        assert re.match(r'^[a-zA-Z_-]+$', key)          if val:              assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)          expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val)) @@ -176,12 +176,12 @@ def xpath_with_ns(path, ns_map):      return '/'.join(replaced) -def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT): +def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):      if sys.version_info < (2, 7):  # Crazy 2.6          xpath = xpath.encode('ascii')      n = node.find(xpath) -    if n is None or n.text is None: +    if n is None:          if default is not NO_DEFAULT:              return default          elif fatal: @@ -189,9 +189,37 @@ def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):              raise ExtractorError('Could not find XML element %s' % name)          else:              return None +    return n + + +def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT): +    n = xpath_element(node, xpath, name, fatal=fatal, default=default) +    if n is None or n == default: +        return n +    if n.text is None: +        if default is not NO_DEFAULT: +            return default +        elif fatal: +            name = xpath if name is None else name +            raise ExtractorError('Could not find XML element\'s text %s' % name) +        else: +            return None      return n.text +def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT): +    n = find_xpath_attr(node, xpath, key) +    if n is None: +        if default is not NO_DEFAULT: +            return default +        elif fatal: +            name = '%s[@%s]' % (xpath, key) if name is None else name +            raise ExtractorError('Could not find XML attribute %s' % name) +        else: +            return None +    return n.attrib[key] + +  def get_element_by_id(id, html):      """Return the content of the tag with the specified ID in the passed HTML document"""      return get_element_by_attribute("id", id, html) @@ -587,6 +615,11 @@ class ContentTooShortError(Exception):  def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs): +    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting +    # expected HTTP responses to meet HTTP/1.0 or later (see also +    # https://github.com/rg3/youtube-dl/issues/6727) +    if sys.version_info < (3, 0): +        kwargs['strict'] = True      hc = http_class(*args, **kwargs)      source_address = ydl_handler._params.get('source_address')      if source_address is not None: @@ -1918,6 +1951,32 @@ def dfxp2srt(dfxp_data):      return ''.join(out) +def cli_option(params, command_option, param): +    param = params.get(param) +    return [command_option, param] if param is not None else [] + + +def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None): +    param = params.get(param) +    assert isinstance(param, bool) +    if separator: +        return [command_option + separator + (true_value if param else false_value)] +    return [command_option, true_value if param else false_value] + + +def cli_valueless_option(params, command_option, param, expected_value=True): +    param = params.get(param) +    return [command_option] if param == expected_value else [] + + +def cli_configuration_args(params, param, default=[]): +    ex_args = params.get(param) +    if ex_args is None: +        return default +    assert isinstance(ex_args, list) +    return ex_args + +  class ISO639Utils(object):      # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt      _lang_map = { diff --git a/youtube_dl/version.py b/youtube_dl/version.py index c090c6df7..6bc689b75 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@  from __future__ import unicode_literals -__version__ = '2015.08.16.1' +__version__ = '2015.09.03' | 
