diff options
75 files changed, 1497 insertions, 473 deletions
| diff --git a/.gitignore b/.gitignore index 86312d4e4..0422adf44 100644 --- a/.gitignore +++ b/.gitignore @@ -31,3 +31,5 @@ updates_key.pem  test/testdata  .tox  youtube-dl.zsh +.idea +.idea/*
\ No newline at end of file diff --git a/.travis.yml b/.travis.yml index c6cc7a994..f14014414 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,7 +9,6 @@ notifications:    email:      - filippo.valsorda@gmail.com      - phihag@phihag.de -    - jaime.marquinez.ferrandiz+travis@gmail.com      - yasoob.khld@gmail.com  #  irc:  #    channels: @@ -97,3 +97,7 @@ Petr Kutalek  Will Glynn  Max Reimann  Cédric Luthi +Thijs Vermeir +Joel Leclerc +Christopher Krooss +Ondřej Caletka diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 0ff7b395a..7917abfc6 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -44,7 +44,7 @@ In particular, every site support request issue should only pertain to services  ###  Is anyone going to need the feature? -Only post features that you (or an incapicated friend you can personally talk to) require. Do not post features because they seem like a good idea. If they are really useful, they will be requested by someone who requires them. +Only post features that you (or an incapacitated friend you can personally talk to) require. Do not post features because they seem like a good idea. If they are really useful, they will be requested by someone who requires them.  ###  Is your question about youtube-dl? @@ -46,7 +46,7 @@ test:  ot: offlinetest  offlinetest: codetest -	nosetests --verbose test --exclude test_download --exclude test_age_restriction --exclude test_subtitles --exclude test_write_annotations +	nosetests --verbose test --exclude test_download --exclude test_age_restriction --exclude test_subtitles --exclude test_write_annotations --exclude test_youtube_lists  tar: youtube-dl.tar.gz @@ -63,7 +63,7 @@ youtube-dl: youtube_dl/*.py youtube_dl/*/*.py  	chmod a+x youtube-dl  README.md: youtube_dl/*.py youtube_dl/*/*.py -	COLUMNS=80 python -m youtube_dl --help | python devscripts/make_readme.py +	COLUMNS=80 python youtube_dl/__main__.py --help | python devscripts/make_readme.py  CONTRIBUTING.md: README.md  	python devscripts/make_contributing.py README.md CONTRIBUTING.md @@ -219,6 +219,8 @@ which means you can modify it, redistribute it or use it however you like.                                       for each command-line argument. If the URL                                       refers to a playlist, dump the whole                                       playlist information in a single line. +    --print-json                     Be quiet and print the video information as +                                     JSON (video is still being downloaded).      --newline                        output progress bar as new lines      --no-progress                    do not print progress bar      --console-title                  display progress in console titlebar @@ -248,14 +250,15 @@ which means you can modify it, redistribute it or use it however you like.  ## Video Format Options:      -f, --format FORMAT              video format code, specify the order of -                                     preference using slashes: -f 22/17/18 .  -f -                                     mp4 , -f m4a and  -f flv  are also -                                     supported. You can also use the special -                                     names "best", "bestvideo", "bestaudio", -                                     "worst", "worstvideo" and "worstaudio". By -                                     default, youtube-dl will pick the best -                                     quality. Use commas to download multiple -                                     audio formats, such as -f +                                     preference using slashes, as in -f 22/17/18 +                                     .  Instead of format codes, you can select +                                     by extension for the extensions aac, m4a, +                                     mp3, mp4, ogg, wav, webm. You can also use +                                     the special names "best", "bestvideo", +                                     "bestaudio", "worst".  By default, youtube- +                                     dl will pick the best quality. Use commas +                                     to download multiple audio formats, such as +                                     -f                                       136/137/mp4/bestvideo,140/m4a/bestaudio.                                       You can merge the video and audio of two                                       formats into a single file using -f <video- @@ -326,7 +329,7 @@ which means you can modify it, redistribute it or use it however you like.  # CONFIGURATION -You can configure youtube-dl by placing default arguments (such as `--extract-audio --no-mtime` to always extract the audio and not copy the mtime) into `/etc/youtube-dl.conf` and/or `~/.config/youtube-dl/config`. On Windows, the configuration file locations are `%APPDATA%\youtube-dl\config.txt` and `C:\Users\<Yourname>\youtube-dl.conf`. +You can configure youtube-dl by placing default arguments (such as `--extract-audio --no-mtime` to always extract the audio and not copy the mtime) into `/etc/youtube-dl.conf` and/or `~/.config/youtube-dl/config`. On Windows, the configuration file locations are `%APPDATA%\youtube-dl\config.txt` and `C:\Users\<user name>\youtube-dl.conf`.  # OUTPUT TEMPLATE @@ -449,6 +452,14 @@ Since June 2012 (#342) youtube-dl is packed as an executable zipfile, simply unz  To run the exe you need to install first the [Microsoft Visual C++ 2008 Redistributable Package](http://www.microsoft.com/en-us/download/details.aspx?id=29). +### On Windows, how should I set up ffmpeg and youtube-dl? Where should I put the exe files? + +If you put youtube-dl and ffmpeg in the same directory that you're running the command from, it will work, but that's rather cumbersome. + +To make a different directory work - either for ffmpeg, or for youtube-dl, or for both - simply create the directory (say, `C:\bin`, or `C:\Users\<User name>\bin`), put all the executables directly in there, and then [set your PATH environment variable](https://www.java.com/en/download/help/path.xml) to include that directory. + +From then on, after restarting your shell, you will be able to access both youtube-dl and ffmpeg (and youtube-dl will be able to find ffmpeg) by simply typing `youtube-dl` or `ffmpeg`, no matter what directory you're in. +  ### How can I detect whether a given URL is supported by youtube-dl?  For one, have a look at the [list of supported sites](docs/supportedsites). Note that it can sometimes happen that the site changes its URL scheme (say, from http://example.com/v/1234567 to http://example.com/v/1234567 ) and youtube-dl reports an URL of a service in that list as unsupported. In that case, simply report a bug. diff --git a/devscripts/gh-pages/update-sites.py b/devscripts/gh-pages/update-sites.py index f0f0481c7..d3ef5f0b5 100755 --- a/devscripts/gh-pages/update-sites.py +++ b/devscripts/gh-pages/update-sites.py @@ -16,7 +16,7 @@ def main():          template = tmplf.read()      ie_htmls = [] -    for ie in sorted(youtube_dl.gen_extractors(), key=lambda i: i.IE_NAME.lower()): +    for ie in youtube_dl.list_extractors(age_limit=None):          ie_html = '<b>{}</b>'.format(ie.IE_NAME)          ie_desc = getattr(ie, 'IE_DESC', None)          if ie_desc is False: diff --git a/devscripts/make_supportedsites.py b/devscripts/make_supportedsites.py index 140010644..3df4385a6 100644 --- a/devscripts/make_supportedsites.py +++ b/devscripts/make_supportedsites.py @@ -23,12 +23,12 @@ def main():      def gen_ies_md(ies):          for ie in ies: -            ie_md = '**{}**'.format(ie.IE_NAME) +            ie_md = '**{0}**'.format(ie.IE_NAME)              ie_desc = getattr(ie, 'IE_DESC', None)              if ie_desc is False:                  continue              if ie_desc is not None: -                ie_md += ': {}'.format(ie.IE_DESC) +                ie_md += ': {0}'.format(ie.IE_DESC)              if not ie.working():                  ie_md += ' (Currently broken)'              yield ie_md diff --git a/test/helper.py b/test/helper.py index 96d58b7c1..c416f388c 100644 --- a/test/helper.py +++ b/test/helper.py @@ -82,18 +82,8 @@ class FakeYDL(YoutubeDL):  def gettestcases(include_onlymatching=False):      for ie in youtube_dl.extractor.gen_extractors(): -        t = getattr(ie, '_TEST', None) -        if t: -            assert not hasattr(ie, '_TESTS'), \ -                '%s has _TEST and _TESTS' % type(ie).__name__ -            tests = [t] -        else: -            tests = getattr(ie, '_TESTS', []) -        for t in tests: -            if not include_onlymatching and t.get('only_matching', False): -                continue -            t['name'] = type(ie).__name__[:-len('IE')] -            yield t +        for tc in ie.get_testcases(include_onlymatching): +            yield tc  md5 = lambda s: hashlib.md5(s.encode('utf-8')).hexdigest() @@ -120,6 +110,20 @@ def expect_info_dict(self, got_dict, expected_dict):          else:              if isinstance(expected, compat_str) and expected.startswith('md5:'):                  got = 'md5:' + md5(got_dict.get(info_field)) +            elif isinstance(expected, compat_str) and expected.startswith('mincount:'): +                got = got_dict.get(info_field) +                self.assertTrue( +                    isinstance(got, list), +                    'Expected field %s to be a list, but it is of type %s' % ( +                        info_field, type(got).__name__)) +                expected_num = int(expected.partition(':')[2]) +                assertGreaterEqual( +                    self, len(got), expected_num, +                    'Expected %d items in field %s, but only got %d' % ( +                        expected_num, info_field, len(got) +                    ) +                ) +                continue              else:                  got = got_dict.get(info_field)              self.assertEqual(expected, got, diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index 13c18ed95..be8d12997 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -40,5 +40,23 @@ class TestInfoExtractor(unittest.TestCase):          self.assertEqual(ie._og_search_description(html), 'Some video\'s description ')          self.assertEqual(ie._og_search_thumbnail(html), 'http://domain.com/pic.jpg?key1=val1&key2=val2') +    def test_html_search_meta(self): +        ie = self.ie +        html = ''' +            <meta name="a" content="1" /> +            <meta name='b' content='2'> +            <meta name="c" content='3'> +            <meta name=d content='4'> +            <meta property="e" content='5' > +            <meta content="6" name="f"> +        ''' + +        self.assertEqual(ie._html_search_meta('a', html), '1') +        self.assertEqual(ie._html_search_meta('b', html), '2') +        self.assertEqual(ie._html_search_meta('c', html), '3') +        self.assertEqual(ie._html_search_meta('d', html), '4') +        self.assertEqual(ie._html_search_meta('e', html), '5') +        self.assertEqual(ie._html_search_meta('f', html), '6') +  if __name__ == '__main__':      unittest.main() diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index f8e4f930e..85d87f2c3 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -8,6 +8,8 @@ import sys  import unittest  sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +import copy +  from test.helper import FakeYDL, assertRegexpMatches  from youtube_dl import YoutubeDL  from youtube_dl.extractor import YoutubeIE @@ -192,6 +194,37 @@ class TestFormatSelection(unittest.TestCase):          downloaded = ydl.downloaded_info_dicts[0]          self.assertEqual(downloaded['format_id'], 'vid-high') +    def test_format_selection_audio_exts(self): +        formats = [ +            {'format_id': 'mp3-64', 'ext': 'mp3', 'abr': 64, 'url': 'http://_', 'vcodec': 'none'}, +            {'format_id': 'ogg-64', 'ext': 'ogg', 'abr': 64, 'url': 'http://_', 'vcodec': 'none'}, +            {'format_id': 'aac-64', 'ext': 'aac', 'abr': 64, 'url': 'http://_', 'vcodec': 'none'}, +            {'format_id': 'mp3-32', 'ext': 'mp3', 'abr': 32, 'url': 'http://_', 'vcodec': 'none'}, +            {'format_id': 'aac-32', 'ext': 'aac', 'abr': 32, 'url': 'http://_', 'vcodec': 'none'}, +        ] + +        info_dict = _make_result(formats) +        ydl = YDL({'format': 'best'}) +        ie = YoutubeIE(ydl) +        ie._sort_formats(info_dict['formats']) +        ydl.process_ie_result(copy.deepcopy(info_dict)) +        downloaded = ydl.downloaded_info_dicts[0] +        self.assertEqual(downloaded['format_id'], 'aac-64') + +        ydl = YDL({'format': 'mp3'}) +        ie = YoutubeIE(ydl) +        ie._sort_formats(info_dict['formats']) +        ydl.process_ie_result(copy.deepcopy(info_dict)) +        downloaded = ydl.downloaded_info_dicts[0] +        self.assertEqual(downloaded['format_id'], 'mp3-64') + +        ydl = YDL({'prefer_free_formats': True}) +        ie = YoutubeIE(ydl) +        ie._sort_formats(info_dict['formats']) +        ydl.process_ie_result(copy.deepcopy(info_dict)) +        downloaded = ydl.downloaded_info_dicts[0] +        self.assertEqual(downloaded['format_id'], 'ogg-64') +      def test_format_selection_video(self):          formats = [              {'format_id': 'dash-video-low', 'ext': 'mp4', 'preference': 1, 'acodec': 'none', 'url': '_'}, @@ -218,7 +251,7 @@ class TestFormatSelection(unittest.TestCase):              # 3D              '85', '84', '102', '83', '101', '82', '100',              # Dash video -            '138', '137', '248', '136', '247', '135', '246', +            '137', '248', '136', '247', '135', '246',              '245', '244', '134', '243', '133', '242', '160',              # Dash audio              '141', '172', '140', '171', '139', diff --git a/test/test_age_restriction.py b/test/test_age_restriction.py index 5be065c43..6f5513faa 100644 --- a/test/test_age_restriction.py +++ b/test/test_age_restriction.py @@ -45,11 +45,6 @@ class TestAgeRestriction(unittest.TestCase):              'http://www.youporn.com/watch/505835/sex-ed-is-it-safe-to-masturbate-daily/',              '505835.mp4', 2, old_age=25) -    def test_pornotube(self): -        self._assert_restricted( -            'http://pornotube.com/c/173/m/1689755/Marilyn-Monroe-Bathing', -            '1689755.flv', 13) -  if __name__ == '__main__':      unittest.main() diff --git a/test/test_subtitles.py b/test/test_subtitles.py index d34565191..6336dd317 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -17,6 +17,7 @@ from youtube_dl.extractor import (      TEDIE,      VimeoIE,      WallaIE, +    CeskaTelevizeIE,  ) @@ -317,5 +318,32 @@ class TestWallaSubtitles(BaseTestSubtitles):          self.assertEqual(len(subtitles), 0) +class TestCeskaTelevizeSubtitles(BaseTestSubtitles): +    url = 'http://www.ceskatelevize.cz/ivysilani/10600540290-u6-uzasny-svet-techniky' +    IE = CeskaTelevizeIE + +    def test_list_subtitles(self): +        self.DL.expect_warning('Automatic Captions not supported by this server') +        self.DL.params['listsubtitles'] = True +        info_dict = self.getInfoDict() +        self.assertEqual(info_dict, None) + +    def test_allsubtitles(self): +        self.DL.expect_warning('Automatic Captions not supported by this server') +        self.DL.params['writesubtitles'] = True +        self.DL.params['allsubtitles'] = True +        subtitles = self.getSubtitles() +        self.assertEqual(set(subtitles.keys()), set(['cs'])) +        self.assertEqual(md5(subtitles['cs']), '9bf52d9549533c32c427e264bf0847d4') + +    def test_nosubtitles(self): +        self.DL.expect_warning('video doesn\'t have subtitles') +        self.url = 'http://www.ceskatelevize.cz/ivysilani/ivysilani/10441294653-hyde-park-civilizace/214411058091220' +        self.DL.params['writesubtitles'] = True +        self.DL.params['allsubtitles'] = True +        subtitles = self.getSubtitles() +        self.assertEqual(len(subtitles), 0) + +  if __name__ == '__main__':      unittest.main() diff --git a/test/test_utils.py b/test/test_utils.py index dd49a6d17..16e1a1ddf 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -16,6 +16,7 @@ import json  import xml.etree.ElementTree  from youtube_dl.utils import ( +    age_restricted,      args_to_str,      clean_html,      DateRange, @@ -402,5 +403,12 @@ Trying to open render node...  Success at /dev/dri/renderD128.  ffmpeg version 2.4.4 Copyright (c) 2000-2014 the FFmpeg ...'''), '2.4.4') +    def test_age_restricted(self): +        self.assertFalse(age_restricted(None, 10))  # unrestricted content +        self.assertFalse(age_restricted(1, None))  # unrestricted policy +        self.assertFalse(age_restricted(8, 10)) +        self.assertTrue(age_restricted(18, 14)) +        self.assertFalse(age_restricted(18, 18)) +  if __name__ == '__main__':      unittest.main() diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index e2b823f66..61675d8ec 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -63,6 +63,7 @@ from .utils import (      YoutubeDLHandler,      prepend_extension,      args_to_str, +    age_restricted,  )  from .cache import Cache  from .extractor import get_info_extractor, gen_extractors @@ -202,6 +203,7 @@ class YoutubeDL(object):                         Progress hooks are guaranteed to be called at least once                         (with status "finished") if the download is successful. +    merge_output_format: Extension to use when merging formats.      The following parameters are not used by YoutubeDL itself, they are used by @@ -550,13 +552,8 @@ class YoutubeDL(object):              max_views = self.params.get('max_views')              if max_views is not None and view_count > max_views:                  return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views) -        age_limit = self.params.get('age_limit') -        if age_limit is not None: -            actual_age_limit = info_dict.get('age_limit') -            if actual_age_limit is None: -                actual_age_limit = 0 -            if age_limit < actual_age_limit: -                return 'Skipping "' + title + '" because it is age restricted' +        if age_restricted(info_dict.get('age_limit'), self.params.get('age_limit')): +            return 'Skipping "%s" because it is age restricted' % title          if self.in_download_archive(info_dict):              return '%s has already been recorded in archive' % video_title          return None @@ -790,7 +787,7 @@ class YoutubeDL(object):              if video_formats:                  return video_formats[0]          else: -            extensions = ['mp4', 'flv', 'webm', '3gp', 'm4a'] +            extensions = ['mp4', 'flv', 'webm', '3gp', 'm4a', 'mp3', 'ogg', 'aac', 'wav']              if format_spec in extensions:                  filter_f = lambda f: f['ext'] == format_spec              else: @@ -913,10 +910,23 @@ class YoutubeDL(object):                                                    'contain the video, try using '                                                    '"-f %s+%s"' % (format_2, format_1))                                  return +                            output_ext = ( +                                formats_info[0]['ext'] +                                if self.params.get('merge_output_format') is None +                                else self.params['merge_output_format'])                              selected_format = {                                  'requested_formats': formats_info,                                  'format': rf,                                  'ext': formats_info[0]['ext'], +                                'width': formats_info[0].get('width'), +                                'height': formats_info[0].get('height'), +                                'resolution': formats_info[0].get('resolution'), +                                'fps': formats_info[0].get('fps'), +                                'vcodec': formats_info[0].get('vcodec'), +                                'vbr': formats_info[0].get('vbr'), +                                'acodec': formats_info[1].get('acodec'), +                                'abr': formats_info[1].get('abr'), +                                'ext': output_ext,                              }                          else:                              selected_format = None @@ -1333,7 +1343,9 @@ class YoutubeDL(object):          formats = info_dict.get('formats', [info_dict])          idlen = max(len('format code'),                      max(len(f['format_id']) for f in formats)) -        formats_s = [line(f, idlen) for f in formats] +        formats_s = [ +            line(f, idlen) for f in formats +            if f.get('preference') is None or f['preference'] >= -1000]          if len(formats) > 1:              formats_s[0] += (' ' if self._format_note(formats[0]) else '') + '(worst)'              formats_s[-1] += (' ' if self._format_note(formats[-1]) else '') + '(best)' diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index e79320323..8e7b74466 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -38,7 +38,7 @@ from .update import update_self  from .downloader import (      FileDownloader,  ) -from .extractor import gen_extractors +from .extractor import gen_extractors, list_extractors  from .YoutubeDL import YoutubeDL @@ -95,24 +95,22 @@ def _real_main(argv=None):      _enc = preferredencoding()      all_urls = [url.decode(_enc, 'ignore') if isinstance(url, bytes) else url for url in all_urls] -    extractors = gen_extractors() -      if opts.list_extractors: -        for ie in sorted(extractors, key=lambda ie: ie.IE_NAME.lower()): +        for ie in list_extractors(opts.age_limit):              compat_print(ie.IE_NAME + (' (CURRENTLY BROKEN)' if not ie._WORKING else ''))              matchedUrls = [url for url in all_urls if ie.suitable(url)]              for mu in matchedUrls:                  compat_print('  ' + mu)          sys.exit(0)      if opts.list_extractor_descriptions: -        for ie in sorted(extractors, key=lambda ie: ie.IE_NAME.lower()): +        for ie in list_extractors(opts.age_limit):              if not ie._WORKING:                  continue              desc = getattr(ie, 'IE_DESC', ie.IE_NAME)              if desc is False:                  continue              if hasattr(ie, 'SEARCH_KEY'): -                _SEARCHES = ('cute kittens', 'slithering pythons', 'falling cat', 'angry poodle', 'purple fish', 'running tortoise', 'sleeping bunny') +                _SEARCHES = ('cute kittens', 'slithering pythons', 'falling cat', 'angry poodle', 'purple fish', 'running tortoise', 'sleeping bunny', 'burping cow')                  _COUNTS = ('', '5', '10', 'all')                  desc += ' (Example: "%s%s:%s" )' % (ie.SEARCH_KEY, random.choice(_COUNTS), random.choice(_SEARCHES))              compat_print(desc) @@ -168,6 +166,7 @@ def _real_main(argv=None):      if opts.recodevideo is not None:          if opts.recodevideo not in ['mp4', 'flv', 'webm', 'ogg', 'mkv']:              parser.error('invalid video recode format specified') +      if opts.date is not None:          date = DateRange.day(opts.date)      else: @@ -199,7 +198,8 @@ def _real_main(argv=None):                       ' file! Use "{0}.%(ext)s" instead of "{0}" as the output'                       ' template'.format(outtmpl)) -    any_printing = opts.geturl or opts.gettitle or opts.getid or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat or opts.getduration or opts.dumpjson or opts.dump_single_json +    any_getting = opts.geturl or opts.gettitle or opts.getid or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat or opts.getduration or opts.dumpjson or opts.dump_single_json +    any_printing = opts.print_json      download_archive_fn = compat_expanduser(opts.download_archive) if opts.download_archive is not None else opts.download_archive      # PostProcessors @@ -245,7 +245,7 @@ def _real_main(argv=None):          'password': opts.password,          'twofactor': opts.twofactor,          'videopassword': opts.videopassword, -        'quiet': (opts.quiet or any_printing), +        'quiet': (opts.quiet or any_getting or any_printing),          'no_warnings': opts.no_warnings,          'forceurl': opts.geturl,          'forcetitle': opts.gettitle, @@ -255,9 +255,9 @@ def _real_main(argv=None):          'forceduration': opts.getduration,          'forcefilename': opts.getfilename,          'forceformat': opts.getformat, -        'forcejson': opts.dumpjson, +        'forcejson': opts.dumpjson or opts.print_json,          'dump_single_json': opts.dump_single_json, -        'simulate': opts.simulate or any_printing, +        'simulate': opts.simulate or any_getting,          'skip_download': opts.skip_download,          'format': opts.format,          'format_limit': opts.format_limit, @@ -324,6 +324,7 @@ def _real_main(argv=None):          'encoding': opts.encoding,          'exec_cmd': opts.exec_cmd,          'extract_flat': opts.extract_flat, +        'merge_output_format': opts.merge_output_format,          'postprocessors': postprocessors,      } @@ -365,3 +366,5 @@ def main(argv=None):          sys.exit('ERROR: fixed output name but more than one file to download')      except KeyboardInterrupt:          sys.exit('\nERROR: Interrupted by user') + +__all__ = ['main', 'YoutubeDL', 'gen_extractors', 'list_extractors'] diff --git a/youtube_dl/downloader/f4m.py b/youtube_dl/downloader/f4m.py index f9f6f3e73..c460c167a 100644 --- a/youtube_dl/downloader/f4m.py +++ b/youtube_dl/downloader/f4m.py @@ -187,24 +187,34 @@ def build_fragments_list(boot_info):      return res -def write_flv_header(stream, metadata): -    """Writes the FLV header and the metadata to stream""" +def write_unsigned_int(stream, val): +    stream.write(struct_pack('!I', val)) + + +def write_unsigned_int_24(stream, val): +    stream.write(struct_pack('!I', val)[1:]) + + +def write_flv_header(stream): +    """Writes the FLV header to stream"""      # FLV header      stream.write(b'FLV\x01')      stream.write(b'\x05')      stream.write(b'\x00\x00\x00\x09') -    # FLV File body      stream.write(b'\x00\x00\x00\x00') -    # FLVTAG -    # Script data -    stream.write(b'\x12') -    # Size of the metadata with 3 bytes -    stream.write(struct_pack('!L', len(metadata))[1:]) -    stream.write(b'\x00\x00\x00\x00\x00\x00\x00') -    stream.write(metadata) -    # Magic numbers extracted from the output files produced by AdobeHDS.php -    # (https://github.com/K-S-V/Scripts) -    stream.write(b'\x00\x00\x01\x73') + + +def write_metadata_tag(stream, metadata): +    """Writes optional metadata tag to stream""" +    SCRIPT_TAG = b'\x12' +    FLV_TAG_HEADER_LEN = 11 + +    if metadata: +        stream.write(SCRIPT_TAG) +        write_unsigned_int_24(stream, len(metadata)) +        stream.write(b'\x00\x00\x00\x00\x00\x00\x00') +        stream.write(metadata) +        write_unsigned_int(stream, FLV_TAG_HEADER_LEN + len(metadata))  def _add_ns(prop): @@ -256,7 +266,11 @@ class F4mFD(FileDownloader):              bootstrap = self.ydl.urlopen(bootstrap_url).read()          else:              bootstrap = base64.b64decode(bootstrap_node.text) -        metadata = base64.b64decode(media.find(_add_ns('metadata')).text) +        metadata_node = media.find(_add_ns('metadata')) +        if metadata_node is not None: +            metadata = base64.b64decode(metadata_node.text) +        else: +            metadata = None          boot_info = read_bootstrap_info(bootstrap)          fragments_list = build_fragments_list(boot_info) @@ -269,7 +283,8 @@ class F4mFD(FileDownloader):          tmpfilename = self.temp_name(filename)          (dest_stream, tmpfilename) = sanitize_open(tmpfilename, 'wb') -        write_flv_header(dest_stream, metadata) +        write_flv_header(dest_stream) +        write_metadata_tag(dest_stream, metadata)          # This dict stores the download progress, it's updated by the progress          # hook diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py index 5bb0f3cfd..aa58b52ab 100644 --- a/youtube_dl/downloader/hls.py +++ b/youtube_dl/downloader/hls.py @@ -11,7 +11,6 @@ from ..compat import (      compat_urllib_request,  )  from ..utils import ( -    check_executable,      encodeFilename,  ) @@ -27,16 +26,13 @@ class HlsFD(FileDownloader):              '-bsf:a', 'aac_adtstoasc',              encodeFilename(tmpfilename, for_subprocess=True)] -        for program in ['avconv', 'ffmpeg']: -            if check_executable(program, ['-version']): -                break -        else: +        ffpp = FFmpegPostProcessor(downloader=self) +        program = ffpp._executable +        if program is None:              self.report_error('m3u8 download detected but ffmpeg or avconv could not be found. Please install one.')              return False -        cmd = [program] + args - -        ffpp = FFmpegPostProcessor(downloader=self)          ffpp.check_version() +        cmd = [program] + args          retval = subprocess.call(cmd)          if retval == 0: diff --git a/youtube_dl/downloader/mplayer.py b/youtube_dl/downloader/mplayer.py index c53195da0..72cef30ea 100644 --- a/youtube_dl/downloader/mplayer.py +++ b/youtube_dl/downloader/mplayer.py @@ -4,8 +4,8 @@ import os  import subprocess  from .common import FileDownloader -from ..compat import compat_subprocess_get_DEVNULL  from ..utils import ( +    check_executable,      encodeFilename,  ) @@ -20,11 +20,7 @@ class MplayerFD(FileDownloader):              'mplayer', '-really-quiet', '-vo', 'null', '-vc', 'dummy',              '-dumpstream', '-dumpfile', tmpfilename, url]          # Check for mplayer first -        try: -            subprocess.call( -                ['mplayer', '-h'], -                stdout=compat_subprocess_get_DEVNULL(), stderr=subprocess.STDOUT) -        except (OSError, IOError): +        if not check_executable('mplayer', ['-h']):              self.report_error('MMS or RTSP download detected but "%s" could not be run' % args[0])              return False diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index e4c51f238..f544e87f1 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -71,6 +71,7 @@ from .cnn import (  from .collegehumor import CollegeHumorIE  from .comedycentral import ComedyCentralIE, ComedyCentralShowsIE  from .comcarcoff import ComCarCoffIE +from .commonmistakes import CommonMistakesIE  from .condenast import CondeNastIE  from .cracked import CrackedIE  from .criterion import CriterionIE @@ -158,6 +159,7 @@ from .gametrailers import GametrailersIE  from .gdcvault import GDCVaultIE  from .generic import GenericIE  from .giantbomb import GiantBombIE +from .giga import GigaIE  from .glide import GlideIE  from .globo import GloboIE  from .godtube import GodTubeIE @@ -272,6 +274,7 @@ from .nbc import (  )  from .ndr import NDRIE  from .ndtv import NDTVIE +from .netzkino import NetzkinoIE  from .nerdcubed import NerdCubedFeedIE  from .newgrounds import NewgroundsIE  from .newstube import NewstubeIE @@ -324,6 +327,7 @@ from .prosiebensat1 import ProSiebenSat1IE  from .pyvideo import PyvideoIE  from .quickvid import QuickVidIE  from .radiode import RadioDeIE +from .radiobremen import RadioBremenIE  from .radiofrance import RadioFranceIE  from .rai import RaiIE  from .rbmaradio import RBMARadioIE @@ -344,6 +348,7 @@ from .ruhd import RUHDIE  from .rutube import (      RutubeIE,      RutubeChannelIE, +    RutubeEmbedIE,      RutubeMovieIE,      RutubePersonIE,  ) @@ -473,6 +478,7 @@ from .videott import VideoTtIE  from .videoweed import VideoWeedIE  from .vidme import VidmeIE  from .vidzi import VidziIE +from .vier import VierIE, VierVideosIE  from .vimeo import (      VimeoIE,      VimeoAlbumIE, @@ -508,6 +514,7 @@ from .wdr import (      WDRMobileIE,      WDRMausIE,  ) +from .webofstories import WebOfStoriesIE  from .weibo import WeiboIE  from .wimp import WimpIE  from .wistia import WistiaIE @@ -543,7 +550,7 @@ from .youtube import (      YoutubeSearchURLIE,      YoutubeShowIE,      YoutubeSubscriptionsIE, -    YoutubeTopListIE, +    YoutubeTruncatedIDIE,      YoutubeTruncatedURLIE,      YoutubeUserIE,      YoutubeWatchLaterIE, @@ -569,6 +576,17 @@ def gen_extractors():      return [klass() for klass in _ALL_CLASSES] +def list_extractors(age_limit): +    """ +    Return a list of extractors that are suitable for the given age, +    sorted by extractor ID. +    """ + +    return sorted( +        filter(lambda ie: ie.is_suitable(age_limit), gen_extractors()), +        key=lambda ie: ie.IE_NAME.lower()) + +  def get_info_extractor(ie_name):      """Returns the info extractor class with the given ie_name"""      return globals()[ie_name + 'IE'] diff --git a/youtube_dl/extractor/auengine.py b/youtube_dl/extractor/auengine.py index 014a21952..a1b666be0 100644 --- a/youtube_dl/extractor/auengine.py +++ b/youtube_dl/extractor/auengine.py @@ -7,6 +7,7 @@ from ..compat import compat_urllib_parse  from ..utils import (      determine_ext,      ExtractorError, +    remove_end,  ) @@ -27,23 +28,18 @@ class AUEngineIE(InfoExtractor):          video_id = self._match_id(url)          webpage = self._download_webpage(url, video_id) -        title = self._html_search_regex(r'<title>(?P<title>.+?)</title>', webpage, 'title') -        title = title.strip() -        links = re.findall(r'\s(?:file|url):\s*["\']([^\'"]+)["\']', webpage) -        links = map(compat_urllib_parse.unquote, links) - -        thumbnail = None -        video_url = None -        for link in links: -            if link.endswith('.png'): -                thumbnail = link -            elif '/videos/' in link: -                video_url = link +        title = self._html_search_regex( +            r'<title>\s*(?P<title>.+?)\s*</title>', webpage, 'title') +        video_urls = re.findall(r'http://\w+.auengine.com/vod/.*[^\W]', webpage) +        video_url = compat_urllib_parse.unquote(video_urls[0]) +        thumbnails = re.findall(r'http://\w+.auengine.com/thumb/.*[^\W]', webpage) +        thumbnail = compat_urllib_parse.unquote(thumbnails[0]) +          if not video_url:              raise ExtractorError('Could not find video URL') +          ext = '.' + determine_ext(video_url) -        if ext == title[-len(ext):]: -            title = title[:-len(ext)] +        title = remove_end(title, ext)          return {              'id': video_id, diff --git a/youtube_dl/extractor/bbccouk.py b/youtube_dl/extractor/bbccouk.py index f690dc803..1cf48fe0d 100644 --- a/youtube_dl/extractor/bbccouk.py +++ b/youtube_dl/extractor/bbccouk.py @@ -10,7 +10,7 @@ from ..compat import compat_HTTPError  class BBCCoUkIE(SubtitlesInfoExtractor):      IE_NAME = 'bbc.co.uk'      IE_DESC = 'BBC iPlayer' -    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:programmes|iplayer/episode)/(?P<id>[\da-z]{8})' +    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:(?:(?:programmes|iplayer/(?:episode|playlist))/)|music/clips[/#])(?P<id>[\da-z]{8})'      _TESTS = [          { @@ -18,8 +18,8 @@ class BBCCoUkIE(SubtitlesInfoExtractor):              'info_dict': {                  'id': 'b039d07m',                  'ext': 'flv', -                'title': 'Kaleidoscope: Leonard Cohen', -                'description': 'md5:db4755d7a665ae72343779f7dacb402c', +                'title': 'Kaleidoscope, Leonard Cohen', +                'description': 'The Canadian poet and songwriter reflects on his musical career.',                  'duration': 1740,              },              'params': { @@ -84,6 +84,40 @@ class BBCCoUkIE(SubtitlesInfoExtractor):                  # rtmp download                  'skip_download': True,              } +        }, { +            'url': 'http://www.bbc.co.uk/music/clips/p02frcc3', +            'note': 'Audio', +            'info_dict': { +                'id': 'p02frcch', +                'ext': 'flv', +                'title': 'Pete Tong, Past, Present and Future Special, Madeon - After Hours mix', +                'description': 'French house superstar Madeon takes us out of the club and onto the after party.', +                'duration': 3507, +            }, +            'params': { +                # rtmp download +                'skip_download': True, +            } +        }, { +            'url': 'http://www.bbc.co.uk/music/clips/p025c0zz', +            'note': 'Video', +            'info_dict': { +                'id': 'p025c103', +                'ext': 'flv', +                'title': 'Reading and Leeds Festival, 2014, Rae Morris - Closer (Live on BBC Three)', +                'description': 'Rae Morris performs Closer for BBC Three at Reading 2014', +                'duration': 226, +            }, +            'params': { +                # rtmp download +                'skip_download': True, +            } +        }, { +            'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4', +            'only_matching': True, +        }, { +            'url': 'http://www.bbc.co.uk/music/clips#p02frcc3', +            'only_matching': True,          }      ] @@ -241,8 +275,8 @@ class BBCCoUkIE(SubtitlesInfoExtractor):          # fallback to legacy playlist          playlist = self._download_xml( -                'http://www.bbc.co.uk/iplayer/playlist/%s' % playlist_id, -                playlist_id, 'Downloading legacy playlist XML') +            'http://www.bbc.co.uk/iplayer/playlist/%s' % playlist_id, +            playlist_id, 'Downloading legacy playlist XML')          no_items = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}noItems')          if no_items is not None: diff --git a/youtube_dl/extractor/bet.py b/youtube_dl/extractor/bet.py index 003e50002..d2abd4d77 100644 --- a/youtube_dl/extractor/bet.py +++ b/youtube_dl/extractor/bet.py @@ -16,7 +16,7 @@ class BetIE(InfoExtractor):          {              'url': 'http://www.bet.com/news/politics/2014/12/08/in-bet-exclusive-obama-talks-race-and-racism.html',              'info_dict': { -                'id': '417cd61c-c793-4e8e-b006-e445ecc45add', +                'id': '740ab250-bb94-4a8a-8787-fe0de7c74471',                  'display_id': 'in-bet-exclusive-obama-talks-race-and-racism',                  'ext': 'flv',                  'title': 'BET News Presents: A Conversation With President Obama', @@ -35,7 +35,7 @@ class BetIE(InfoExtractor):          {              'url': 'http://www.bet.com/video/news/national/2014/justice-for-ferguson-a-community-reacts.html',              'info_dict': { -                'id': '4160e53b-ad41-43b1-980f-8d85f63121f4', +                'id': 'bcd1b1df-673a-42cf-8d01-b282db608f2d',                  'display_id': 'justice-for-ferguson-a-community-reacts',                  'ext': 'flv',                  'title': 'Justice for Ferguson: A Community Reacts', @@ -55,7 +55,6 @@ class BetIE(InfoExtractor):      def _real_extract(self, url):          display_id = self._match_id(url) -          webpage = self._download_webpage(url, display_id)          media_url = compat_urllib_parse.unquote(self._search_regex( diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index 241b904a9..75d744852 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -4,9 +4,7 @@ from __future__ import unicode_literals  import re  from .common import InfoExtractor -from ..compat import compat_parse_qs  from ..utils import ( -    ExtractorError,      int_or_none,      unified_strdate,  ) @@ -54,45 +52,38 @@ class BiliBiliIE(InfoExtractor):          thumbnail = self._html_search_meta(              'thumbnailUrl', video_code, 'thumbnail', fatal=False) -        player_params = compat_parse_qs(self._html_search_regex( -            r'<iframe .*?class="player" src="https://secure\.bilibili\.(?:tv|com)/secure,([^"]+)"', -            webpage, 'player params')) +        cid = self._search_regex(r'cid=(\d+)', webpage, 'cid') -        if 'cid' in player_params: -            cid = player_params['cid'][0] +        lq_doc = self._download_xml( +            'http://interface.bilibili.com/v_cdn_play?appkey=1&cid=%s' % cid, +            video_id, +            note='Downloading LQ video info' +        ) +        lq_durl = lq_doc.find('./durl') +        formats = [{ +            'format_id': 'lq', +            'quality': 1, +            'url': lq_durl.find('./url').text, +            'filesize': int_or_none( +                lq_durl.find('./size'), get_attr='text'), +        }] -            lq_doc = self._download_xml( -                'http://interface.bilibili.cn/v_cdn_play?cid=%s' % cid, -                video_id, -                note='Downloading LQ video info' -            ) -            lq_durl = lq_doc.find('.//durl') -            formats = [{ -                'format_id': 'lq', -                'quality': 1, -                'url': lq_durl.find('./url').text, +        hq_doc = self._download_xml( +            'http://interface.bilibili.com/playurl?appkey=1&cid=%s' % cid, +            video_id, +            note='Downloading HQ video info', +            fatal=False, +        ) +        if hq_doc is not False: +            hq_durl = hq_doc.find('./durl') +            formats.append({ +                'format_id': 'hq', +                'quality': 2, +                'ext': 'flv', +                'url': hq_durl.find('./url').text,                  'filesize': int_or_none( -                    lq_durl.find('./size'), get_attr='text'), -            }] - -            hq_doc = self._download_xml( -                'http://interface.bilibili.cn/playurl?cid=%s' % cid, -                video_id, -                note='Downloading HQ video info', -                fatal=False, -            ) -            if hq_doc is not False: -                hq_durl = hq_doc.find('.//durl') -                formats.append({ -                    'format_id': 'hq', -                    'quality': 2, -                    'ext': 'flv', -                    'url': hq_durl.find('./url').text, -                    'filesize': int_or_none( -                        hq_durl.find('./size'), get_attr='text'), -                }) -        else: -            raise ExtractorError('Unsupported player parameters: %r' % (player_params,)) +                    hq_durl.find('./size'), get_attr='text'), +            })          self._sort_formats(formats)          return { diff --git a/youtube_dl/extractor/buzzfeed.py b/youtube_dl/extractor/buzzfeed.py index a40a1bbc4..a5d2af174 100644 --- a/youtube_dl/extractor/buzzfeed.py +++ b/youtube_dl/extractor/buzzfeed.py @@ -33,7 +33,7 @@ class BuzzFeedIE(InfoExtractor):              'skip_download': True,  # Got enough YouTube download tests          },          'info_dict': { -            'description': 'Munchkin the Teddy Bear is back !', +            'description': 're:Munchkin the Teddy Bear is back ?!',              'title': 'You Need To Stop What You\'re Doing And Watching This Dog Walk On A Treadmill',          },          'playlist': [{ @@ -42,9 +42,9 @@ class BuzzFeedIE(InfoExtractor):                  'ext': 'mp4',                  'upload_date': '20141124',                  'uploader_id': 'CindysMunchkin', -                'description': '© 2014 Munchkin the Shih Tzu\nAll rights reserved\nFacebook: http://facebook.com/MunchkintheShihTzu', +                'description': 're:© 2014 Munchkin the Shih Tzu',                  'uploader': 'Munchkin the Shih Tzu', -                'title': 'Munchkin the Teddy Bear gets her exercise', +                'title': 're:Munchkin the Teddy Bear gets her exercise',              },          }]      }] diff --git a/youtube_dl/extractor/canalplus.py b/youtube_dl/extractor/canalplus.py index 9873728df..11d18d74a 100644 --- a/youtube_dl/extractor/canalplus.py +++ b/youtube_dl/extractor/canalplus.py @@ -5,6 +5,8 @@ import re  from .common import InfoExtractor  from ..utils import ( +    ExtractorError, +    HEADRequest,      unified_strdate,      url_basename,      qualities, @@ -76,6 +78,16 @@ class CanalplusIE(InfoExtractor):          preference = qualities(['MOBILE', 'BAS_DEBIT', 'HAUT_DEBIT', 'HD', 'HLS', 'HDS']) +        fmt_url = next(iter(media.find('VIDEOS'))).text +        if '/geo' in fmt_url.lower(): +            response = self._request_webpage( +                HEADRequest(fmt_url), video_id, +                'Checking if the video is georestricted') +            if '/blocage' in response.geturl(): +                raise ExtractorError( +                    'The video is not available in your country', +                    expected=True) +          formats = []          for fmt in media.find('VIDEOS'):              format_url = fmt.text diff --git a/youtube_dl/extractor/ceskatelevize.py b/youtube_dl/extractor/ceskatelevize.py index 2f866f3ef..f70e090bb 100644 --- a/youtube_dl/extractor/ceskatelevize.py +++ b/youtube_dl/extractor/ceskatelevize.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals  import re -from .common import InfoExtractor +from .subtitles import SubtitlesInfoExtractor  from ..compat import (      compat_urllib_request,      compat_urllib_parse, @@ -11,49 +11,42 @@ from ..compat import (  )  from ..utils import (      ExtractorError, +    float_or_none,  ) -class CeskaTelevizeIE(InfoExtractor): +class CeskaTelevizeIE(SubtitlesInfoExtractor):      _VALID_URL = r'https?://www\.ceskatelevize\.cz/(porady|ivysilani)/(.+/)?(?P<id>[^?#]+)'      _TESTS = [          { -            'url': 'http://www.ceskatelevize.cz/ivysilani/10532695142-prvni-republika/213512120230004-spanelska-chripka', +            'url': 'http://www.ceskatelevize.cz/ivysilani/ivysilani/10441294653-hyde-park-civilizace/214411058091220',              'info_dict': { -                'id': '213512120230004', -                'ext': 'flv', -                'title': 'První republika: Španělská chřipka', -                'duration': 3107.4, +                'id': '214411058091220', +                'ext': 'mp4', +                'title': 'Hyde Park Civilizace', +                'description': 'Věda a současná civilizace. Interaktivní pořad - prostor pro vaše otázky a komentáře', +                'thumbnail': 're:^https?://.*\.jpg', +                'duration': 3350,              },              'params': { -                'skip_download': True,  # requires rtmpdump +                # m3u8 download +                'skip_download': True,              }, -            'skip': 'Works only from Czech Republic.', -        }, -        { -            'url': 'http://www.ceskatelevize.cz/ivysilani/1030584952-tsatsiki-maminka-a-policajt', -            'info_dict': { -                'id': '20138143440', -                'ext': 'flv', -                'title': 'Tsatsiki, maminka a policajt', -                'duration': 6754.1, -            }, -            'params': { -                'skip_download': True,  # requires rtmpdump -            }, -            'skip': 'Works only from Czech Republic.',          },          {              'url': 'http://www.ceskatelevize.cz/ivysilani/10532695142-prvni-republika/bonus/14716-zpevacka-z-duparny-bobina',              'info_dict': {                  'id': '14716', -                'ext': 'flv', +                'ext': 'mp4',                  'title': 'První republika: Zpěvačka z Dupárny Bobina', -                'duration': 90, +                'description': 'Sága mapující atmosféru první republiky od r. 1918 do r. 1945.', +                'thumbnail': 're:^https?://.*\.jpg', +                'duration': 88.4,              },              'params': { -                'skip_download': True,  # requires rtmpdump +                # m3u8 download +                'skip_download': True,              },          },      ] @@ -80,8 +73,9 @@ class CeskaTelevizeIE(InfoExtractor):              'requestSource': 'iVysilani',          } -        req = compat_urllib_request.Request('http://www.ceskatelevize.cz/ivysilani/ajax/get-playlist-url', -                                            data=compat_urllib_parse.urlencode(data)) +        req = compat_urllib_request.Request( +            'http://www.ceskatelevize.cz/ivysilani/ajax/get-client-playlist', +            data=compat_urllib_parse.urlencode(data))          req.add_header('Content-type', 'application/x-www-form-urlencoded')          req.add_header('x-addr', '127.0.0.1') @@ -90,39 +84,72 @@ class CeskaTelevizeIE(InfoExtractor):          playlistpage = self._download_json(req, video_id) -        req = compat_urllib_request.Request(compat_urllib_parse.unquote(playlistpage['url'])) +        playlist_url = playlistpage['url'] +        if playlist_url == 'error_region': +            raise ExtractorError(NOT_AVAILABLE_STRING, expected=True) + +        req = compat_urllib_request.Request(compat_urllib_parse.unquote(playlist_url))          req.add_header('Referer', url) -        playlist = self._download_xml(req, video_id) +        playlist = self._download_json(req, video_id) +        item = playlist['playlist'][0]          formats = [] -        for i in playlist.find('smilRoot/body'): -            if 'AD' not in i.attrib['id']: -                base_url = i.attrib['base'] -                parsedurl = compat_urllib_parse_urlparse(base_url) -                duration = i.attrib['duration'] - -                for video in i.findall('video'): -                    if video.attrib['label'] != 'AD': -                        format_id = video.attrib['label'] -                        play_path = video.attrib['src'] -                        vbr = int(video.attrib['system-bitrate']) - -                        formats.append({ -                            'format_id': format_id, -                            'url': base_url, -                            'vbr': vbr, -                            'play_path': play_path, -                            'app': parsedurl.path[1:] + '?' + parsedurl.query, -                            'rtmp_live': True, -                            'ext': 'flv', -                        }) - +        for format_id, stream_url in item['streamUrls'].items(): +            formats.extend(self._extract_m3u8_formats(stream_url, video_id, 'mp4'))          self._sort_formats(formats) +        title = self._og_search_title(webpage) +        description = self._og_search_description(webpage) +        duration = float_or_none(item.get('duration')) +        thumbnail = item.get('previewImageUrl') + +        subtitles = {} +        subs = item.get('subtitles') +        if subs: +            subtitles['cs'] = subs[0]['url'] + +        if self._downloader.params.get('listsubtitles', False): +            self._list_available_subtitles(video_id, subtitles) +            return + +        subtitles = self._fix_subtitles(self.extract_subtitles(video_id, subtitles)) +          return {              'id': episode_id, -            'title': self._html_search_regex(r'<title>(.+?) — iVysílání — Česká televize</title>', webpage, 'title'), -            'duration': float(duration), +            'title': title, +            'description': description, +            'thumbnail': thumbnail, +            'duration': duration,              'formats': formats, +            'subtitles': subtitles,          } + +    @staticmethod +    def _fix_subtitles(subtitles): +        """ Convert millisecond-based subtitles to SRT """ +        if subtitles is None: +            return subtitles  # subtitles not requested + +        def _msectotimecode(msec): +            """ Helper utility to convert milliseconds to timecode """ +            components = [] +            for divider in [1000, 60, 60, 100]: +                components.append(msec % divider) +                msec //= divider +            return "{3:02}:{2:02}:{1:02},{0:03}".format(*components) + +        def _fix_subtitle(subtitle): +            for line in subtitle.splitlines(): +                m = re.match(r"^\s*([0-9]+);\s*([0-9]+)\s+([0-9]+)\s*$", line) +                if m: +                    yield m.group(1) +                    start, stop = (_msectotimecode(int(t)) for t in m.groups()[1:]) +                    yield "{0} --> {1}".format(start, stop) +                else: +                    yield line + +        fixed_subtitles = {} +        for k, v in subtitles.items(): +            fixed_subtitles[k] = "\r\n".join(_fix_subtitle(v)) +        return fixed_subtitles diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 6e264f687..b4cd59e43 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -21,6 +21,7 @@ from ..compat import (      compat_str,  )  from ..utils import ( +    age_restricted,      clean_html,      compiled_regex_type,      ExtractorError, @@ -92,6 +93,8 @@ class InfoExtractor(object):                                   by this field, regardless of all other values.                                   -1 for default (order by other properties),                                   -2 or smaller for less than default. +                                 < -1000 to hide the format (if there is +                                    another one which is strictly better)                      * language_preference  Is this in the correct requested                                   language?                                   10 if it's what the URL is about, @@ -144,6 +147,17 @@ class InfoExtractor(object):      like_count:     Number of positive ratings of the video      dislike_count:  Number of negative ratings of the video      comment_count:  Number of comments on the video +    comments:       A list of comments, each with one or more of the following +                    properties (all but one of text or html optional): +                        * "author" - human-readable name of the comment author +                        * "author_id" - user ID of the comment author +                        * "id" - Comment ID +                        * "html" - Comment as HTML +                        * "text" - Plain text of the comment +                        * "timestamp" - UNIX timestamp of comment +                        * "parent" - ID of the comment this one is replying to. +                                     Set to "root" to indicate that this is a +                                     comment to the original video.      age_limit:      Age restriction for the video, as an integer (years)      webpage_url:    The url to the video webpage, if given to youtube-dl it                      should allow to get the same result again. (It will be set @@ -591,7 +605,7 @@ class InfoExtractor(object):          return self._html_search_regex(              r'''(?isx)<meta                      (?=[^>]+(?:itemprop|name|property)=(["\']?)%s\1) -                    [^>]+content=(["\'])(?P<content>.*?)\1''' % re.escape(name), +                    [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(name),              html, display_name, fatal=fatal, group='content', **kwargs)      def _dc_search_uploader(self, html): @@ -875,6 +889,35 @@ class InfoExtractor(object):              None, '/', True, False, expire_time, '', None, None, None)          self._downloader.cookiejar.set_cookie(cookie) +    def get_testcases(self, include_onlymatching=False): +        t = getattr(self, '_TEST', None) +        if t: +            assert not hasattr(self, '_TESTS'), \ +                '%s has _TEST and _TESTS' % type(self).__name__ +            tests = [t] +        else: +            tests = getattr(self, '_TESTS', []) +        for t in tests: +            if not include_onlymatching and t.get('only_matching', False): +                continue +            t['name'] = type(self).__name__[:-len('IE')] +            yield t + +    def is_suitable(self, age_limit): +        """ Test whether the extractor is generally suitable for the given +        age limit (i.e. pornographic sites are not, all others usually are) """ + +        any_restricted = False +        for tc in self.get_testcases(include_onlymatching=False): +            if 'playlist' in tc: +                tc = tc['playlist'][0] +            is_restricted = age_restricted( +                tc.get('info_dict', {}).get('age_limit'), age_limit) +            if not is_restricted: +                return True +            any_restricted = any_restricted or is_restricted +        return not any_restricted +  class SearchInfoExtractor(InfoExtractor):      """ diff --git a/youtube_dl/extractor/commonmistakes.py b/youtube_dl/extractor/commonmistakes.py new file mode 100644 index 000000000..75c06903f --- /dev/null +++ b/youtube_dl/extractor/commonmistakes.py @@ -0,0 +1,29 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ExtractorError + + +class CommonMistakesIE(InfoExtractor): +    IE_DESC = False  # Do not list +    _VALID_URL = r'''(?x) +        (?:url|URL) +    ''' + +    _TESTS = [{ +        'url': 'url', +        'only_matching': True, +    }, { +        'url': 'URL', +        'only_matching': True, +    }] + +    def _real_extract(self, url): +        msg = ( +            'You\'ve asked youtube-dl to download the URL "%s". ' +            'That doesn\'t make any sense. ' +            'Simply remove the parameter in your command or configuration.' +        ) % url +        if self._downloader.params.get('verbose'): +            msg += ' Add -v to the command line to see what arguments and configuration youtube-dl got.' +        raise ExtractorError(msg, expected=True) diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 354046a9e..1680f532f 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -228,7 +228,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text          video_thumbnail = self._search_regex(r'<episode_image_url>([^<]+)', playerdata, 'thumbnail', fatal=False)          formats = [] -        for fmt in re.findall(r'\?p([0-9]{3,4})=1', webpage): +        for fmt in re.findall(r'showmedia\.([0-9]{3,4})p', webpage):              stream_quality, stream_format = self._FORMAT_IDS[fmt]              video_format = fmt + 'p'              streamdata_req = compat_urllib_request.Request('http://www.crunchyroll.com/xml/') diff --git a/youtube_dl/extractor/discovery.py b/youtube_dl/extractor/discovery.py index 52c2d7ddf..d3e667528 100644 --- a/youtube_dl/extractor/discovery.py +++ b/youtube_dl/extractor/discovery.py @@ -1,47 +1,45 @@  from __future__ import unicode_literals -import re -import json -  from .common import InfoExtractor +from ..utils import ( +    parse_iso8601, +    int_or_none, +)  class DiscoveryIE(InfoExtractor): -    _VALID_URL = r'http://www\.discovery\.com\/[a-zA-Z0-9\-]*/[a-zA-Z0-9\-]*/videos/(?P<id>[a-zA-Z0-9\-]*)(.htm)?' +    _VALID_URL = r'http://www\.discovery\.com\/[a-zA-Z0-9\-]*/[a-zA-Z0-9\-]*/videos/(?P<id>[a-zA-Z0-9_\-]*)(?:\.htm)?'      _TEST = {          'url': 'http://www.discovery.com/tv-shows/mythbusters/videos/mission-impossible-outtakes.htm', -        'md5': 'e12614f9ee303a6ccef415cb0793eba2', +        'md5': '3c69d77d9b0d82bfd5e5932a60f26504',          'info_dict': { -            'id': '614784', -            'ext': 'mp4', -            'title': 'MythBusters: Mission Impossible Outtakes', +            'id': 'mission-impossible-outtakes', +            'ext': 'flv', +            'title': 'Mission Impossible Outtakes',              'description': ('Watch Jamie Hyneman and Adam Savage practice being'                              ' each other -- to the point of confusing Jamie\'s dog -- and '                              'don\'t miss Adam moon-walking as Jamie ... behind Jamie\'s'                              ' back.'),              'duration': 156, +            'timestamp': 1303099200, +            'upload_date': '20110418',          },      }      def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        video_id = mobj.group('id') +        video_id = self._match_id(url)          webpage = self._download_webpage(url, video_id) -        video_list_json = self._search_regex(r'var videoListJSON = ({.*?});', -                                             webpage, 'video list', flags=re.DOTALL) -        video_list = json.loads(video_list_json) -        info = video_list['clips'][0] -        formats = [] -        for f in info['mp4']: -            formats.append( -                {'url': f['src'], 'ext': 'mp4', 'tbr': int(f['bitrate'][:-1])}) +        info = self._parse_json(self._search_regex( +            r'(?s)<script type="application/ld\+json">(.*?)</script>', +            webpage, 'video info'), video_id)          return { -            'id': info['contentId'], -            'title': video_list['name'], -            'formats': formats, -            'description': info['videoCaption'], -            'thumbnail': info.get('videoStillURL') or info.get('thumbnailURL'), -            'duration': info['duration'], +            'id': video_id, +            'title': info['name'], +            'url': info['contentURL'], +            'description': info.get('description'), +            'thumbnail': info.get('thumbnailUrl'), +            'timestamp': parse_iso8601(info.get('uploadDate')), +            'duration': int_or_none(info.get('duration')),          } diff --git a/youtube_dl/extractor/ellentv.py b/youtube_dl/extractor/ellentv.py index 3e7923648..fc92ff825 100644 --- a/youtube_dl/extractor/ellentv.py +++ b/youtube_dl/extractor/ellentv.py @@ -1,7 +1,6 @@  # coding: utf-8  from __future__ import unicode_literals -import re  import json  from .common import InfoExtractor @@ -12,32 +11,49 @@ from ..utils import (  class EllenTVIE(InfoExtractor): -    _VALID_URL = r'https?://(?:www\.)?ellentv\.com/videos/(?P<id>[a-z0-9_-]+)' -    _TEST = { +    _VALID_URL = r'https?://(?:www\.)?(?:ellentv|ellentube)\.com/videos/(?P<id>[a-z0-9_-]+)' +    _TESTS = [{          'url': 'http://www.ellentv.com/videos/0-7jqrsr18/',          'md5': 'e4af06f3bf0d5f471921a18db5764642',          'info_dict': {              'id': '0-7jqrsr18',              'ext': 'mp4',              'title': 'What\'s Wrong with These Photos? A Whole Lot', +            'description': 'md5:35f152dc66b587cf13e6d2cf4fa467f6',              'timestamp': 1406876400,              'upload_date': '20140801',          } -    } +    }, { +        'url': 'http://ellentube.com/videos/0-dvzmabd5/', +        'md5': '98238118eaa2bbdf6ad7f708e3e4f4eb', +        'info_dict': { +            'id': '0-dvzmabd5', +            'ext': 'mp4', +            'title': '1 year old twin sister makes her brother laugh', +            'description': '1 year old twin sister makes her brother laugh', +            'timestamp': 1419542075, +            'upload_date': '20141225', +        } +    }]      def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        video_id = mobj.group('id') +        video_id = self._match_id(url)          webpage = self._download_webpage(url, video_id) +        video_url = self._html_search_meta('VideoURL', webpage, 'url') +        title = self._og_search_title(webpage, default=None) or self._search_regex( +            r'pageName\s*=\s*"([^"]+)"', webpage, 'title') +        description = self._html_search_meta( +            'description', webpage, 'description') or self._og_search_description(webpage)          timestamp = parse_iso8601(self._search_regex(              r'<span class="publish-date"><time datetime="([^"]+)">',              webpage, 'timestamp'))          return {              'id': video_id, -            'title': self._og_search_title(webpage), -            'url': self._html_search_meta('VideoURL', webpage, 'url'), +            'url': video_url, +            'title': title, +            'description': description,              'timestamp': timestamp,          } @@ -55,8 +71,7 @@ class EllenTVClipsIE(InfoExtractor):      }      def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        playlist_id = mobj.group('id') +        playlist_id = self._match_id(url)          webpage = self._download_webpage(url, playlist_id)          playlist = self._extract_playlist(webpage) diff --git a/youtube_dl/extractor/elpais.py b/youtube_dl/extractor/elpais.py index 4277202a2..00a69e631 100644 --- a/youtube_dl/extractor/elpais.py +++ b/youtube_dl/extractor/elpais.py @@ -1,8 +1,6 @@  # coding: utf-8  from __future__ import unicode_literals -import re -  from .common import InfoExtractor  from ..utils import unified_strdate @@ -24,9 +22,7 @@ class ElPaisIE(InfoExtractor):      }      def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        video_id = mobj.group('id') - +        video_id = self._match_id(url)          webpage = self._download_webpage(url, video_id)          prefix = self._html_search_regex( diff --git a/youtube_dl/extractor/fktv.py b/youtube_dl/extractor/fktv.py index d09d1c13a..190d9f9ad 100644 --- a/youtube_dl/extractor/fktv.py +++ b/youtube_dl/extractor/fktv.py @@ -13,7 +13,7 @@ from ..utils import (  class FKTVIE(InfoExtractor):      IE_NAME = 'fernsehkritik.tv' -    _VALID_URL = r'http://(?:www\.)?fernsehkritik\.tv/folge-(?P<ep>[0-9]+)(?:/.*)?' +    _VALID_URL = r'http://(?:www\.)?fernsehkritik\.tv/folge-(?P<id>[0-9]+)(?:/.*)?'      _TEST = {          'url': 'http://fernsehkritik.tv/folge-1', @@ -26,29 +26,32 @@ class FKTVIE(InfoExtractor):      }      def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        episode = int(mobj.group('ep')) +        episode = int(self._match_id(url)) -        server = random.randint(2, 4) -        video_thumbnail = 'http://fernsehkritik.tv/images/magazin/folge%d.jpg' % episode -        start_webpage = self._download_webpage('http://fernsehkritik.tv/folge-%d/Start' % episode, +        video_thumbnail = 'http://fernsehkritik.tv/images/magazin/folge%s.jpg' % episode +        start_webpage = self._download_webpage('http://fernsehkritik.tv/folge-%s/Start' % episode,                                                 episode)          playlist = self._search_regex(r'playlist = (\[.*?\]);', start_webpage,                                        'playlist', flags=re.DOTALL)          files = json.loads(re.sub('{[^{}]*?}', '{}', playlist)) -        # TODO: return a single multipart video +          videos = []          for i, _ in enumerate(files, 1):              video_id = '%04d%d' % (episode, i) -            video_url = 'http://dl%d.fernsehkritik.tv/fernsehkritik%d%s.flv' % (server, episode, '' if i == 1 else '-%d' % i) +            video_url = 'http://fernsehkritik.tv/js/directme.php?file=%s%s.flv' % (episode, '' if i == 1 else '-%d' % i)              videos.append({ +                'ext': 'flv',                  'id': video_id,                  'url': video_url,                  'title': clean_html(get_element_by_id('eptitle', start_webpage)),                  'description': clean_html(get_element_by_id('contentlist', start_webpage)),                  'thumbnail': video_thumbnail              }) -        return videos +        return { +            '_type': 'multi_video', +            'entries': videos, +            'id': 'folge-%s' % episode, +        }  class FKTVPosteckeIE(InfoExtractor): diff --git a/youtube_dl/extractor/gameone.py b/youtube_dl/extractor/gameone.py index 75f180928..a07d69841 100644 --- a/youtube_dl/extractor/gameone.py +++ b/youtube_dl/extractor/gameone.py @@ -57,8 +57,7 @@ class GameOneIE(InfoExtractor):      ]      def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        video_id = mobj.group('id') +        video_id = self._match_id(url)          webpage = self._download_webpage(url, video_id)          og_video = self._og_search_video_url(webpage, secure=False) diff --git a/youtube_dl/extractor/gdcvault.py b/youtube_dl/extractor/gdcvault.py index d453ec010..fed968f51 100644 --- a/youtube_dl/extractor/gdcvault.py +++ b/youtube_dl/extractor/gdcvault.py @@ -39,7 +39,8 @@ class GDCVaultIE(InfoExtractor):                  'id': '1015301',                  'ext': 'flv',                  'title': 'Thexder Meets Windows 95, or Writing Great Games in the Windows 95 Environment', -            } +            }, +            'skip': 'Requires login',          }      ] diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 40b2791c7..7a5bf9392 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -131,12 +131,13 @@ class GenericIE(InfoExtractor):          # ooyala video          {              'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219', -            'md5': '5644c6ca5d5782c1d0d350dad9bd840c', +            'md5': '166dd577b433b4d4ebfee10b0824d8ff',              'info_dict': {                  'id': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ',                  'ext': 'mp4',                  'title': '2cc213299525360.mov',  # that's what we get              }, +            'add_ie': ['Ooyala'],          },          # google redirect          { @@ -146,7 +147,7 @@ class GenericIE(InfoExtractor):                  'ext': 'mp4',                  'upload_date': '20130224',                  'uploader_id': 'TheVerge', -                'description': 'Chris Ziegler takes a look at the Alcatel OneTouch Fire and the ZTE Open; two of the first Firefox OS handsets to be officially announced.', +                'description': 're:^Chris Ziegler takes a look at the\.*',                  'uploader': 'The Verge',                  'title': 'First Firefox OS phones side-by-side',              }, @@ -181,6 +182,14 @@ class GenericIE(InfoExtractor):                  'description': 'Episode 18: President Barack Obama sits down with Zach Galifianakis for his most memorable interview yet.',              },          }, +        # BBC iPlayer embeds +        { +            'url': 'http://www.bbc.co.uk/blogs/adamcurtis/posts/BUGGER', +            'info_dict': { +                'title': 'BBC - Blogs -  Adam Curtis - BUGGER', +            }, +            'playlist_mincount': 18, +        },          # RUTV embed          {              'url': 'http://www.rg.ru/2014/03/15/reg-dfo/anklav-anons.html', @@ -699,9 +708,9 @@ class GenericIE(InfoExtractor):              r'^(?:https?://)?([^/]*)/.*', url, 'video uploader')          # Helper method -        def _playlist_from_matches(matches, getter, ie=None): +        def _playlist_from_matches(matches, getter=None, ie=None):              urlrs = orderedSet( -                self.url_result(self._proto_relative_url(getter(m)), ie) +                self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)                  for m in matches)              return self.playlist_result(                  urlrs, playlist_id=video_id, playlist_title=video_title) @@ -905,6 +914,11 @@ class GenericIE(InfoExtractor):              return _playlist_from_matches(                  matches, getter=unescapeHTML, ie='FunnyOrDie') +        # Look for BBC iPlayer embed +        matches = re.findall(r'setPlaylist\("(https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)', webpage) +        if matches: +            return _playlist_from_matches(matches, ie='BBCCoUk') +          # Look for embedded RUTV player          rutv_url = RUTVIE._extract_url(webpage)          if rutv_url: @@ -912,7 +926,7 @@ class GenericIE(InfoExtractor):          # Look for embedded TED player          mobj = re.search( -            r'<iframe[^>]+?src=(["\'])(?P<url>http://embed\.ted\.com/.+?)\1', webpage) +            r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed(?:-ssl)?\.ted\.com/.+?)\1', webpage)          if mobj is not None:              return self.url_result(mobj.group('url'), 'TED') diff --git a/youtube_dl/extractor/giga.py b/youtube_dl/extractor/giga.py new file mode 100644 index 000000000..775890112 --- /dev/null +++ b/youtube_dl/extractor/giga.py @@ -0,0 +1,101 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import itertools + +from .common import InfoExtractor +from ..utils import ( +    qualities, +    compat_str, +    parse_duration, +    parse_iso8601, +    str_to_int, +) + + +class GigaIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?giga\.de/(?:[^/]+/)*(?P<id>[^/]+)' +    _TESTS = [{ +        'url': 'http://www.giga.de/filme/anime-awesome/trailer/anime-awesome-chihiros-reise-ins-zauberland-das-beste-kommt-zum-schluss/', +        'md5': '6bc5535e945e724640664632055a584f', +        'info_dict': { +            'id': '2622086', +            'display_id': 'anime-awesome-chihiros-reise-ins-zauberland-das-beste-kommt-zum-schluss', +            'ext': 'mp4', +            'title': 'Anime Awesome: Chihiros Reise ins Zauberland – Das Beste kommt zum Schluss', +            'description': 'md5:afdf5862241aded4718a30dff6a57baf', +            'thumbnail': 're:^https?://.*\.jpg$', +            'duration': 578, +            'timestamp': 1414749706, +            'upload_date': '20141031', +            'uploader': 'Robin Schweiger', +            'view_count': int, +        }, +    }, { +        'url': 'http://www.giga.de/games/channel/giga-top-montag/giga-topmontag-die-besten-serien-2014/', +        'only_matching': True, +    }, { +        'url': 'http://www.giga.de/extra/netzkultur/videos/giga-games-tom-mats-robin-werden-eigene-wege-gehen-eine-ankuendigung/', +        'only_matching': True, +    }, { +        'url': 'http://www.giga.de/tv/jonas-liest-spieletitel-eingedeutscht-episode-2/', +        'only_matching': True, +    }] + +    def _real_extract(self, url): +        display_id = self._match_id(url) + +        webpage = self._download_webpage(url, display_id) + +        video_id = self._search_regex( +            [r'data-video-id="(\d+)"', r'/api/video/jwplayer/#v=(\d+)'], +            webpage, 'video id') + +        playlist = self._download_json( +            'http://www.giga.de/api/syndication/video/video_id/%s/playlist.json?content=syndication/key/368b5f151da4ae05ced7fa296bdff65a/' +            % video_id, video_id)[0] + +        quality = qualities(['normal', 'hd720']) + +        formats = [] +        for format_id in itertools.count(0): +            fmt = playlist.get(compat_str(format_id)) +            if not fmt: +                break +            formats.append({ +                'url': fmt['src'], +                'format_id': '%s-%s' % (fmt['quality'], fmt['type'].split('/')[-1]), +                'quality': quality(fmt['quality']), +            }) +        self._sort_formats(formats) + +        title = self._html_search_meta( +            'title', webpage, 'title', fatal=True) +        description = self._html_search_meta( +            'description', webpage, 'description') +        thumbnail = self._og_search_thumbnail(webpage) + +        duration = parse_duration(self._search_regex( +            r'(?s)(?:data-video-id="{0}"|data-video="[^"]*/api/video/jwplayer/#v={0}[^"]*")[^>]*>.+?<span class="duration">([^<]+)</span>'.format(video_id), +            webpage, 'duration', fatal=False)) + +        timestamp = parse_iso8601(self._search_regex( +            r'datetime="([^"]+)"', webpage, 'upload date', fatal=False)) +        uploader = self._search_regex( +            r'class="author">([^<]+)</a>', webpage, 'uploader', fatal=False) + +        view_count = str_to_int(self._search_regex( +            r'<span class="views"><strong>([\d.]+)</strong>', webpage, 'view count', fatal=False)) + +        return { +            'id': video_id, +            'display_id': display_id, +            'title': title, +            'description': description, +            'thumbnail': thumbnail, +            'duration': duration, +            'timestamp': timestamp, +            'uploader': uploader, +            'view_count': view_count, +            'formats': formats, +        } diff --git a/youtube_dl/extractor/huffpost.py b/youtube_dl/extractor/huffpost.py index 4ccf6b9b8..a38eae421 100644 --- a/youtube_dl/extractor/huffpost.py +++ b/youtube_dl/extractor/huffpost.py @@ -39,8 +39,9 @@ class HuffPostIE(InfoExtractor):          data = self._download_json(api_url, video_id)['data']          video_title = data['title'] -        duration = parse_duration(data['running_time']) -        upload_date = unified_strdate(data['schedule']['starts_at']) +        duration = parse_duration(data.get('running_time')) +        upload_date = unified_strdate( +            data.get('schedule', {}).get('starts_at') or data.get('segment_start_date_time'))          description = data.get('description')          thumbnails = [] @@ -59,16 +60,11 @@ class HuffPostIE(InfoExtractor):              'ext': 'mp4',              'url': url,              'vcodec': 'none' if key.startswith('audio/') else None, -        } for key, url in data['sources']['live'].items()] -        if data.get('fivemin_id'): -            fid = data['fivemin_id'] -            fcat = str(int(fid) // 100 + 1) -            furl = 'http://avideos.5min.com/2/' + fcat[-3:] + '/' + fcat + '/' + fid + '.mp4' -            formats.append({ -                'format': 'fivemin', -                'url': furl, -                'preference': 1, -            }) +        } for key, url in data.get('sources', {}).get('live', {}).items()] + +        if not formats and data.get('fivemin_id'): +            return self.url_result('5min:%s' % data['fivemin_id']) +          self._sort_formats(formats)          return { diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py index 13a53a0cb..f29df36b5 100644 --- a/youtube_dl/extractor/imdb.py +++ b/youtube_dl/extractor/imdb.py @@ -16,7 +16,6 @@ class ImdbIE(InfoExtractor):      _TEST = {          'url': 'http://www.imdb.com/video/imdb/vi2524815897', -        'md5': '9f34fa777ade3a6e57a054fdbcb3a068',          'info_dict': {              'id': '2524815897',              'ext': 'mp4', diff --git a/youtube_dl/extractor/khanacademy.py b/youtube_dl/extractor/khanacademy.py index 408d00944..08a671fa8 100644 --- a/youtube_dl/extractor/khanacademy.py +++ b/youtube_dl/extractor/khanacademy.py @@ -22,8 +22,10 @@ class KhanAcademyIE(InfoExtractor):              'description': 'The perfect cipher',              'duration': 176,              'uploader': 'Brit Cruise', +            'uploader_id': 'khanacademy',              'upload_date': '20120411', -        } +        }, +        'add_ie': ['Youtube'],      }, {          'url': 'https://www.khanacademy.org/math/applied-math/cryptography',          'info_dict': { diff --git a/youtube_dl/extractor/kontrtube.py b/youtube_dl/extractor/kontrtube.py index 41fd62009..720bc939b 100644 --- a/youtube_dl/extractor/kontrtube.py +++ b/youtube_dl/extractor/kontrtube.py @@ -10,13 +10,14 @@ from ..utils import int_or_none  class KontrTubeIE(InfoExtractor):      IE_NAME = 'kontrtube'      IE_DESC = 'KontrTube.ru - Труба зовёт' -    _VALID_URL = r'http://(?:www\.)?kontrtube\.ru/videos/(?P<id>\d+)/.+' +    _VALID_URL = r'http://(?:www\.)?kontrtube\.ru/videos/(?P<id>\d+)/(?P<display_id>[^/]+)/'      _TEST = {          'url': 'http://www.kontrtube.ru/videos/2678/nad-olimpiyskoy-derevney-v-sochi-podnyat-rossiyskiy-flag/',          'md5': '975a991a4926c9a85f383a736a2e6b80',          'info_dict': {              'id': '2678', +            'display_id': 'nad-olimpiyskoy-derevney-v-sochi-podnyat-rossiyskiy-flag',              'ext': 'mp4',              'title': 'Над олимпийской деревней в Сочи поднят российский флаг',              'description': 'md5:80edc4c613d5887ae8ccf1d59432be41', @@ -28,21 +29,28 @@ class KontrTubeIE(InfoExtractor):      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url)          video_id = mobj.group('id') +        display_id = mobj.group('display_id') -        webpage = self._download_webpage(url, video_id, 'Downloading page') +        webpage = self._download_webpage( +            url, display_id, 'Downloading page') -        video_url = self._html_search_regex(r"video_url: '(.+?)/?',", webpage, 'video URL') -        thumbnail = self._html_search_regex(r"preview_url: '(.+?)/?',", webpage, 'video thumbnail', fatal=False) +        video_url = self._html_search_regex( +            r"video_url\s*:\s*'(.+?)/?',", webpage, 'video URL') +        thumbnail = self._html_search_regex( +            r"preview_url\s*:\s*'(.+?)/?',", webpage, 'video thumbnail', fatal=False)          title = self._html_search_regex(              r'<title>(.+?)</title>', webpage, 'video title') -        description = self._html_search_meta('description', webpage, 'video description') +        description = self._html_search_meta( +            'description', webpage, 'video description')          mobj = re.search( -            r'<div class="col_2">Длительность: <span>(?P<minutes>\d+)м:(?P<seconds>\d+)с</span></div>', webpage) +            r'<div class="col_2">Длительность: <span>(?P<minutes>\d+)м:(?P<seconds>\d+)с</span></div>', +            webpage)          duration = int(mobj.group('minutes')) * 60 + int(mobj.group('seconds')) if mobj else None          view_count = self._html_search_regex( -            r'<div class="col_2">Просмотров: <span>(\d+)</span></div>', webpage, 'view count', fatal=False) +            r'<div class="col_2">Просмотров: <span>(\d+)</span></div>', +            webpage, 'view count', fatal=False)          comment_count = None          comment_str = self._html_search_regex( @@ -56,6 +64,7 @@ class KontrTubeIE(InfoExtractor):          return {              'id': video_id, +            'display_id': display_id,              'url': video_url,              'thumbnail': thumbnail,              'title': title, diff --git a/youtube_dl/extractor/lrt.py b/youtube_dl/extractor/lrt.py index d72d470aa..9c2fbdd96 100644 --- a/youtube_dl/extractor/lrt.py +++ b/youtube_dl/extractor/lrt.py @@ -2,7 +2,6 @@  from __future__ import unicode_literals  import re -import json  from .common import InfoExtractor  from ..utils import ( @@ -28,7 +27,6 @@ class LRTIE(InfoExtractor):          'params': {              'skip_download': True,  # HLS download          }, -      }      def _real_extract(self, url): @@ -44,7 +42,9 @@ class LRTIE(InfoExtractor):          formats = []          for js in re.findall(r'(?s)config:\s*(\{.*?\})', webpage): -            data = json.loads(js_to_json(js)) +            data = self._parse_json(js, video_id, transform_source=js_to_json) +            if 'provider' not in data: +                continue              if data['provider'] == 'rtmp':                  formats.append({                      'format_id': 'rtmp', diff --git a/youtube_dl/extractor/mit.py b/youtube_dl/extractor/mit.py index 78787e8f1..3c61a850f 100644 --- a/youtube_dl/extractor/mit.py +++ b/youtube_dl/extractor/mit.py @@ -105,6 +105,9 @@ class OCWMITIE(InfoExtractor):                  'ext': 'mp4',                  'title': 'Lecture 7: Multiple Discrete Random Variables: Expectations, Conditioning, Independence',                  'description': 'In this lecture, the professor discussed multiple random variables, expectations, and binomial distribution.', +                'upload_date': '20121109', +                'uploader_id': 'MIT', +                'uploader': 'MIT OpenCourseWare',                  # 'subtitles': 'http://ocw.mit.edu/courses/electrical-engineering-and-computer-science/6-041-probabilistic-systems-analysis-and-applied-probability-fall-2010/video-lectures/lecture-7-multiple-variables-expectations-independence/MIT6_041F11_lec07_300k.mp4.srt'              }          }, @@ -114,6 +117,9 @@ class OCWMITIE(InfoExtractor):                  'id': '7K1sB05pE0A',                  'ext': 'mp4',                  'title': 'Session 1: Introduction to Derivatives', +                'upload_date': '20090818', +                'uploader_id': 'MIT', +                'uploader': 'MIT OpenCourseWare',                  'description': 'This section contains lecture video excerpts, lecture notes, an interactive mathlet with supporting documents, and problem solving videos.',                  # 'subtitles': 'http://ocw.mit.edu//courses/mathematics/18-01sc-single-variable-calculus-fall-2010/ocw-18.01-f07-lec01_300k.SRT'              } diff --git a/youtube_dl/extractor/motorsport.py b/youtube_dl/extractor/motorsport.py index f5ca74e97..c1a482dba 100644 --- a/youtube_dl/extractor/motorsport.py +++ b/youtube_dl/extractor/motorsport.py @@ -1,63 +1,49 @@  # coding: utf-8  from __future__ import unicode_literals -import hashlib -import json -import time -  from .common import InfoExtractor  from ..compat import ( -    compat_parse_qs, -    compat_str, -) -from ..utils import ( -    int_or_none, +    compat_urlparse,  )  class MotorsportIE(InfoExtractor):      IE_DESC = 'motorsport.com' -    _VALID_URL = r'http://www\.motorsport\.com/[^/?#]+/video/(?:[^/?#]+/)(?P<id>[^/]+)/(?:$|[?#])' +    _VALID_URL = r'http://www\.motorsport\.com/[^/?#]+/video/(?:[^/?#]+/)(?P<id>[^/]+)/?(?:$|[?#])'      _TEST = {          'url': 'http://www.motorsport.com/f1/video/main-gallery/red-bull-racing-2014-rules-explained/', -        'md5': '5592cb7c5005d9b2c163df5ac3dc04e4',          'info_dict': { -            'id': '7063', +            'id': '2-T3WuR-KMM',              'ext': 'mp4',              'title': 'Red Bull Racing: 2014 Rules Explained', -            'duration': 207, +            'duration': 208,              'description': 'A new clip from Red Bull sees Daniel Ricciardo and Sebastian Vettel explain the 2014 Formula One regulations – which are arguably the most complex the sport has ever seen.', -            'uploader': 'rainiere', -            'thumbnail': r're:^http://.*motorsport\.com/.+\.jpg$' -        } +            'uploader': 'mcomstaff', +            'uploader_id': 'UC334JIYKkVnyFoNCclfZtHQ', +            'upload_date': '20140903', +            'thumbnail': r're:^https?://.+\.jpg$' +        }, +        'add_ie': ['Youtube'], +        'params': { +            'skip_download': True, +        },      }      def _real_extract(self, url):          display_id = self._match_id(url)          webpage = self._download_webpage(url, display_id) -        flashvars_code = self._html_search_regex( -            r'<embed id="player".*?flashvars="([^"]+)"', webpage, 'flashvars') -        flashvars = compat_parse_qs(flashvars_code) -        params = json.loads(flashvars['parameters'][0]) - -        e = compat_str(int(time.time()) + 24 * 60 * 60) -        base_video_url = params['location'] + '?e=' + e -        s = 'h3hg713fh32' -        h = hashlib.md5((s + base_video_url).encode('utf-8')).hexdigest() -        video_url = base_video_url + '&h=' + h - -        uploader = self._html_search_regex( -            r'(?s)<span class="label">Video by: </span>(.*?)</a>', webpage, -            'uploader', fatal=False) +        iframe_path = self._html_search_regex( +            r'<iframe id="player_iframe"[^>]+src="([^"]+)"', webpage, +            'iframe path') +        iframe = self._download_webpage( +            compat_urlparse.urljoin(url, iframe_path), display_id, +            'Downloading iframe') +        youtube_id = self._search_regex( +            r'www.youtube.com/embed/(.{11})', iframe, 'youtube id')          return { -            'id': params['video_id'], +            '_type': 'url_transparent',              'display_id': display_id, -            'title': params['title'], -            'url': video_url, -            'description': params.get('description'), -            'thumbnail': params.get('main_thumb'), -            'duration': int_or_none(params.get('duration')), -            'uploader': uploader, +            'url': 'https://youtube.com/watch?v=%s' % youtube_id,          } diff --git a/youtube_dl/extractor/netzkino.py b/youtube_dl/extractor/netzkino.py new file mode 100644 index 000000000..93567d1e3 --- /dev/null +++ b/youtube_dl/extractor/netzkino.py @@ -0,0 +1,86 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( +    clean_html, +    int_or_none, +    js_to_json, +    parse_iso8601, +) + + +class NetzkinoIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?netzkino\.de/\#!/(?P<category>[^/]+)/(?P<id>[^/]+)' + +    _TEST = { +        'url': 'http://www.netzkino.de/#!/scifikino/rakete-zum-mond', +        'md5': '92a3f8b76f8d7220acce5377ea5d4873', +        'info_dict': { +            'id': 'rakete-zum-mond', +            'ext': 'mp4', +            'title': 'Rakete zum Mond (Endstation Mond, Destination Moon)', +            'comments': 'mincount:3', +            'description': 'md5:1eddeacc7e62d5a25a2d1a7290c64a28', +            'upload_date': '20120813', +            'thumbnail': 're:https?://.*\.jpg$', +            'timestamp': 1344858571, +            'age_limit': 12, +        }, +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        category_id = mobj.group('category') +        video_id = mobj.group('id') + +        api_url = 'http://api.netzkino.de.simplecache.net/capi-2.0a/categories/%s.json?d=www' % category_id +        api_info = self._download_json(api_url, video_id) +        info = next( +            p for p in api_info['posts'] if p['slug'] == video_id) +        custom_fields = info['custom_fields'] + +        production_js = self._download_webpage( +            'http://www.netzkino.de/beta/dist/production.min.js', video_id, +            note='Downloading player code') +        avo_js = self._search_regex( +            r'window\.avoCore\s*=.*?urlTemplate:\s*(\{.*?"\})', +            production_js, 'URL templates') +        templates = self._parse_json( +            avo_js, video_id, transform_source=js_to_json) + +        suffix = { +            'hds': '.mp4/manifest.f4m', +            'hls': '.mp4/master.m3u8', +            'pmd': '.mp4', +        } +        film_fn = custom_fields['Streaming'][0] +        formats = [{ +            'format_id': key, +            'ext': 'mp4', +            'url': tpl.replace('{}', film_fn) + suffix[key], +        } for key, tpl in templates.items()] +        self._sort_formats(formats) + +        comments = [{ +            'timestamp': parse_iso8601(c.get('date'), delimiter=' '), +            'id': c['id'], +            'author': c['name'], +            'html': c['content'], +            'parent': 'root' if c.get('parent', 0) == 0 else c['parent'], +        } for c in info.get('comments', [])] + +        return { +            'id': video_id, +            'formats': formats, +            'comments': comments, +            'title': info['title'], +            'age_limit': int_or_none(custom_fields.get('FSK')[0]), +            'timestamp': parse_iso8601(info.get('date'), delimiter=' '), +            'description': clean_html(info.get('content')), +            'thumbnail': info.get('thumbnail'), +            'playlist_title': api_info.get('title'), +            'playlist_id': category_id, +        } diff --git a/youtube_dl/extractor/normalboots.py b/youtube_dl/extractor/normalboots.py index 3d35b11ac..c13ff0d65 100644 --- a/youtube_dl/extractor/normalboots.py +++ b/youtube_dl/extractor/normalboots.py @@ -22,7 +22,11 @@ class NormalbootsIE(InfoExtractor):              'description': 'Jon is late for Christmas. Typical. Thanks to: Paul Ritchey for Co-Writing/Filming: http://www.youtube.com/user/ContinueShow Michael Azzi for Christmas Intro Animation: http://michafrar.tumblr.com/ Jerrod Waters for Christmas Intro Music: http://www.youtube.com/user/xXJerryTerryXx Casey Ormond for ‘Tense Battle Theme’:\xa0http://www.youtube.com/Kiamet/',              'uploader': 'JonTron',              'upload_date': '20140125', -        } +        }, +        'params': { +            # rtmp download +            'skip_download': True, +        },      }      def _real_extract(self, url): diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 43e8e619f..321ce5ce7 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -72,7 +72,7 @@ class NRKIE(InfoExtractor):  class NRKTVIE(InfoExtractor): -    _VALID_URL = r'http://tv\.nrk(?:super)?\.no/(?:serie/[^/]+|program)/(?P<id>[a-zA-Z]{4}\d{8})' +    _VALID_URL = r'http://tv\.nrk(?:super)?\.no/(?:serie/[^/]+|program)/(?P<id>[a-zA-Z]{4}\d{8})(?:/\d{2}-\d{2}-\d{4})?(?:#del=(?P<part_id>\d+))?'      _TESTS = [          { @@ -85,7 +85,7 @@ class NRKTVIE(InfoExtractor):                  'description': 'md5:bdea103bc35494c143c6a9acdd84887a',                  'upload_date': '20140523',                  'duration': 1741.52, -            } +            },          },          {              'url': 'http://tv.nrk.no/program/mdfp15000514', @@ -97,39 +97,119 @@ class NRKTVIE(InfoExtractor):                  'description': 'md5:654c12511f035aed1e42bdf5db3b206a',                  'upload_date': '20140524',                  'duration': 4605.0, -            } +            },          }, +        { +            # single playlist video +            'url': 'http://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015#del=2', +            'md5': 'adbd1dbd813edaf532b0a253780719c2', +            'info_dict': { +                'id': 'MSPO40010515-part2', +                'ext': 'flv', +                'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 2:2)', +                'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26', +                'upload_date': '20150106', +            }, +            'skip': 'Only works from Norway', +        }, +        { +            'url': 'http://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015', +            'playlist': [ +                { +                    'md5': '9480285eff92d64f06e02a5367970a7a', +                    'info_dict': { +                        'id': 'MSPO40010515-part1', +                        'ext': 'flv', +                        'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 1:2)', +                        'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26', +                        'upload_date': '20150106', +                    }, +                }, +                { +                    'md5': 'adbd1dbd813edaf532b0a253780719c2', +                    'info_dict': { +                        'id': 'MSPO40010515-part2', +                        'ext': 'flv', +                        'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 2:2)', +                        'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26', +                        'upload_date': '20150106', +                    }, +                }, +            ], +            'info_dict': { +                'id': 'MSPO40010515', +                'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn', +                'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26', +                'upload_date': '20150106', +                'duration': 6947.5199999999995, +            }, +            'skip': 'Only works from Norway', +        }      ] +    def _extract_f4m(self, manifest_url, video_id): +        return self._extract_f4m_formats(manifest_url + '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124', video_id) +      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url)          video_id = mobj.group('id') - -        page = self._download_webpage(url, video_id) - -        title = self._html_search_meta('title', page, 'title') -        description = self._html_search_meta('description', page, 'description') -        thumbnail = self._html_search_regex(r'data-posterimage="([^"]+)"', page, 'thumbnail', fatal=False) -        upload_date = unified_strdate(self._html_search_meta('rightsfrom', page, 'upload date', fatal=False)) -        duration = float_or_none( -            self._html_search_regex(r'data-duration="([^"]+)"', page, 'duration', fatal=False)) +        part_id = mobj.group('part_id') + +        webpage = self._download_webpage(url, video_id) + +        title = self._html_search_meta( +            'title', webpage, 'title') +        description = self._html_search_meta( +            'description', webpage, 'description') + +        thumbnail = self._html_search_regex( +            r'data-posterimage="([^"]+)"', +            webpage, 'thumbnail', fatal=False) +        upload_date = unified_strdate(self._html_search_meta( +            'rightsfrom', webpage, 'upload date', fatal=False)) +        duration = float_or_none(self._html_search_regex( +            r'data-duration="([^"]+)"', +            webpage, 'duration', fatal=False)) + +        # playlist +        parts = re.findall( +            r'<a href="#del=(\d+)"[^>]+data-argument="([^"]+)">([^<]+)</a>', webpage) +        if parts: +            entries = [] +            for current_part_id, stream_url, part_title in parts: +                if part_id and current_part_id != part_id: +                    continue +                video_part_id = '%s-part%s' % (video_id, current_part_id) +                formats = self._extract_f4m(stream_url, video_part_id) +                entries.append({ +                    'id': video_part_id, +                    'title': part_title, +                    'description': description, +                    'thumbnail': thumbnail, +                    'upload_date': upload_date, +                    'formats': formats, +                }) +            if part_id: +                if entries: +                    return entries[0] +            else: +                playlist = self.playlist_result(entries, video_id, title, description) +                playlist.update({ +                    'thumbnail': thumbnail, +                    'upload_date': upload_date, +                    'duration': duration, +                }) +                return playlist          formats = [] -        f4m_url = re.search(r'data-media="([^"]+)"', page) +        f4m_url = re.search(r'data-media="([^"]+)"', webpage)          if f4m_url: -            formats.append({ -                'url': f4m_url.group(1) + '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124', -                'format_id': 'f4m', -                'ext': 'flv', -            }) +            formats.extend(self._extract_f4m(f4m_url.group(1), video_id)) -        m3u8_url = re.search(r'data-hls-media="([^"]+)"', page) +        m3u8_url = re.search(r'data-hls-media="([^"]+)"', webpage)          if m3u8_url: -            formats.append({ -                'url': m3u8_url.group(1), -                'format_id': 'm3u8', -            }) +            formats.extend(self._extract_m3u8_formats(m3u8_url.group(1), video_id, 'mp4'))          self._sort_formats(formats) diff --git a/youtube_dl/extractor/played.py b/youtube_dl/extractor/played.py index 449d4836c..45716c75d 100644 --- a/youtube_dl/extractor/played.py +++ b/youtube_dl/extractor/played.py @@ -26,6 +26,7 @@ class PlayedIE(InfoExtractor):              'ext': 'flv',              'title': 'youtube-dl_test_video.mp4',          }, +        'skip': 'Removed for copyright infringement.',  # oh wow      }      def _real_extract(self, url): diff --git a/youtube_dl/extractor/radiobremen.py b/youtube_dl/extractor/radiobremen.py new file mode 100644 index 000000000..0d706312e --- /dev/null +++ b/youtube_dl/extractor/radiobremen.py @@ -0,0 +1,63 @@ +# -*- coding: utf-8 -*- + +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import parse_duration + + +class RadioBremenIE(InfoExtractor): +    _VALID_URL = r'http?://(?:www\.)?radiobremen\.de/mediathek/(?:index\.html)?\?id=(?P<id>[0-9]+)' +    IE_NAME = 'radiobremen' + +    _TEST = { +        'url': 'http://www.radiobremen.de/mediathek/index.html?id=114720', +        'info_dict': { +            'id': '114720', +            'ext': 'mp4', +            'duration': 1685, +            'width': 512, +            'title': 'buten un binnen vom 22. Dezember', +            'thumbnail': 're:https?://.*\.jpg$', +            'description': 'Unter anderem mit diesen Themen: 45 Flüchtlinge sind in Worpswede angekommen +++ Freies Internet für alle: Bremer arbeiten an einem flächendeckenden W-Lan-Netzwerk +++ Aktivisten kämpfen für das Unibad +++ So war das Wetter 2014 +++', +        }, +    } + +    def _real_extract(self, url): +        video_id = self._match_id(url) + +        meta_url = "http://www.radiobremen.de/apps/php/mediathek/metadaten.php?id=%s" % video_id +        meta_doc = self._download_webpage( +            meta_url, video_id, 'Downloading metadata') +        title = self._html_search_regex( +            r"<h1.*>(?P<title>.+)</h1>", meta_doc, "title") +        description = self._html_search_regex( +            r"<p>(?P<description>.*)</p>", meta_doc, "description", fatal=False) +        duration = parse_duration(self._html_search_regex( +            r"Länge:</td>\s+<td>(?P<duration>[0-9]+:[0-9]+)</td>", +            meta_doc, "duration", fatal=False)) + +        page_doc = self._download_webpage( +            url, video_id, 'Downloading video information') +        mobj = re.search( +            r"ardformatplayerclassic\(\'playerbereich\',\'(?P<width>[0-9]+)\',\'.*\',\'(?P<video_id>[0-9]+)\',\'(?P<secret>[0-9]+)\',\'(?P<thumbnail>.+)\',\'\'\)", +            page_doc) +        video_url = ( +            "http://dl-ondemand.radiobremen.de/mediabase/%s/%s_%s_%s.mp4" % +            (video_id, video_id, mobj.group("secret"), mobj.group('width'))) + +        formats = [{ +            'url': video_url, +            'ext': 'mp4', +            'width': int(mobj.group("width")), +        }] +        return { +            'id': video_id, +            'title': title, +            'description': description, +            'duration': duration, +            'formats': formats, +            'thumbnail': mobj.group('thumbnail'), +        } diff --git a/youtube_dl/extractor/rtlnl.py b/youtube_dl/extractor/rtlnl.py index d029b0ec5..a3ca79f2c 100644 --- a/youtube_dl/extractor/rtlnl.py +++ b/youtube_dl/extractor/rtlnl.py @@ -8,7 +8,7 @@ from ..utils import parse_duration  class RtlXlIE(InfoExtractor):      IE_NAME = 'rtlxl.nl' -    _VALID_URL = r'https?://www\.rtlxl\.nl/#!/[^/]+/(?P<uuid>[^/?]+)' +    _VALID_URL = r'https?://(www\.)?rtlxl\.nl/#!/[^/]+/(?P<uuid>[^/?]+)'      _TEST = {          'url': 'http://www.rtlxl.nl/#!/rtl-nieuws-132237/6e4203a6-0a5e-3596-8424-c599a59e0677', diff --git a/youtube_dl/extractor/rutube.py b/youtube_dl/extractor/rutube.py index b72b5a586..5b1c3577a 100644 --- a/youtube_dl/extractor/rutube.py +++ b/youtube_dl/extractor/rutube.py @@ -70,6 +70,37 @@ class RutubeIE(InfoExtractor):          } +class RutubeEmbedIE(InfoExtractor): +    IE_NAME = 'rutube:embed' +    IE_DESC = 'Rutube embedded videos' +    _VALID_URL = 'https?://rutube\.ru/video/embed/(?P<id>[0-9]+)' + +    _TEST = { +        'url': 'http://rutube.ru/video/embed/6722881?vk_puid37=&vk_puid38=', +        'info_dict': { +            'id': 'a10e53b86e8f349080f718582ce4c661', +            'ext': 'mp4', +            'upload_date': '20131223', +            'uploader_id': '297833', +            'description': 'Видео группы ★http://vk.com/foxkidsreset★ музей Fox Kids и Jetix<br/><br/> восстановлено и сделано в шикоформате subziro89 http://vk.com/subziro89', +            'uploader': 'subziro89 ILya', +            'title': 'Мистический городок Эйри в Индиан 5 серия озвучка subziro89', +        }, +        'params': { +            'skip_download': 'Requires ffmpeg', +        }, +    } + +    def _real_extract(self, url): +        embed_id = self._match_id(url) +        webpage = self._download_webpage(url, embed_id) + +        canonical_url = self._html_search_regex( +            r'<link\s+rel="canonical"\s+href="([^"]+?)"', webpage, +            'Canonical URL') +        return self.url_result(canonical_url, 'Rutube') + +  class RutubeChannelIE(InfoExtractor):      IE_NAME = 'rutube:channel'      IE_DESC = 'Rutube channels' diff --git a/youtube_dl/extractor/sexykarma.py b/youtube_dl/extractor/sexykarma.py index c833fc8ee..6446d26dc 100644 --- a/youtube_dl/extractor/sexykarma.py +++ b/youtube_dl/extractor/sexykarma.py @@ -24,7 +24,7 @@ class SexyKarmaIE(InfoExtractor):              'title': 'Taking a quick pee.',              'thumbnail': 're:^https?://.*\.jpg$',              'uploader': 'wildginger7', -            'upload_date': '20141007', +            'upload_date': '20141008',              'duration': 22,              'view_count': int,              'comment_count': int, @@ -45,6 +45,7 @@ class SexyKarmaIE(InfoExtractor):              'view_count': int,              'comment_count': int,              'categories': list, +            'age_limit': 18,          }      }, {          'url': 'http://www.watchindianporn.net/video/desi-dancer-namrata-stripping-completely-nude-and-dancing-on-a-hot-number-dW2mtctxJfs.html', @@ -61,6 +62,7 @@ class SexyKarmaIE(InfoExtractor):              'view_count': int,              'comment_count': int,              'categories': list, +            'age_limit': 18,          }      }] @@ -114,4 +116,5 @@ class SexyKarmaIE(InfoExtractor):              'view_count': view_count,              'comment_count': comment_count,              'categories': categories, +            'age_limit': 18,          } diff --git a/youtube_dl/extractor/soulanime.py b/youtube_dl/extractor/soulanime.py new file mode 100644 index 000000000..feef33e27 --- /dev/null +++ b/youtube_dl/extractor/soulanime.py @@ -0,0 +1,80 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( +    HEADRequest, +    urlhandle_detect_ext, +) + + +class SoulAnimeWatchingIE(InfoExtractor): +    IE_NAME = "soulanime:watching" +    IE_DESC = "SoulAnime video" +    _TEST = { +        'url': 'http://www.soul-anime.net/watching/seirei-tsukai-no-blade-dance-episode-9/', +        'md5': '05fae04abf72298098b528e98abf4298', +        'info_dict': { +            'id': 'seirei-tsukai-no-blade-dance-episode-9', +            'ext': 'mp4', +            'title': 'seirei-tsukai-no-blade-dance-episode-9', +            'description': 'seirei-tsukai-no-blade-dance-episode-9' +        } +    } +    _VALID_URL = r'http://[w.]*soul-anime\.(?P<domain>[^/]+)/watch[^/]*/(?P<id>[^/]+)' + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('id') +        domain = mobj.group('domain') + +        page = self._download_webpage(url, video_id) + +        video_url_encoded = self._html_search_regex( +            r'<div id="download">[^<]*<a href="(?P<url>[^"]+)"', page, 'url') +        video_url = "http://www.soul-anime." + domain + video_url_encoded + +        ext_req = HEADRequest(video_url) +        ext_handle = self._request_webpage( +            ext_req, video_id, note='Determining extension') +        ext = urlhandle_detect_ext(ext_handle) + +        return { +            'id': video_id, +            'url': video_url, +            'ext': ext, +            'title': video_id, +            'description': video_id +        } + + +class SoulAnimeSeriesIE(InfoExtractor): +    IE_NAME = "soulanime:series" +    IE_DESC = "SoulAnime Series" + +    _VALID_URL = r'http://[w.]*soul-anime\.(?P<domain>[^/]+)/anime./(?P<id>[^/]+)' + +    _EPISODE_REGEX = r'<option value="(/watch[^/]*/[^"]+)">[^<]*</option>' + +    _TEST = { +        'url': 'http://www.soul-anime.net/anime1/black-rock-shooter-tv/', +        'info_dict': { +            'id': 'black-rock-shooter-tv' +        }, +        'playlist_count': 8 +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        series_id = mobj.group('id') +        domain = mobj.group('domain') + +        pattern = re.compile(self._EPISODE_REGEX) + +        page = self._download_webpage(url, series_id, "Downloading series page") +        mobj = pattern.findall(page) + +        entries = [self.url_result("http://www.soul-anime." + domain + obj) for obj in mobj] + +        return self.playlist_result(entries, series_id) diff --git a/youtube_dl/extractor/teachertube.py b/youtube_dl/extractor/teachertube.py index 6c3445d79..82675431f 100644 --- a/youtube_dl/extractor/teachertube.py +++ b/youtube_dl/extractor/teachertube.py @@ -57,9 +57,7 @@ class TeacherTubeIE(InfoExtractor):      }]      def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        video_id = mobj.group('id') - +        video_id = self._match_id(url)          webpage = self._download_webpage(url, video_id)          title = self._html_search_meta('title', webpage, 'title', fatal=True) diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py index 944177426..10b3b706a 100644 --- a/youtube_dl/extractor/ted.py +++ b/youtube_dl/extractor/ted.py @@ -13,7 +13,7 @@ from ..compat import (  class TEDIE(SubtitlesInfoExtractor):      _VALID_URL = r'''(?x)          (?P<proto>https?://) -        (?P<type>www|embed)(?P<urlmain>\.ted\.com/ +        (?P<type>www|embed(?:-ssl)?)(?P<urlmain>\.ted\.com/          (              (?P<type_playlist>playlists(?:/\d+)?) # We have a playlist              | @@ -98,7 +98,7 @@ class TEDIE(SubtitlesInfoExtractor):      def _real_extract(self, url):          m = re.match(self._VALID_URL, url, re.VERBOSE) -        if m.group('type') == 'embed': +        if m.group('type').startswith('embed'):              desktop_url = m.group('proto') + 'www' + m.group('urlmain')              return self.url_result(desktop_url, 'TED')          name = m.group('name') diff --git a/youtube_dl/extractor/tf1.py b/youtube_dl/extractor/tf1.py index 6e61cc9e2..025d0877c 100644 --- a/youtube_dl/extractor/tf1.py +++ b/youtube_dl/extractor/tf1.py @@ -1,15 +1,13 @@  # coding: utf-8  from __future__ import unicode_literals -import re -  from .common import InfoExtractor  class TF1IE(InfoExtractor):      """TF1 uses the wat.tv player.""" -    _VALID_URL = r'http://videos\.tf1\.fr/.*-(?P<id>.*?)\.html' -    _TEST = { +    _VALID_URL = r'http://(?:videos\.tf1|www\.tfou)\.fr/.*?-(?P<id>\d+)(?:-\d+)?\.html' +    _TESTS = {          'url': 'http://videos.tf1.fr/auto-moto/citroen-grand-c4-picasso-2013-presentation-officielle-8062060.html',          'info_dict': {              'id': '10635995', @@ -21,14 +19,26 @@ class TF1IE(InfoExtractor):              # Sometimes wat serves the whole file with the --test option              'skip_download': True,          }, +    }, { +        'url': 'http://www.tfou.fr/chuggington/videos/le-grand-mysterioso-chuggington-7085291-739.html', +        'info_dict': { +            'id': '12043945', +            'ext': 'mp4', +            'title': 'Le grand Mystérioso - Chuggington', +            'description': 'Le grand Mystérioso - Emery rêve qu\'un article lui soit consacré dans le journal.', +            'upload_date': '20150103', +        }, +        'params': { +            # Sometimes wat serves the whole file with the --test option +            'skip_download': True, +        },      }      def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        video_id = mobj.group('id') +        video_id = self._match_id(url)          webpage = self._download_webpage(url, video_id)          embed_url = self._html_search_regex( -            r'"(https://www.wat.tv/embedframe/.*?)"', webpage, 'embed url') +            r'["\'](https?://www.wat.tv/embedframe/.*?)["\']', webpage, 'embed url')          embed_page = self._download_webpage(embed_url, video_id,                                              'Downloading embed player page')          wat_id = self._search_regex(r'UVID=(.*?)&', embed_page, 'wat id') diff --git a/youtube_dl/extractor/tudou.py b/youtube_dl/extractor/tudou.py index 161e47624..c89de5ba4 100644 --- a/youtube_dl/extractor/tudou.py +++ b/youtube_dl/extractor/tudou.py @@ -9,7 +9,7 @@ from .common import InfoExtractor  class TudouIE(InfoExtractor): -    _VALID_URL = r'(?:http://)?(?:www\.)?tudou\.com/(?:listplay|programs|albumplay)/(?:view|(.+?))/(?:([^/]+)|([^/]+))(?:\.html)?' +    _VALID_URL = r'https?://(?:www\.)?tudou\.com/(?:listplay|programs(?:/view)?|albumplay)/.*?/(?P<id>[^/?#]+?)(?:\.html)?/?(?:$|[?#])'      _TESTS = [{          'url': 'http://www.tudou.com/listplay/zzdE77v6Mmo/2xN2duXMxmw.html',          'md5': '140a49ed444bd22f93330985d8475fcb', @@ -27,13 +27,6 @@ class TudouIE(InfoExtractor):              'title': 'La Sylphide-Bolshoi-Ekaterina Krysanova & Vyacheslav Lopatin 2012',              'thumbnail': 're:^https?://.*\.jpg$',          } -    }, { -        'url': 'http://www.tudou.com/albumplay/TenTw_JgiPM/PzsAs5usU9A.html', -        'info_dict': { -            'title': 'todo.mp4', -        }, -        'add_ie': ['Youku'], -        'skip': 'Only works from China'      }]      def _url_for_id(self, id, quality=None): @@ -45,8 +38,7 @@ class TudouIE(InfoExtractor):          return final_url      def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        video_id = mobj.group(2) +        video_id = self._match_id(url)          webpage = self._download_webpage(url, video_id)          m = re.search(r'vcode:\s*[\'"](.+?)[\'"]', webpage) @@ -87,4 +79,9 @@ class TudouIE(InfoExtractor):              }              result.append(part_info) -        return result +        return { +            '_type': 'multi_video', +            'entries': result, +            'id': video_id, +            'title': title, +        } diff --git a/youtube_dl/extractor/tunein.py b/youtube_dl/extractor/tunein.py index 4ce5aeeba..b6b1f2568 100644 --- a/youtube_dl/extractor/tunein.py +++ b/youtube_dl/extractor/tunein.py @@ -24,7 +24,7 @@ class TuneInIE(InfoExtractor):      _INFO_DICT = {          'id': '34682',          'title': 'Jazz 24 on 88.5 Jazz24 - KPLU-HD2', -        'ext': 'AAC', +        'ext': 'aac',          'thumbnail': 're:^https?://.*\.png$',          'location': 'Tacoma, WA',      } @@ -78,14 +78,21 @@ class TuneInIE(InfoExtractor):          for stream in streams:              if stream.get('Type') == 'Live':                  is_live = True +            reliability = stream.get('Reliability') +            format_note = ( +                'Reliability: %d%%' % reliability +                if reliability is not None else None)              formats.append({ +                'preference': ( +                    0 if reliability is None or reliability > 90 +                    else 1),                  'abr': stream.get('Bandwidth'), -                'ext': stream.get('MediaType'), +                'ext': stream.get('MediaType').lower(),                  'acodec': stream.get('MediaType'),                  'vcodec': 'none',                  'url': stream.get('Url'), -                # Sometimes streams with the highest quality do not exist -                'preference': stream.get('Reliability'), +                'source_preference': reliability, +                'format_note': format_note,              })          self._sort_formats(formats) diff --git a/youtube_dl/extractor/vier.py b/youtube_dl/extractor/vier.py new file mode 100644 index 000000000..619039e51 --- /dev/null +++ b/youtube_dl/extractor/vier.py @@ -0,0 +1,118 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class VierIE(InfoExtractor): +    IE_NAME = 'vier' +    _VALID_URL = r'https?://(?:www\.)?vier\.be/(?:[^/]+/videos/(?P<display_id>[^/]+)(?:/(?P<id>\d+))?|video/v3/embed/(?P<embed_id>\d+))' +    _TESTS = [{ +        'url': 'http://www.vier.be/planb/videos/het-wordt-warm-de-moestuin/16129', +        'info_dict': { +            'id': '16129', +            'display_id': 'het-wordt-warm-de-moestuin', +            'ext': 'mp4', +            'title': 'Het wordt warm in De Moestuin', +            'description': 'De vele uren werk eisen hun tol. Wim droomt van assistentie...', +        }, +        'params': { +            # m3u8 download +            'skip_download': True, +        }, +    }, { +        'url': 'http://www.vier.be/planb/videos/mieren-herders-van-de-bladluizen', +        'only_matching': True, +    }, { +        'url': 'http://www.vier.be/video/v3/embed/16129', +        'only_matching': True, +    }] + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        embed_id = mobj.group('embed_id') +        display_id = mobj.group('display_id') or embed_id + +        webpage = self._download_webpage(url, display_id) + +        video_id = self._search_regex( +            r'"nid"\s*:\s*"(\d+)"', webpage, 'video id') +        application = self._search_regex( +            r'"application"\s*:\s*"([^"]+)"', webpage, 'application', default='vier_vod') +        filename = self._search_regex( +            r'"filename"\s*:\s*"([^"]+)"', webpage, 'filename') + +        playlist_url = 'http://vod.streamcloud.be/%s/mp4:_definst_/%s.mp4/playlist.m3u8' % (application, filename) +        formats = self._extract_m3u8_formats(playlist_url, display_id, 'mp4') + +        title = self._og_search_title(webpage, default=display_id) +        description = self._og_search_description(webpage, default=None) +        thumbnail = self._og_search_thumbnail(webpage, default=None) + +        return { +            'id': video_id, +            'display_id': display_id, +            'title': title, +            'description': description, +            'thumbnail': thumbnail, +            'formats': formats, +        } + + +class VierVideosIE(InfoExtractor): +    IE_NAME = 'vier:videos' +    _VALID_URL = r'https?://(?:www\.)?vier\.be/(?P<program>[^/]+)/videos(?:\?.*\bpage=(?P<page>\d+)|$)' +    _TESTS = [{ +        'url': 'http://www.vier.be/demoestuin/videos', +        'info_dict': { +            'id': 'demoestuin', +        }, +        'playlist_mincount': 153, +    }, { +        'url': 'http://www.vier.be/demoestuin/videos?page=6', +        'info_dict': { +            'id': 'demoestuin-page6', +        }, +        'playlist_mincount': 20, +    }, { +        'url': 'http://www.vier.be/demoestuin/videos?page=7', +        'info_dict': { +            'id': 'demoestuin-page7', +        }, +        'playlist_mincount': 13, +    }] + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        program = mobj.group('program') + +        webpage = self._download_webpage(url, program) + +        page_id = mobj.group('page') +        if page_id: +            page_id = int(page_id) +            start_page = page_id +            last_page = start_page + 1 +            playlist_id = '%s-page%d' % (program, page_id) +        else: +            start_page = 0 +            last_page = int(self._search_regex( +                r'videos\?page=(\d+)">laatste</a>', +                webpage, 'last page', default=0)) + 1 +            playlist_id = program + +        entries = [] +        for current_page_id in range(start_page, last_page): +            current_page = self._download_webpage( +                'http://www.vier.be/%s/videos?page=%d' % (program, current_page_id), +                program, +                'Downloading page %d' % (current_page_id + 1)) if current_page_id != page_id else webpage +            page_entries = [ +                self.url_result('http://www.vier.be' + video_url, 'Vier') +                for video_url in re.findall( +                    r'<h3><a href="(/[^/]+/videos/[^/]+(?:/\d+)?)">', current_page)] +            entries.extend(page_entries) + +        return self.playlist_result(entries, playlist_id) diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py index 15f315298..944901e14 100644 --- a/youtube_dl/extractor/viki.py +++ b/youtube_dl/extractor/viki.py @@ -17,7 +17,6 @@ class VikiIE(SubtitlesInfoExtractor):      _VALID_URL = r'^https?://(?:www\.)?viki\.com/videos/(?P<id>[0-9]+v)'      _TEST = {          'url': 'http://www.viki.com/videos/1023585v-heirs-episode-14', -        'md5': 'a21454021c2646f5433514177e2caa5f',          'info_dict': {              'id': '1023585v',              'ext': 'mp4', @@ -31,8 +30,7 @@ class VikiIE(SubtitlesInfoExtractor):      }      def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        video_id = mobj.group(1) +        video_id = self._match_id(url)          webpage = self._download_webpage(url, video_id)          title = self._og_search_title(webpage) diff --git a/youtube_dl/extractor/vimple.py b/youtube_dl/extractor/vimple.py index 33d370e1c..ee3d86117 100644 --- a/youtube_dl/extractor/vimple.py +++ b/youtube_dl/extractor/vimple.py @@ -14,28 +14,17 @@ class VimpleIE(InfoExtractor):      IE_DESC = 'Vimple.ru'      _VALID_URL = r'https?://(player.vimple.ru/iframe|vimple.ru)/(?P<id>[a-f0-9]{10,})'      _TESTS = [ -        # Quality: Large, from iframe          { -            'url': 'http://player.vimple.ru/iframe/b132bdfd71b546d3972f9ab9a25f201c', +            'url': 'http://vimple.ru/c0f6b1687dcd4000a97ebe70068039cf', +            'md5': '2e750a330ed211d3fd41821c6ad9a279',              'info_dict': { -                'id': 'b132bdfd71b546d3972f9ab9a25f201c', -                'title': 'great-escape-minecraft.flv', +                'id': 'c0f6b1687dcd4000a97ebe70068039cf',                  'ext': 'mp4', -                'duration': 352, -                'webpage_url': 'http://vimple.ru/b132bdfd71b546d3972f9ab9a25f201c', +                'title': 'Sunset', +                'duration': 20, +                'thumbnail': 're:https?://.*?\.jpg',              },          }, -        # Quality: Medium, from mainpage -        { -            'url': 'http://vimple.ru/a15950562888453b8e6f9572dc8600cd', -            'info_dict': { -                'id': 'a15950562888453b8e6f9572dc8600cd', -                'title': 'DB 01', -                'ext': 'flv', -                'duration': 1484, -                'webpage_url': 'http://vimple.ru/a15950562888453b8e6f9572dc8600cd', -            } -        },      ]      def _real_extract(self, url): diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index 542e9198a..81e02a624 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -164,6 +164,14 @@ class VKIE(InfoExtractor):              self.to_screen('Youtube video detected')              return self.url_result(m_yt.group(1), 'Youtube') +        m_rutube = re.search( +            r'\ssrc="((?:https?:)?//rutube\.ru\\?/video\\?/embed(?:.*?))\\?"', info_page) +        if m_rutube is not None: +            self.to_screen('rutube video detected') +            rutube_url = self._proto_relative_url( +                m_rutube.group(1).replace('\\', '')) +            return self.url_result(rutube_url) +          m_opts = re.search(r'(?s)var\s+opts\s*=\s*({.*?});', info_page)          if m_opts:              m_opts_url = re.search(r"url\s*:\s*'([^']+)", m_opts.group(1)) diff --git a/youtube_dl/extractor/washingtonpost.py b/youtube_dl/extractor/washingtonpost.py index 88bbbb219..c17bebd6e 100644 --- a/youtube_dl/extractor/washingtonpost.py +++ b/youtube_dl/extractor/washingtonpost.py @@ -10,14 +10,14 @@ from ..utils import (  class WashingtonPostIE(InfoExtractor): -    _VALID_URL = r'^https?://(?:www\.)?washingtonpost\.com/.*?/(?P<id>[^/]+)/(?:$|[?#])' +    _VALID_URL = r'https?://(?:www\.)?washingtonpost\.com/.*?/(?P<id>[^/]+)/(?:$|[?#])'      _TEST = {          'url': 'http://www.washingtonpost.com/sf/national/2014/03/22/sinkhole-of-bureaucracy/',          'info_dict': {              'title': 'Sinkhole of bureaucracy',          },          'playlist': [{ -            'md5': 'c3f4b4922ffa259243f68e928db2db8c', +            'md5': '79132cc09ec5309fa590ae46e4cc31bc',              'info_dict': {                  'id': 'fc433c38-b146-11e3-b8b3-44b1d1cd4c1f',                  'ext': 'mp4', @@ -29,7 +29,7 @@ class WashingtonPostIE(InfoExtractor):                  'upload_date': '20140322',              },          }, { -            'md5': 'f645a07652c2950cd9134bb852c5f5eb', +            'md5': 'e1d5734c06865cc504ad99dc2de0d443',              'info_dict': {                  'id': '41255e28-b14a-11e3-b8b3-44b1d1cd4c1f',                  'ext': 'mp4', @@ -44,10 +44,9 @@ class WashingtonPostIE(InfoExtractor):      }      def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        page_id = mobj.group('id') - +        page_id = self._match_id(url)          webpage = self._download_webpage(url, page_id) +          title = self._og_search_title(webpage)          uuids = re.findall(r'data-video-uuid="([^"]+)"', webpage)          entries = [] diff --git a/youtube_dl/extractor/wdr.py b/youtube_dl/extractor/wdr.py index 8e25ecf28..45466e31b 100644 --- a/youtube_dl/extractor/wdr.py +++ b/youtube_dl/extractor/wdr.py @@ -1,6 +1,7 @@  # -*- coding: utf-8 -*-  from __future__ import unicode_literals +import itertools  import re  from .common import InfoExtractor @@ -67,6 +68,10 @@ class WDRIE(InfoExtractor):                  'upload_date': '20140717',              },          }, +        { +            'url': 'http://www1.wdr.de/mediathek/video/sendungen/quarks_und_co/filterseite-quarks-und-co100.html', +            'playlist_mincount': 146, +        }      ]      def _real_extract(self, url): @@ -81,6 +86,27 @@ class WDRIE(InfoExtractor):                  self.url_result(page_url + href, 'WDR')                  for href in re.findall(r'<a href="/?(.+?%s\.html)" rel="nofollow"' % self._PLAYER_REGEX, webpage)              ] + +            if entries:  # Playlist page +                return self.playlist_result(entries, page_id) + +            # Overview page +            entries = [] +            for page_num in itertools.count(2): +                hrefs = re.findall( +                    r'<li class="mediathekvideo"\s*>\s*<img[^>]*>\s*<a href="(/mediathek/video/[^"]+)"', +                    webpage) +                entries.extend( +                    self.url_result(page_url + href, 'WDR') +                    for href in hrefs) +                next_url_m = re.search( +                    r'<li class="nextToLast">\s*<a href="([^"]+)"', webpage) +                if not next_url_m: +                    break +                next_url = page_url + next_url_m.group(1) +                webpage = self._download_webpage( +                    next_url, page_id, +                    note='Downloading playlist page %d' % page_num)              return self.playlist_result(entries, page_id)          flashvars = compat_parse_qs( @@ -172,8 +198,7 @@ class WDRMausIE(InfoExtractor):      }]      def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        video_id = mobj.group('id') +        video_id = self._match_id(url)          webpage = self._download_webpage(url, video_id)          param_code = self._html_search_regex( @@ -224,5 +249,3 @@ class WDRMausIE(InfoExtractor):              'thumbnail': thumbnail,              'upload_date': upload_date,          } - -# TODO test _1 diff --git a/youtube_dl/extractor/webofstories.py b/youtube_dl/extractor/webofstories.py new file mode 100644 index 000000000..396cf4e83 --- /dev/null +++ b/youtube_dl/extractor/webofstories.py @@ -0,0 +1,102 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import int_or_none + + +class WebOfStoriesIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?webofstories\.com/play/(?:[^/]+/)?(?P<id>[0-9]+)' +    _VIDEO_DOMAIN = 'http://eu-mobile.webofstories.com/' +    _GREAT_LIFE_STREAMER = 'rtmp://eu-cdn1.webofstories.com/cfx/st/' +    _USER_STREAMER = 'rtmp://eu-users.webofstories.com/cfx/st/' +    _TESTS = [ +        { +            'url': 'http://www.webofstories.com/play/hans.bethe/71', +            'md5': '373e4dd915f60cfe3116322642ddf364', +            'info_dict': { +                'id': '4536', +                'ext': 'mp4', +                'title': 'The temperature of the sun', +                'thumbnail': 're:^https?://.*\.jpg$', +                'description': 'Hans Bethe talks about calculating the temperature of the sun', +                'duration': 238, +            } +        }, +        { +            'url': 'http://www.webofstories.com/play/55908', +            'md5': '2985a698e1fe3211022422c4b5ed962c', +            'info_dict': { +                'id': '55908', +                'ext': 'mp4', +                'title': 'The story of Gemmata obscuriglobus', +                'thumbnail': 're:^https?://.*\.jpg$', +                'description': 'Planctomycete talks about The story of Gemmata obscuriglobus', +                'duration': 169, +            } +        }, +    ] + +    def _real_extract(self, url): +        video_id = self._match_id(url) + +        webpage = self._download_webpage(url, video_id) +        title = self._og_search_title(webpage) +        description = self._html_search_meta('description', webpage) +        thumbnail = self._og_search_thumbnail(webpage) + +        story_filename = self._search_regex( +            r'\.storyFileName\("([^"]+)"\)', webpage, 'story filename') +        speaker_id = self._search_regex( +            r'\.speakerId\("([^"]+)"\)', webpage, 'speaker ID') +        story_id = self._search_regex( +            r'\.storyId\((\d+)\)', webpage, 'story ID') +        speaker_type = self._search_regex( +            r'\.speakerType\("([^"]+)"\)', webpage, 'speaker type') +        great_life = self._search_regex( +            r'isGreatLifeStory\s*=\s*(true|false)', webpage, 'great life story') +        is_great_life_series = great_life == 'true' +        duration = int_or_none(self._search_regex( +            r'\.duration\((\d+)\)', webpage, 'duration', fatal=False)) + +        # URL building, see: http://www.webofstories.com/scripts/player.js +        ms_prefix = '' +        if speaker_type.lower() == 'ms': +            ms_prefix = 'mini_sites/' + +        if is_great_life_series: +            mp4_url = '{0:}lives/{1:}/{2:}.mp4'.format( +                self._VIDEO_DOMAIN, speaker_id, story_filename) +            rtmp_ext = 'flv' +            streamer = self._GREAT_LIFE_STREAMER +            play_path = 'stories/{0:}/{1:}'.format( +                speaker_id, story_filename) +        else: +            mp4_url = '{0:}{1:}{2:}/{3:}.mp4'.format( +                self._VIDEO_DOMAIN, ms_prefix, speaker_id, story_filename) +            rtmp_ext = 'mp4' +            streamer = self._USER_STREAMER +            play_path = 'mp4:{0:}{1:}/{2}.mp4'.format( +                ms_prefix, speaker_id, story_filename) + +        formats = [{ +            'format_id': 'mp4_sd', +            'url': mp4_url, +        }, { +            'format_id': 'rtmp_sd', +            'page_url': url, +            'url': streamer, +            'ext': rtmp_ext, +            'play_path': play_path, +        }] + +        self._sort_formats(formats) + +        return { +            'id': story_id, +            'title': title, +            'formats': formats, +            'thumbnail': thumbnail, +            'description': description, +            'duration': duration, +        } diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py index 6b37bcbc9..4527567f8 100644 --- a/youtube_dl/extractor/xhamster.py +++ b/youtube_dl/extractor/xhamster.py @@ -14,7 +14,7 @@ from ..utils import (  class XHamsterIE(InfoExtractor):      """Information Extractor for xHamster""" -    _VALID_URL = r'http://(?:.+?\.)?xhamster\.com/movies/(?P<id>[0-9]+)/(?P<seo>.+?)\.html(?:\?.*)?' +    _VALID_URL = r'(?P<proto>https?)://(?:.+?\.)?xhamster\.com/movies/(?P<id>[0-9]+)/(?P<seo>.+?)\.html(?:\?.*)?'      _TESTS = [          {              'url': 'http://xhamster.com/movies/1509445/femaleagent_shy_beauty_takes_the_bait.html', @@ -39,7 +39,11 @@ class XHamsterIE(InfoExtractor):                  'duration': 200,                  'age_limit': 18,              } -        } +        }, +        { +            'url': 'https://xhamster.com/movies/2272726/amber_slayed_by_the_knight.html', +            'only_matching': True, +        },      ]      def _real_extract(self, url): @@ -57,7 +61,8 @@ class XHamsterIE(InfoExtractor):          video_id = mobj.group('id')          seo = mobj.group('seo') -        mrss_url = 'http://xhamster.com/movies/%s/%s.html' % (video_id, seo) +        proto = mobj.group('proto') +        mrss_url = '%s://xhamster.com/movies/%s/%s.html' % (proto, video_id, seo)          webpage = self._download_webpage(mrss_url, video_id)          title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>', webpage, 'title') diff --git a/youtube_dl/extractor/xtube.py b/youtube_dl/extractor/xtube.py index cf74d4fd5..e8490b028 100644 --- a/youtube_dl/extractor/xtube.py +++ b/youtube_dl/extractor/xtube.py @@ -40,7 +40,7 @@ class XTubeIE(InfoExtractor):              r'<p class="title">([^<]+)', webpage, 'title')          video_uploader = self._html_search_regex(              [r"var\s+contentOwnerId\s*=\s*'([^']+)", -             r'By:\s*<a href="/community/profile\.php?user=([^"]+)'], +             r'By:\s*<a href="/community/profile\.php\?user=([^"]+)'],              webpage, 'uploader', fatal=False)          video_description = self._html_search_regex(              r'<p class="fieldsDesc">([^<]+)', @@ -95,6 +95,7 @@ class XTubeUserIE(InfoExtractor):          'url': 'http://www.xtube.com/community/profile.php?user=greenshowers',          'info_dict': {              'id': 'greenshowers', +            'age_limit': 18,          },          'playlist_mincount': 155,      } @@ -124,6 +125,7 @@ class XTubeUserIE(InfoExtractor):          return {              '_type': 'playlist',              'id': username, +            'age_limit': 18,              'entries': [{                  '_type': 'url',                  'url': eurl, diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 7f5aeb25b..bc18276d6 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -256,7 +256,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):          '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},          '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},          '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, -        '138': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, +        '138': {'ext': 'mp4', 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},  # Height can vary (https://github.com/rg3/youtube-dl/issues/4559)          '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},          '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},          '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'}, @@ -264,9 +264,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):          '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'h264'},          # Dash mp4 audio -        '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50}, -        '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50}, -        '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50}, +        '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 48, 'preference': -50}, +        '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 128, 'preference': -50}, +        '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 256, 'preference': -50},          # Dash webm          '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40}, @@ -287,7 +287,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):          '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},          '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},          '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'}, +        '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},          '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'VP9'}, +        '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},          # Dash webm audio          '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50}, @@ -412,7 +414,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):                  'id': 'HtVdAasjOgU',                  'ext': 'mp4',                  'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer', -                'description': 'md5:eca57043abae25130f58f655ad9a7771', +                'description': 're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',                  'uploader': 'The Witcher',                  'uploader_id': 'WitcherGame',                  'upload_date': '20140605', @@ -736,6 +738,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):                  'format_id': format_id,                  'url': video_url,                  'width': int_or_none(r.attrib.get('width')), +                'height': int_or_none(r.attrib.get('height')),                  'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),                  'asr': int_or_none(r.attrib.get('audioSamplingRate')),                  'filesize': filesize, @@ -746,7 +749,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):                      fo for fo in formats                      if fo['format_id'] == format_id)              except StopIteration: -                f.update(self._formats.get(format_id, {})) +                f.update(self._formats.get(format_id, {}).items())                  formats.append(f)              else:                  existing_format.update(f) @@ -1040,6 +1043,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):                      self.report_warning(                          'Skipping DASH manifest: %r' % e, video_id)                  else: +                    # Hide the formats we found through non-DASH +                    dash_keys = set(df['format_id'] for df in dash_formats) +                    for f in formats: +                        if f['format_id'] in dash_keys: +                            f['format_id'] = 'nondash-%s' % f['format_id'] +                            f['preference'] = f.get('preference', 0) - 10000                      formats.extend(dash_formats)          self._sort_formats(formats) @@ -1199,9 +1208,6 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):          if playlist_id.startswith('RD'):              # Mixes require a custom extraction process              return self._extract_mix(playlist_id) -        if playlist_id.startswith('TL'): -            raise ExtractorError('For downloading YouTube.com top lists, use ' -                                 'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)          url = self._TEMPLATE_URL % playlist_id          page = self._download_webpage(url, playlist_id) @@ -1247,49 +1253,6 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):          return self.playlist_result(url_results, playlist_id, playlist_title) -class YoutubeTopListIE(YoutubePlaylistIE): -    IE_NAME = 'youtube:toplist' -    IE_DESC = ('YouTube.com top lists, "yttoplist:{channel}:{list title}"' -               ' (Example: "yttoplist:music:Top Tracks")') -    _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$' -    _TESTS = [{ -        'url': 'yttoplist:music:Trending', -        'playlist_mincount': 5, -        'skip': 'Only works for logged-in users', -    }] - -    def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        channel = mobj.group('chann') -        title = mobj.group('title') -        query = compat_urllib_parse.urlencode({'title': title}) -        channel_page = self._download_webpage( -            'https://www.youtube.com/%s' % channel, title) -        link = self._html_search_regex( -            r'''(?x) -                <a\s+href="([^"]+)".*?>\s* -                <span\s+class="branded-page-module-title-text">\s* -                <span[^>]*>.*?%s.*?</span>''' % re.escape(query), -            channel_page, 'list') -        url = compat_urlparse.urljoin('https://www.youtube.com/', link) - -        video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"' -        ids = [] -        # sometimes the webpage doesn't contain the videos -        # retry until we get them -        for i in itertools.count(0): -            msg = 'Downloading Youtube mix' -            if i > 0: -                msg += ', retry #%d' % i - -            webpage = self._download_webpage(url, title, msg) -            ids = orderedSet(re.findall(video_re, webpage)) -            if ids: -                break -        url_results = self._ids_to_results(ids) -        return self.playlist_result(url_results, playlist_title=title) - -  class YoutubeChannelIE(InfoExtractor):      IE_DESC = 'YouTube.com channels'      _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)' @@ -1701,3 +1664,20 @@ class YoutubeTruncatedURLIE(InfoExtractor):              '"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '              ' or simply  youtube-dl BaW_jenozKc  .',              expected=True) + + +class YoutubeTruncatedIDIE(InfoExtractor): +    IE_NAME = 'youtube:truncated_id' +    IE_DESC = False  # Do not list +    _VALID_URL = r'https?://(?:www\.)youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$' + +    _TESTS = [{ +        'url': 'https://www.youtube.com/watch?v=N_708QY7Ob', +        'only_matching': True, +    }] + +    def _real_extract(self, url): +        video_id = self._match_id(url) +        raise ExtractorError( +            'Incomplete YouTube ID %s. URL %s looks truncated.' % (video_id, url), +            expected=True) diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py index 74c76a9a0..98f15177b 100644 --- a/youtube_dl/extractor/zdf.py +++ b/youtube_dl/extractor/zdf.py @@ -119,7 +119,7 @@ class ZDFChannelIE(InfoExtractor):          'info_dict': {              'id': '1586442',          }, -        'playlist_count': 4, +        'playlist_count': 3,      }      _PAGE_SIZE = 50 diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 21c452141..14006178d 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -109,7 +109,7 @@ def parseOpts(overrideArguments=None):      kw = {          'version': __version__,          'formatter': fmt, -        'usage': '%prog [options] url [url...]', +        'usage': '%prog [OPTIONS] URL [URL...]',          'conflict_handler': 'resolve',      } @@ -267,10 +267,12 @@ def parseOpts(overrideArguments=None):          action='store', dest='format', metavar='FORMAT', default=None,          help=(              'video format code, specify the order of preference using' -            ' slashes: -f 22/17/18 .  -f mp4 , -f m4a and  -f flv  are also' -            ' supported. You can also use the special names "best",' -            ' "bestvideo", "bestaudio", "worst", "worstvideo" and' -            ' "worstaudio". By default, youtube-dl will pick the best quality.' +            ' slashes, as in -f 22/17/18 . ' +            ' Instead of format codes, you can select by extension for the ' +            'extensions aac, m4a, mp3, mp4, ogg, wav, webm. ' +            'You can also use the special names "best",' +            ' "bestvideo", "bestaudio", "worst". ' +            ' By default, youtube-dl will pick the best quality.'              ' Use commas to download multiple audio formats, such as'              ' -f  136/137/mp4/bestvideo,140/m4a/bestaudio.'              ' You can merge the video and audio of two formats into a single' @@ -300,6 +302,12 @@ def parseOpts(overrideArguments=None):          '--youtube-skip-dash-manifest',          action='store_false', dest='youtube_include_dash_manifest',          help='Do not download the DASH manifest on YouTube videos') +    video_format.add_option( +        '--merge-output-format', +        action='store', dest='merge_output_format', metavar='FORMAT', default=None, +        help=( +            'If a merge is required (e.g. bestvideo+bestaudio), output to given container format. One of mkv, mp4, ogg, webm, flv.' +            'Ignored if no merge is required'))      subtitles = optparse.OptionGroup(parser, 'Subtitle Options')      subtitles.add_option( @@ -444,6 +452,11 @@ def parseOpts(overrideArguments=None):          action='store_true', dest='dump_single_json', default=False,          help='simulate, quiet but print JSON information for each command-line argument. If the URL refers to a playlist, dump the whole playlist information in a single line.')      verbosity.add_option( +        '--print-json', +        action='store_true', dest='print_json', default=False, +        help='Be quiet and print the video information as JSON (video is still being downloaded).', +    ) +    verbosity.add_option(          '--newline',          action='store_true', dest='progress_with_newline', default=False,          help='output progress bar as new lines') diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index 048525efc..d1b342c7a 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -80,8 +80,9 @@ class FFmpegPostProcessor(PostProcessor):          files_cmd = []          for path in input_paths: -            files_cmd.extend(['-i', encodeFilename(path, True)]) -        cmd = ([self._executable, '-y'] + files_cmd +            files_cmd.extend([encodeArgument('-i'), encodeFilename(path, True)]) +        cmd = ([encodeFilename(self._executable, True), encodeArgument('-y')] + +               files_cmd                 + [encodeArgument(o) for o in opts] +                 [encodeFilename(self._ffmpeg_filename_argument(out_path), True)]) @@ -122,8 +123,8 @@ class FFmpegExtractAudioPP(FFmpegPostProcessor):              raise PostProcessingError('ffprobe or avprobe not found. Please install one.')          try:              cmd = [ -                self._probe_executable, -                '-show_streams', +                encodeFilename(self._probe_executable, True), +                encodeArgument('-show_streams'),                  encodeFilename(self._ffmpeg_filename_argument(path), True)]              handle = subprocess.Popen(cmd, stderr=compat_subprocess_get_DEVNULL(), stdout=subprocess.PIPE)              output = handle.communicate()[0] @@ -520,7 +521,7 @@ class FFmpegMetadataPP(FFmpegPostProcessor):  class FFmpegMergerPP(FFmpegPostProcessor):      def run(self, info):          filename = info['filepath'] -        args = ['-c', 'copy', '-map', '0:v:0', '-map', '1:a:0', '-shortest'] +        args = ['-c', 'copy', '-map', '0:v:0', '-map', '1:a:0']          self._downloader.to_screen('[ffmpeg] Merging formats into "%s"' % filename)          self.run_ffmpeg_multiple_files(info['__files_to_merge'], filename, args)          return True, info diff --git a/youtube_dl/update.py b/youtube_dl/update.py index 2d2703368..3f9c5249d 100644 --- a/youtube_dl/update.py +++ b/youtube_dl/update.py @@ -13,6 +13,7 @@ from .compat import (      compat_str,      compat_urllib_request,  ) +from .utils import make_HTTPS_handler  from .version import __version__ @@ -58,9 +59,12 @@ def update_self(to_screen, verbose):          to_screen('It looks like you installed youtube-dl with a package manager, pip, setup.py or a tarball. Please use that to update.')          return +    https_handler = make_HTTPS_handler(False) +    opener = compat_urllib_request.build_opener(https_handler) +      # Check if there is a new version      try: -        newversion = compat_urllib_request.urlopen(VERSION_URL).read().decode('utf-8').strip() +        newversion = opener.open(VERSION_URL).read().decode('utf-8').strip()      except:          if verbose:              to_screen(compat_str(traceback.format_exc())) @@ -72,7 +76,7 @@ def update_self(to_screen, verbose):      # Download and check versions info      try: -        versions_info = compat_urllib_request.urlopen(JSON_URL).read().decode('utf-8') +        versions_info = opener.open(JSON_URL).read().decode('utf-8')          versions_info = json.loads(versions_info)      except:          if verbose: @@ -120,7 +124,7 @@ def update_self(to_screen, verbose):              return          try: -            urlh = compat_urllib_request.urlopen(version['exe'][0]) +            urlh = opener.open(version['exe'][0])              newcontent = urlh.read()              urlh.close()          except (IOError, OSError): @@ -166,7 +170,7 @@ start /b "" cmd /c del "%%~f0"&exit /b"      # Zip unix package      elif isinstance(globals().get('__loader__'), zipimporter):          try: -            urlh = compat_urllib_request.urlopen(version['bin'][0]) +            urlh = opener.open(version['bin'][0])              newcontent = urlh.read()              urlh.close()          except (IOError, OSError): diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index efbe64fb3..079e8d2c3 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -205,6 +205,10 @@ def get_element_by_attribute(attribute, value, html):  def clean_html(html):      """Clean an HTML snippet into a readable string""" + +    if html is None:  # Convenience for sanitizing descriptions etc. +        return html +      # Newline vs <br />      html = html.replace('\n', ' ')      html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html) @@ -1550,3 +1554,23 @@ def ytdl_is_updateable():  def args_to_str(args):      # Get a short string representation for a subprocess command      return ' '.join(shlex_quote(a) for a in args) + + +def urlhandle_detect_ext(url_handle): +    try: +        url_handle.headers +        getheader = lambda h: url_handle.headers[h] +    except AttributeError:  # Python < 3 +        getheader = url_handle.info().getheader + +    return getheader('Content-Type').split("/")[1] + + +def age_restricted(content_limit, age_limit): +    """ Returns True iff the content should be blocked """ + +    if age_limit is None:  # No limit set +        return False +    if content_limit is None: +        return False  # Content available for everyone +    return age_limit < content_limit diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 1420af746..8c57c7413 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@  from __future__ import unicode_literals -__version__ = '2014.12.17.2' +__version__ = '2015.01.09.2' | 
