diff options
| author | rzhxeo <rzhxeot7z81b4700@mailcatch.com> | 2013-11-18 00:27:06 +0100 | 
|---|---|---|
| committer | rzhxeo <rzhxeot7z81b4700@mailcatch.com> | 2013-11-18 00:27:06 +0100 | 
| commit | 2b35c9ef742bf261078ea10c6c0bba848db1a0df (patch) | |
| tree | fe80c838c7529c8cab6f1b44d730a2849cd68c48 | |
| parent | 4894fe8c5baec8b1f21ac6fdebe08175abc7f094 (diff) | |
| parent | 73c566695fac926e7e9e6922fe4e6d82c64a1850 (diff) | |
Merge branch 'master' into rtmpdump
Conflicts:
	youtube_dl/FileDownloader.py
Merge
64 files changed, 1653 insertions, 638 deletions
| @@ -92,12 +92,14 @@ which means you can modify it, redistribute it or use it however you like.                                 ownloads/%(uploader)s/%(title)s-%(id)s.%(ext)s' .      --autonumber-size NUMBER   Specifies the number of digits in %(autonumber)s                                 when it is present in output filename template or -                               --autonumber option is given +                               --auto-number option is given      --restrict-filenames       Restrict filenames to only ASCII characters, and                                 avoid "&" and spaces in filenames      -a, --batch-file FILE      file containing URLs to download ('-' for stdin)      -w, --no-overwrites        do not overwrite files -    -c, --continue             resume partially downloaded files +    -c, --continue             force resume of partially downloaded files. By +                               default, youtube-dl will resume downloads if +                               possible.      --no-continue              do not resume partially downloaded files (restart                                 from beginning)      --cookies FILE             file to read cookies from and dump cookie jar in @@ -11,6 +11,7 @@ try:      setuptools_available = True  except ImportError:      from distutils.core import setup +    setuptools_available = False  try:      # This will create an exe that needs Microsoft Visual C++ 2008 diff --git a/test/helper.py b/test/helper.py index 777119ea5..d7bf7a828 100644 --- a/test/helper.py +++ b/test/helper.py @@ -5,9 +5,11 @@ import json  import os.path  import re  import types +import sys  import youtube_dl.extractor  from youtube_dl import YoutubeDL +from youtube_dl.utils import preferredencoding  def global_setup(): @@ -33,6 +35,21 @@ def try_rm(filename):              raise +def report_warning(message): +    ''' +    Print the message to stderr, it will be prefixed with 'WARNING:' +    If stderr is a tty file the 'WARNING:' will be colored +    ''' +    if sys.stderr.isatty() and os.name != 'nt': +        _msg_header = u'\033[0;33mWARNING:\033[0m' +    else: +        _msg_header = u'WARNING:' +    output = u'%s %s\n' % (_msg_header, message) +    if 'b' in getattr(sys.stderr, 'mode', '') or sys.version_info[0] < 3: +        output = output.encode(preferredencoding()) +    sys.stderr.write(output) + +  class FakeYDL(YoutubeDL):      def __init__(self, override=None):          # Different instances of the downloader can't share the same dictionary diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index f8cd1bdce..58cf9c313 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -62,10 +62,10 @@ class TestFormatSelection(unittest.TestCase):      def test_format_limit(self):          formats = [ -            {u'format_id': u'meh'}, -            {u'format_id': u'good'}, -            {u'format_id': u'great'}, -            {u'format_id': u'excellent'}, +            {u'format_id': u'meh', u'url': u'http://example.com/meh'}, +            {u'format_id': u'good', u'url': u'http://example.com/good'}, +            {u'format_id': u'great', u'url': u'http://example.com/great'}, +            {u'format_id': u'excellent', u'url': u'http://example.com/exc'},          ]          info_dict = {              u'formats': formats, u'extractor': u'test', 'id': 'testvid'} @@ -128,6 +128,18 @@ class TestFormatSelection(unittest.TestCase):          downloaded = ydl.downloaded_info_dicts[0]          self.assertEqual(downloaded['format_id'], u'35') +    def test_add_extra_info(self): +        test_dict = { +            'extractor': 'Foo', +        } +        extra_info = { +            'extractor': 'Bar', +            'playlist': 'funny videos', +        } +        YDL.add_extra_info(test_dict, extra_info) +        self.assertEqual(test_dict['extractor'], 'Foo') +        self.assertEqual(test_dict['playlist'], 'funny videos') +  if __name__ == '__main__':      unittest.main() diff --git a/test/test_dailymotion_subtitles.py b/test/test_dailymotion_subtitles.py deleted file mode 100644 index c596415c4..000000000 --- a/test/test_dailymotion_subtitles.py +++ /dev/null @@ -1,70 +0,0 @@ -#!/usr/bin/env python - -# Allow direct execution -import os -import sys -import unittest -sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - -from test.helper import FakeYDL, global_setup, md5 -global_setup() - - -from youtube_dl.extractor import DailymotionIE - -class TestDailymotionSubtitles(unittest.TestCase): -    def setUp(self): -        self.DL = FakeYDL() -        self.url = 'http://www.dailymotion.com/video/xczg00' -    def getInfoDict(self): -        IE = DailymotionIE(self.DL) -        info_dict = IE.extract(self.url) -        return info_dict -    def getSubtitles(self): -        info_dict = self.getInfoDict() -        return info_dict[0]['subtitles'] -    def test_no_writesubtitles(self): -        subtitles = self.getSubtitles() -        self.assertEqual(subtitles, None) -    def test_subtitles(self): -        self.DL.params['writesubtitles'] = True -        subtitles = self.getSubtitles() -        self.assertEqual(md5(subtitles['en']), '976553874490cba125086bbfea3ff76f') -    def test_subtitles_lang(self): -        self.DL.params['writesubtitles'] = True -        self.DL.params['subtitleslangs'] = ['fr'] -        subtitles = self.getSubtitles() -        self.assertEqual(md5(subtitles['fr']), '594564ec7d588942e384e920e5341792') -    def test_allsubtitles(self): -        self.DL.params['writesubtitles'] = True -        self.DL.params['allsubtitles'] = True -        subtitles = self.getSubtitles() -        self.assertEqual(len(subtitles.keys()), 5) -    def test_list_subtitles(self): -        self.DL.expect_warning(u'Automatic Captions not supported by this server') -        self.DL.params['listsubtitles'] = True -        info_dict = self.getInfoDict() -        self.assertEqual(info_dict, None) -    def test_automatic_captions(self): -        self.DL.expect_warning(u'Automatic Captions not supported by this server') -        self.DL.params['writeautomaticsub'] = True -        self.DL.params['subtitleslang'] = ['en'] -        subtitles = self.getSubtitles() -        self.assertTrue(len(subtitles.keys()) == 0) -    def test_nosubtitles(self): -        self.DL.expect_warning(u'video doesn\'t have subtitles') -        self.url = 'http://www.dailymotion.com/video/x12u166_le-zapping-tele-star-du-08-aout-2013_tv' -        self.DL.params['writesubtitles'] = True -        self.DL.params['allsubtitles'] = True -        subtitles = self.getSubtitles() -        self.assertEqual(len(subtitles), 0) -    def test_multiple_langs(self): -        self.DL.params['writesubtitles'] = True -        langs = ['es', 'fr', 'de'] -        self.DL.params['subtitleslangs'] = langs -        subtitles = self.getSubtitles() -        for lang in langs: -            self.assertTrue(subtitles.get(lang) is not None, u'Subtitles for \'%s\' not extracted' % lang) - -if __name__ == '__main__': -    unittest.main() diff --git a/test/test_download.py b/test/test_download.py index b9a9be11d..16f200809 100644 --- a/test/test_download.py +++ b/test/test_download.py @@ -6,7 +6,14 @@ import sys  import unittest  sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from test.helper import get_params, get_testcases, global_setup, try_rm, md5 +from test.helper import ( +    get_params, +    get_testcases, +    global_setup, +    try_rm, +    md5, +    report_warning +)  global_setup() @@ -19,10 +26,12 @@ import youtube_dl.YoutubeDL  from youtube_dl.utils import (      compat_str,      compat_urllib_error, +    compat_HTTPError,      DownloadError,      ExtractorError,      UnavailableVideoError,  ) +from youtube_dl.extractor import get_info_extractor  RETRIES = 3 @@ -55,17 +64,25 @@ def generator(test_case):      def test_template(self):          ie = youtube_dl.extractor.get_info_extractor(test_case['name']) +        other_ies = [get_info_extractor(ie_key) for ie_key in test_case.get('add_ie', [])]          def print_skipping(reason):              print('Skipping %s: %s' % (test_case['name'], reason)) -        if not ie._WORKING: +        if not ie.working():              print_skipping('IE marked as not _WORKING')              return -        if 'playlist' not in test_case and not test_case['file']: -            print_skipping('No output file specified') -            return +        if 'playlist' not in test_case: +            info_dict = test_case.get('info_dict', {}) +            if not test_case.get('file') and not (info_dict.get('id') and info_dict.get('ext')): +                print_skipping('The output file cannot be know, the "file" ' +                    'key is missing or the info_dict is incomplete') +                return          if 'skip' in test_case:              print_skipping(test_case['skip'])              return +        for other_ie in other_ies: +            if not other_ie.working(): +                print_skipping(u'test depends on %sIE, marked as not WORKING' % other_ie.ie_key()) +                return          params = get_params(test_case.get('params', {})) @@ -77,35 +94,47 @@ def generator(test_case):                  finished_hook_called.add(status['filename'])          ydl.fd.add_progress_hook(_hook) +        def get_tc_filename(tc): +            return tc.get('file') or ydl.prepare_filename(tc.get('info_dict', {})) +          test_cases = test_case.get('playlist', [test_case]) -        for tc in test_cases: -            try_rm(tc['file']) -            try_rm(tc['file'] + '.part') -            try_rm(tc['file'] + '.info.json') +        def try_rm_tcs_files(): +            for tc in test_cases: +                tc_filename = get_tc_filename(tc) +                try_rm(tc_filename) +                try_rm(tc_filename + '.part') +                try_rm(tc_filename + '.info.json') +        try_rm_tcs_files()          try: -            for retry in range(1, RETRIES + 1): +            try_num = 1 +            while True:                  try:                      ydl.download([test_case['url']])                  except (DownloadError, ExtractorError) as err: -                    if retry == RETRIES: raise -                      # Check if the exception is not a network related one -                    if not err.exc_info[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError): +                    if not err.exc_info[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError) or (err.exc_info[0] == compat_HTTPError and err.exc_info[1].code == 503):                          raise -                    print('Retrying: {0} failed tries\n\n##########\n\n'.format(retry)) +                    if try_num == RETRIES: +                        report_warning(u'Failed due to network errors, skipping...') +                        return + +                    print('Retrying: {0} failed tries\n\n##########\n\n'.format(try_num)) + +                    try_num += 1                  else:                      break              for tc in test_cases: +                tc_filename = get_tc_filename(tc)                  if not test_case.get('params', {}).get('skip_download', False): -                    self.assertTrue(os.path.exists(tc['file']), msg='Missing file ' + tc['file']) -                    self.assertTrue(tc['file'] in finished_hook_called) -                self.assertTrue(os.path.exists(tc['file'] + '.info.json')) +                    self.assertTrue(os.path.exists(tc_filename), msg='Missing file ' + tc_filename) +                    self.assertTrue(tc_filename in finished_hook_called) +                self.assertTrue(os.path.exists(tc_filename + '.info.json'))                  if 'md5' in tc: -                    md5_for_file = _file_md5(tc['file']) +                    md5_for_file = _file_md5(tc_filename)                      self.assertEqual(md5_for_file, tc['md5']) -                with io.open(tc['file'] + '.info.json', encoding='utf-8') as infof: +                with io.open(tc_filename + '.info.json', encoding='utf-8') as infof:                      info_dict = json.load(infof)                  for (info_field, expected) in tc.get('info_dict', {}).items():                      if isinstance(expected, compat_str) and expected.startswith('md5:'): @@ -125,11 +154,11 @@ def generator(test_case):                  # Check for the presence of mandatory fields                  for key in ('id', 'url', 'title', 'ext'):                      self.assertTrue(key in info_dict.keys() and info_dict[key]) +                # Check for mandatory fields that are automatically set by YoutubeDL +                for key in ['webpage_url', 'extractor', 'extractor_key']: +                    self.assertTrue(info_dict.get(key), u'Missing field: %s' % key)          finally: -            for tc in test_cases: -                try_rm(tc['file']) -                try_rm(tc['file'] + '.part') -                try_rm(tc['file'] + '.info.json') +            try_rm_tcs_files()      return test_template diff --git a/test/test_playlists.py b/test/test_playlists.py index d6a8d56df..706b6bdca 100644 --- a/test/test_playlists.py +++ b/test/test_playlists.py @@ -17,9 +17,11 @@ from youtube_dl.extractor import (      DailymotionUserIE,      VimeoChannelIE,      UstreamChannelIE, +    SoundcloudSetIE,      SoundcloudUserIE,      LivestreamIE,      NHLVideocenterIE, +    BambuserChannelIE,  ) @@ -60,6 +62,14 @@ class TestPlaylists(unittest.TestCase):          self.assertEqual(result['id'], u'5124905')          self.assertTrue(len(result['entries']) >= 11) +    def test_soundcloud_set(self): +        dl = FakeYDL() +        ie = SoundcloudSetIE(dl) +        result = ie.extract('https://soundcloud.com/the-concept-band/sets/the-royal-concept-ep') +        self.assertIsPlaylist(result) +        self.assertEqual(result['title'], u'The Royal Concept EP') +        self.assertTrue(len(result['entries']) >= 6) +      def test_soundcloud_user(self):          dl = FakeYDL()          ie = SoundcloudUserIE(dl) @@ -85,5 +95,13 @@ class TestPlaylists(unittest.TestCase):          self.assertEqual(result['title'], u'Highlights')          self.assertEqual(len(result['entries']), 12) +    def test_bambuser_channel(self): +        dl = FakeYDL() +        ie = BambuserChannelIE(dl) +        result = ie.extract('http://bambuser.com/channel/pixelversity') +        self.assertIsPlaylist(result) +        self.assertEqual(result['title'], u'pixelversity') +        self.assertTrue(len(result['entries']) >= 66) +  if __name__ == '__main__':      unittest.main() diff --git a/test/test_subtitles.py b/test/test_subtitles.py new file mode 100644 index 000000000..06a304879 --- /dev/null +++ b/test/test_subtitles.py @@ -0,0 +1,211 @@ +#!/usr/bin/env python + +# Allow direct execution +import os +import sys +import unittest +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from test.helper import FakeYDL, global_setup, md5 +global_setup() + + +from youtube_dl.extractor import ( +    YoutubeIE, +    DailymotionIE, +    TEDIE, +) + + +class BaseTestSubtitles(unittest.TestCase): +    url = None +    IE = None +    def setUp(self): +        self.DL = FakeYDL() +        self.ie = self.IE(self.DL) + +    def getInfoDict(self): +        info_dict = self.ie.extract(self.url) +        return info_dict + +    def getSubtitles(self): +        info_dict = self.getInfoDict() +        return info_dict['subtitles'] + + +class TestYoutubeSubtitles(BaseTestSubtitles): +    url = 'QRS8MkLhQmM' +    IE = YoutubeIE + +    def getSubtitles(self): +        info_dict = self.getInfoDict() +        return info_dict[0]['subtitles'] + +    def test_youtube_no_writesubtitles(self): +        self.DL.params['writesubtitles'] = False +        subtitles = self.getSubtitles() +        self.assertEqual(subtitles, None) + +    def test_youtube_subtitles(self): +        self.DL.params['writesubtitles'] = True +        subtitles = self.getSubtitles() +        self.assertEqual(md5(subtitles['en']), '4cd9278a35ba2305f47354ee13472260') + +    def test_youtube_subtitles_lang(self): +        self.DL.params['writesubtitles'] = True +        self.DL.params['subtitleslangs'] = ['it'] +        subtitles = self.getSubtitles() +        self.assertEqual(md5(subtitles['it']), '164a51f16f260476a05b50fe4c2f161d') + +    def test_youtube_allsubtitles(self): +        self.DL.params['writesubtitles'] = True +        self.DL.params['allsubtitles'] = True +        subtitles = self.getSubtitles() +        self.assertEqual(len(subtitles.keys()), 13) + +    def test_youtube_subtitles_sbv_format(self): +        self.DL.params['writesubtitles'] = True +        self.DL.params['subtitlesformat'] = 'sbv' +        subtitles = self.getSubtitles() +        self.assertEqual(md5(subtitles['en']), '13aeaa0c245a8bed9a451cb643e3ad8b') + +    def test_youtube_subtitles_vtt_format(self): +        self.DL.params['writesubtitles'] = True +        self.DL.params['subtitlesformat'] = 'vtt' +        subtitles = self.getSubtitles() +        self.assertEqual(md5(subtitles['en']), '356cdc577fde0c6783b9b822e7206ff7') + +    def test_youtube_list_subtitles(self): +        self.DL.expect_warning(u'Video doesn\'t have automatic captions') +        self.DL.params['listsubtitles'] = True +        info_dict = self.getInfoDict() +        self.assertEqual(info_dict, None) + +    def test_youtube_automatic_captions(self): +        self.url = '8YoUxe5ncPo' +        self.DL.params['writeautomaticsub'] = True +        self.DL.params['subtitleslangs'] = ['it'] +        subtitles = self.getSubtitles() +        self.assertTrue(subtitles['it'] is not None) + +    def test_youtube_nosubtitles(self): +        self.DL.expect_warning(u'video doesn\'t have subtitles') +        self.url = 'sAjKT8FhjI8' +        self.DL.params['writesubtitles'] = True +        self.DL.params['allsubtitles'] = True +        subtitles = self.getSubtitles() +        self.assertEqual(len(subtitles), 0) + +    def test_youtube_multiple_langs(self): +        self.url = 'QRS8MkLhQmM' +        self.DL.params['writesubtitles'] = True +        langs = ['it', 'fr', 'de'] +        self.DL.params['subtitleslangs'] = langs +        subtitles = self.getSubtitles() +        for lang in langs: +            self.assertTrue(subtitles.get(lang) is not None, u'Subtitles for \'%s\' not extracted' % lang) + + +class TestDailymotionSubtitles(BaseTestSubtitles): +    url = 'http://www.dailymotion.com/video/xczg00' +    IE = DailymotionIE + +    def test_no_writesubtitles(self): +        subtitles = self.getSubtitles() +        self.assertEqual(subtitles, None) + +    def test_subtitles(self): +        self.DL.params['writesubtitles'] = True +        subtitles = self.getSubtitles() +        self.assertEqual(md5(subtitles['en']), '976553874490cba125086bbfea3ff76f') + +    def test_subtitles_lang(self): +        self.DL.params['writesubtitles'] = True +        self.DL.params['subtitleslangs'] = ['fr'] +        subtitles = self.getSubtitles() +        self.assertEqual(md5(subtitles['fr']), '594564ec7d588942e384e920e5341792') + +    def test_allsubtitles(self): +        self.DL.params['writesubtitles'] = True +        self.DL.params['allsubtitles'] = True +        subtitles = self.getSubtitles() +        self.assertEqual(len(subtitles.keys()), 5) + +    def test_list_subtitles(self): +        self.DL.expect_warning(u'Automatic Captions not supported by this server') +        self.DL.params['listsubtitles'] = True +        info_dict = self.getInfoDict() +        self.assertEqual(info_dict, None) + +    def test_automatic_captions(self): +        self.DL.expect_warning(u'Automatic Captions not supported by this server') +        self.DL.params['writeautomaticsub'] = True +        self.DL.params['subtitleslang'] = ['en'] +        subtitles = self.getSubtitles() +        self.assertTrue(len(subtitles.keys()) == 0) + +    def test_nosubtitles(self): +        self.DL.expect_warning(u'video doesn\'t have subtitles') +        self.url = 'http://www.dailymotion.com/video/x12u166_le-zapping-tele-star-du-08-aout-2013_tv' +        self.DL.params['writesubtitles'] = True +        self.DL.params['allsubtitles'] = True +        subtitles = self.getSubtitles() +        self.assertEqual(len(subtitles), 0) + +    def test_multiple_langs(self): +        self.DL.params['writesubtitles'] = True +        langs = ['es', 'fr', 'de'] +        self.DL.params['subtitleslangs'] = langs +        subtitles = self.getSubtitles() +        for lang in langs: +            self.assertTrue(subtitles.get(lang) is not None, u'Subtitles for \'%s\' not extracted' % lang) + + +class TestTedSubtitles(BaseTestSubtitles): +    url = 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html' +    IE = TEDIE + +    def test_no_writesubtitles(self): +        subtitles = self.getSubtitles() +        self.assertEqual(subtitles, None) + +    def test_subtitles(self): +        self.DL.params['writesubtitles'] = True +        subtitles = self.getSubtitles() +        self.assertEqual(md5(subtitles['en']), '2154f31ff9b9f89a0aa671537559c21d') + +    def test_subtitles_lang(self): +        self.DL.params['writesubtitles'] = True +        self.DL.params['subtitleslangs'] = ['fr'] +        subtitles = self.getSubtitles() +        self.assertEqual(md5(subtitles['fr']), '7616cbc6df20ec2c1204083c83871cf6') + +    def test_allsubtitles(self): +        self.DL.params['writesubtitles'] = True +        self.DL.params['allsubtitles'] = True +        subtitles = self.getSubtitles() +        self.assertEqual(len(subtitles.keys()), 28) + +    def test_list_subtitles(self): +        self.DL.expect_warning(u'Automatic Captions not supported by this server') +        self.DL.params['listsubtitles'] = True +        info_dict = self.getInfoDict() +        self.assertEqual(info_dict, None) + +    def test_automatic_captions(self): +        self.DL.expect_warning(u'Automatic Captions not supported by this server') +        self.DL.params['writeautomaticsub'] = True +        self.DL.params['subtitleslang'] = ['en'] +        subtitles = self.getSubtitles() +        self.assertTrue(len(subtitles.keys()) == 0) + +    def test_multiple_langs(self): +        self.DL.params['writesubtitles'] = True +        langs = ['es', 'fr', 'de'] +        self.DL.params['subtitleslangs'] = langs +        subtitles = self.getSubtitles() +        for lang in langs: +            self.assertTrue(subtitles.get(lang) is not None, u'Subtitles for \'%s\' not extracted' % lang) + +if __name__ == '__main__': +    unittest.main() diff --git a/test/test_youtube_subtitles.py b/test/test_youtube_subtitles.py deleted file mode 100644 index 00430a338..000000000 --- a/test/test_youtube_subtitles.py +++ /dev/null @@ -1,95 +0,0 @@ -#!/usr/bin/env python - -# Allow direct execution -import os -import sys -import unittest -sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - -from test.helper import FakeYDL, global_setup, md5 -global_setup() - - -from youtube_dl.extractor import YoutubeIE - - -class TestYoutubeSubtitles(unittest.TestCase): -    def setUp(self): -        self.DL = FakeYDL() -        self.url = 'QRS8MkLhQmM' - -    def getInfoDict(self): -        IE = YoutubeIE(self.DL) -        info_dict = IE.extract(self.url) -        return info_dict - -    def getSubtitles(self): -        info_dict = self.getInfoDict() -        return info_dict[0]['subtitles'] - -    def test_youtube_no_writesubtitles(self): -        self.DL.params['writesubtitles'] = False -        subtitles = self.getSubtitles() -        self.assertEqual(subtitles, None) - -    def test_youtube_subtitles(self): -        self.DL.params['writesubtitles'] = True -        subtitles = self.getSubtitles() -        self.assertEqual(md5(subtitles['en']), '4cd9278a35ba2305f47354ee13472260') - -    def test_youtube_subtitles_lang(self): -        self.DL.params['writesubtitles'] = True -        self.DL.params['subtitleslangs'] = ['it'] -        subtitles = self.getSubtitles() -        self.assertEqual(md5(subtitles['it']), '164a51f16f260476a05b50fe4c2f161d') - -    def test_youtube_allsubtitles(self): -        self.DL.params['writesubtitles'] = True -        self.DL.params['allsubtitles'] = True -        subtitles = self.getSubtitles() -        self.assertEqual(len(subtitles.keys()), 13) - -    def test_youtube_subtitles_sbv_format(self): -        self.DL.params['writesubtitles'] = True -        self.DL.params['subtitlesformat'] = 'sbv' -        subtitles = self.getSubtitles() -        self.assertEqual(md5(subtitles['en']), '13aeaa0c245a8bed9a451cb643e3ad8b') - -    def test_youtube_subtitles_vtt_format(self): -        self.DL.params['writesubtitles'] = True -        self.DL.params['subtitlesformat'] = 'vtt' -        subtitles = self.getSubtitles() -        self.assertEqual(md5(subtitles['en']), '356cdc577fde0c6783b9b822e7206ff7') - -    def test_youtube_list_subtitles(self): -        self.DL.expect_warning(u'Video doesn\'t have automatic captions') -        self.DL.params['listsubtitles'] = True -        info_dict = self.getInfoDict() -        self.assertEqual(info_dict, None) - -    def test_youtube_automatic_captions(self): -        self.url = '8YoUxe5ncPo' -        self.DL.params['writeautomaticsub'] = True -        self.DL.params['subtitleslangs'] = ['it'] -        subtitles = self.getSubtitles() -        self.assertTrue(subtitles['it'] is not None) - -    def test_youtube_nosubtitles(self): -        self.DL.expect_warning(u'video doesn\'t have subtitles') -        self.url = 'sAjKT8FhjI8' -        self.DL.params['writesubtitles'] = True -        self.DL.params['allsubtitles'] = True -        subtitles = self.getSubtitles() -        self.assertEqual(len(subtitles), 0) - -    def test_youtube_multiple_langs(self): -        self.url = 'QRS8MkLhQmM' -        self.DL.params['writesubtitles'] = True -        langs = ['it', 'fr', 'de'] -        self.DL.params['subtitleslangs'] = langs -        subtitles = self.getSubtitles() -        for lang in langs: -            self.assertTrue(subtitles.get(lang) is not None, u'Subtitles for \'%s\' not extracted' % lang) - -if __name__ == '__main__': -    unittest.main() diff --git a/youtube_dl/FileDownloader.py b/youtube_dl/FileDownloader.py index 664b78662..84a539b82 100644 --- a/youtube_dl/FileDownloader.py +++ b/youtube_dl/FileDownloader.py @@ -4,12 +4,16 @@ import re  import subprocess  import sys  import time -import traceback -if os.name == 'nt': -    import ctypes - -from .utils import * +from .utils import ( +    compat_urllib_error, +    compat_urllib_request, +    ContentTooShortError, +    determine_ext, +    encodeFilename, +    sanitize_open, +    timeconvert, +)  class FileDownloader(object): @@ -144,16 +148,8 @@ class FileDownloader(object):      def to_stderr(self, message):          self.ydl.to_screen(message) -    def to_cons_title(self, message): -        """Set console/terminal window title to message.""" -        if not self.params.get('consoletitle', False): -            return -        if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow(): -            # c_wchar_p() might not be necessary if `message` is -            # already of type unicode() -            ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message)) -        elif 'TERM' in os.environ: -            self.to_screen('\033]0;%s\007' % message, skip_eol=True) +    def to_console_title(self, message): +        self.ydl.to_console_title(message)      def trouble(self, *args, **kargs):          self.ydl.trouble(*args, **kargs) @@ -194,7 +190,7 @@ class FileDownloader(object):              if old_filename == new_filename:                  return              os.rename(encodeFilename(old_filename), encodeFilename(new_filename)) -        except (IOError, OSError) as err: +        except (IOError, OSError):              self.report_error(u'unable to rename file')      def try_utime(self, filename, last_modified_hdr): @@ -227,8 +223,14 @@ class FileDownloader(object):          if self.params.get('noprogress', False):              return          clear_line = (u'\x1b[K' if sys.stderr.isatty() and os.name != 'nt' else u'') -        eta_str = self.format_eta(eta) -        percent_str = self.format_percent(percent) +        if eta is not None: +            eta_str = self.format_eta(eta) +        else: +            eta_str = 'Unknown ETA' +        if percent is not None: +            percent_str = self.format_percent(percent) +        else: +            percent_str = 'Unknown %'          speed_str = self.format_speed(speed)          if self.params.get('progress_with_newline', False):              self.to_screen(u'[download] %s of %s at %s ETA %s' % @@ -236,7 +238,7 @@ class FileDownloader(object):          else:              self.to_screen(u'\r%s[download] %s of %s at %s ETA %s' %                  (clear_line, percent_str, data_len_str, speed_str, eta_str), skip_eol=True) -        self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' % +        self.to_console_title(u'youtube-dl - %s of %s at %s ETA %s' %                  (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))      def report_resuming_byte(self, resume_len): @@ -251,7 +253,7 @@ class FileDownloader(object):          """Report file has already been fully downloaded."""          try:              self.to_screen(u'[download] %s has already been downloaded' % file_name) -        except (UnicodeEncodeError) as err: +        except UnicodeEncodeError:              self.to_screen(u'[download] The file has already been downloaded')      def report_unable_to_resume(self): @@ -267,7 +269,7 @@ class FileDownloader(object):              self.to_screen(u'\r%s[download] 100%% of %s in %s' %                  (clear_line, data_len_str, self.format_seconds(tot_time))) -    def _download_with_rtmpdump(self, filename, url, player_url, page_url, play_path, tc_url): +    def _download_with_rtmpdump(self, filename, url, player_url, page_url, play_path, tc_url, live):          def run_rtmpdump(args):              start = time.time()              resume_percent = None @@ -348,6 +350,8 @@ class FileDownloader(object):              basic_args += ['--tcUrl', url]          if test:              basic_args += ['--stop', '1'] +        if live: +            basic_args += ['--live']          args = basic_args + [[], ['--resume', '--skip', '1']][self.params.get('continuedl', False)]          if self.params.get('verbose', False):              try: @@ -422,15 +426,20 @@ class FileDownloader(object):          self.report_destination(filename)          tmpfilename = self.temp_name(filename) -        args = ['ffmpeg', '-y', '-i', url, '-f', 'mp4', tmpfilename] -        # Check for ffmpeg first -        try: -            subprocess.call(['ffmpeg', '-h'], stdout=(open(os.path.devnull, 'w')), stderr=subprocess.STDOUT) -        except (OSError, IOError): -            self.report_error(u'm3u8 download detected but "%s" could not be run' % args[0] ) -            return False +        args = ['-y', '-i', url, '-f', 'mp4', '-c', 'copy', +            '-bsf:a', 'aac_adtstoasc', tmpfilename] -        retval = subprocess.call(args) +        for program in ['avconv', 'ffmpeg']: +            try: +                subprocess.call([program, '-version'], stdout=(open(os.path.devnull, 'w')), stderr=subprocess.STDOUT) +                break +            except (OSError, IOError): +                pass +        else: +            self.report_error(u'm3u8 download detected but ffmpeg or avconv could not be found') +        cmd = [program] + args + +        retval = subprocess.call(cmd)          if retval == 0:              fsize = os.path.getsize(encodeFilename(tmpfilename))              self.to_screen(u'\r[%s] %s bytes' % (args[0], fsize)) @@ -467,7 +476,8 @@ class FileDownloader(object):                                                  info_dict.get('player_url', None),                                                  info_dict.get('page_url', None),                                                  info_dict.get('play_path', None), -                                                info_dict.get('tc_url', None)) +                                                info_dict.get('tc_url', None), +                                                info_dict.get('rtmp_live', False))          # Attempt to download using mplayer          if url.startswith('mms') or url.startswith('rtsp'): @@ -606,12 +616,11 @@ class FileDownloader(object):              # Progress message              speed = self.calc_speed(start, time.time(), byte_counter - resume_len)              if data_len is None: -                self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA') -                eta = None +                eta = percent = None              else:                  percent = self.calc_percent(byte_counter, data_len)                  eta = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len) -                self.report_progress(percent, data_len_str, speed, eta) +            self.report_progress(percent, data_len_str, speed, eta)              self._hook_progress({                  'downloaded_bytes': byte_counter, diff --git a/youtube_dl/PostProcessor.py b/youtube_dl/PostProcessor.py index 13b56ede5..69aedf87a 100644 --- a/youtube_dl/PostProcessor.py +++ b/youtube_dl/PostProcessor.py @@ -501,7 +501,7 @@ class FFmpegMetadataPP(FFmpegPostProcessor):          options = ['-c', 'copy']          for (name, value) in metadata.items(): -            options.extend(['-metadata', '%s="%s"' % (name, value)]) +            options.extend(['-metadata', '%s=%s' % (name, value)])          options.extend(['-f', ext])          self._downloader.to_screen(u'[ffmpeg] Adding metadata to \'%s\'' % filename) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 313295839..20eed96ca 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -13,7 +13,34 @@ import sys  import time  import traceback -from .utils import * +if os.name == 'nt': +    import ctypes + +from .utils import ( +    compat_http_client, +    compat_print, +    compat_str, +    compat_urllib_error, +    compat_urllib_request, +    ContentTooShortError, +    date_from_str, +    DateRange, +    determine_ext, +    DownloadError, +    encodeFilename, +    ExtractorError, +    locked_file, +    MaxDownloadsReached, +    PostProcessingError, +    preferredencoding, +    SameFileError, +    sanitize_filename, +    subtitles_filename, +    takewhile_inclusive, +    UnavailableVideoError, +    write_json_file, +    write_string, +)  from .extractor import get_info_extractor, gen_extractors  from .FileDownloader import FileDownloader @@ -176,6 +203,35 @@ class YoutubeDL(object):              output = output.encode(preferredencoding())          sys.stderr.write(output) +    def to_console_title(self, message): +        if not self.params.get('consoletitle', False): +            return +        if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow(): +            # c_wchar_p() might not be necessary if `message` is +            # already of type unicode() +            ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message)) +        elif 'TERM' in os.environ: +            write_string(u'\033]0;%s\007' % message, self._screen_file) + +    def save_console_title(self): +        if not self.params.get('consoletitle', False): +            return +        if 'TERM' in os.environ: +            write_string(u'\033[22t', self._screen_file) + +    def restore_console_title(self): +        if not self.params.get('consoletitle', False): +            return +        if 'TERM' in os.environ: +            write_string(u'\033[23t', self._screen_file) + +    def __enter__(self): +        self.save_console_title() +        return self + +    def __exit__(self, *args): +        self.restore_console_title() +      def fixed_template(self):          """Checks if the output template is fixed."""          return (re.search(u'(?u)%\\(.+?\\)s', self.params['outtmpl']) is None) @@ -254,7 +310,7 @@ class YoutubeDL(object):          """Report file has already been fully downloaded."""          try:              self.to_screen(u'[download] %s has already been downloaded' % file_name) -        except (UnicodeEncodeError) as err: +        except UnicodeEncodeError:              self.to_screen(u'[download] The file has already been downloaded')      def increment_downloads(self): @@ -272,7 +328,7 @@ class YoutubeDL(object):                  autonumber_size = 5              autonumber_templ = u'%0' + str(autonumber_size) + u'd'              template_dict['autonumber'] = autonumber_templ % self._num_downloads -            if template_dict['playlist_index'] is not None: +            if template_dict.get('playlist_index') is not None:                  template_dict['playlist_index'] = u'%05d' % template_dict['playlist_index']              sanitize = lambda k, v: sanitize_filename( @@ -318,6 +374,12 @@ class YoutubeDL(object):                      % info_dict)          return None +    @staticmethod +    def add_extra_info(info_dict, extra_info): +        '''Set the keys from extra_info in info dict if they are missing''' +        for key, value in extra_info.items(): +            info_dict.setdefault(key, value) +      def extract_info(self, url, download=True, ie_key=None, extra_info={}):          '''          Returns a list with a dictionary for each video we find. @@ -344,17 +406,17 @@ class YoutubeDL(object):                      break                  if isinstance(ie_result, list):                      # Backwards compatibility: old IE result format -                    for result in ie_result: -                        result.update(extra_info)                      ie_result = {                          '_type': 'compat_list',                          'entries': ie_result,                      } -                else: -                    ie_result.update(extra_info) -                if 'extractor' not in ie_result: -                    ie_result['extractor'] = ie.IE_NAME -                return self.process_ie_result(ie_result, download=download) +                self.add_extra_info(ie_result, +                    { +                        'extractor': ie.IE_NAME, +                        'webpage_url': url, +                        'extractor_key': ie.ie_key(), +                    }) +                return self.process_ie_result(ie_result, download, extra_info)              except ExtractorError as de: # An error we somewhat expected                  self.report_error(compat_str(de), de.format_traceback())                  break @@ -378,8 +440,8 @@ class YoutubeDL(object):          result_type = ie_result.get('_type', 'video') # If not given we suppose it's a video, support the default old system          if result_type == 'video': -            ie_result.update(extra_info) -            return self.process_video_result(ie_result) +            self.add_extra_info(ie_result, extra_info) +            return self.process_video_result(ie_result, download=download)          elif result_type == 'url':              # We have to add extra_info to the results because it may be              # contained in a playlist @@ -388,6 +450,7 @@ class YoutubeDL(object):                                       ie_key=ie_result.get('ie_key'),                                       extra_info=extra_info)          elif result_type == 'playlist': +            self.add_extra_info(ie_result, extra_info)              # We process each entry in the playlist              playlist = ie_result.get('title', None) or ie_result.get('id', None)              self.to_screen(u'[download] Downloading playlist: %s' % playlist) @@ -413,12 +476,10 @@ class YoutubeDL(object):                  extra = {                      'playlist': playlist,                      'playlist_index': i + playliststart, +                    'extractor': ie_result['extractor'], +                    'webpage_url': ie_result['webpage_url'], +                    'extractor_key': ie_result['extractor_key'],                  } -                if not 'extractor' in entry: -                    # We set the extractor, if it's an url it will be set then to -                    # the new extractor, but if it's already a video we must make -                    # sure it's present: see issue #877 -                    entry['extractor'] = ie_result['extractor']                  entry_result = self.process_ie_result(entry,                                                        download=download,                                                        extra_info=extra) @@ -427,10 +488,15 @@ class YoutubeDL(object):              return ie_result          elif result_type == 'compat_list':              def _fixup(r): -                r.setdefault('extractor', ie_result['extractor']) +                self.add_extra_info(r, +                    { +                        'extractor': ie_result['extractor'], +                        'webpage_url': ie_result['webpage_url'], +                        'extractor_key': ie_result['extractor_key'], +                    })                  return r              ie_result['entries'] = [ -                self.process_ie_result(_fixup(r), download=download) +                self.process_ie_result(_fixup(r), download, extra_info)                  for r in ie_result['entries']              ]              return ie_result @@ -482,7 +548,7 @@ class YoutubeDL(object):                  format['format'] = u'{id} - {res}{note}'.format(                      id=format['format_id'],                      res=self.format_resolution(format), -                    note=u' ({})'.format(format['format_note']) if format.get('format_note') is not None else '', +                    note=u' ({0})'.format(format['format_note']) if format.get('format_note') is not None else '',                  )              # Automatically determine file extension if missing              if 'ext' not in format: @@ -630,7 +696,7 @@ class YoutubeDL(object):              # subtitles download errors are already managed as troubles in relevant IE              # that way it will silently go on when used with unsupporting IE              subtitles = info_dict['subtitles'] -            sub_format = self.params.get('subtitlesformat') +            sub_format = self.params.get('subtitlesformat', 'srt')              for sub_lang in subtitles.keys():                  sub = subtitles[sub_lang]                  if sub is None: @@ -759,6 +825,8 @@ class YoutubeDL(object):      @staticmethod      def format_resolution(format, default='unknown'): +        if format.get('_resolution') is not None: +            return format['_resolution']          if format.get('height') is not None:              if format.get('width') is not None:                  res = u'%sx%s' % (format['width'], format['height']) @@ -769,19 +837,45 @@ class YoutubeDL(object):          return res      def list_formats(self, info_dict): -        formats_s = [] -        for format in info_dict.get('formats', [info_dict]): -            formats_s.append(u'%-15s%-7s     %-15s%s' % ( +        def format_note(fdict): +            if fdict.get('format_note') is not None: +                return fdict['format_note'] +            res = u'' +            if fdict.get('vcodec') is not None: +                res += u'%-5s' % fdict['vcodec'] +            elif fdict.get('vbr') is not None: +                res += u'video' +            if fdict.get('vbr') is not None: +                res += u'@%4dk' % fdict['vbr'] +            if fdict.get('acodec') is not None: +                if res: +                    res += u', ' +                res += u'%-5s' % fdict['acodec'] +            elif fdict.get('abr') is not None: +                if res: +                    res += u', ' +                res += 'audio' +            if fdict.get('abr') is not None: +                res += u'@%3dk' % fdict['abr'] +            return res + +        def line(format): +            return (u'%-20s%-10s%-12s%s' % (                  format['format_id'],                  format['ext'], -                format.get('format_note', ''),                  self.format_resolution(format), +                format_note(format),                  )              ) -        if len(formats_s) != 1: -            formats_s[0] += ' (worst)' -            formats_s[-1] += ' (best)' -        formats_s = "\n".join(formats_s) -        self.to_screen(u'[info] Available formats for %s:\n' -            u'format code    extension   note           resolution\n%s' % ( -                info_dict['id'], formats_s)) + +        formats = info_dict.get('formats', [info_dict]) +        formats_s = list(map(line, formats)) +        if len(formats) > 1: +            formats_s[0] += (' ' if format_note(formats[0]) else '') + '(worst)' +            formats_s[-1] += (' ' if format_note(formats[-1]) else '') + '(best)' + +        header_line = line({ +            'format_id': u'format code', 'ext': u'extension', +            '_resolution': u'resolution', 'format_note': u'note'}) +        self.to_screen(u'[info] Available formats for %s:\n%s\n%s' % +                       (info_dict['id'], header_line, u"\n".join(formats_s))) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 48ffcbf8e..af4c9c5c4 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -32,6 +32,8 @@ __authors__  = (      'Ismael Mejía',      'Steffan \'Ruirize\' James',      'Andras Elso', +    'Jelle van der Waa', +    'Marcin Cieślak',  )  __license__ = 'Public Domain' @@ -349,7 +351,7 @@ def parseOpts(overrideArguments=None):                    'for example with -o \'/my/downloads/%(uploader)s/%(title)s-%(id)s.%(ext)s\' .'))      filesystem.add_option('--autonumber-size',              dest='autonumber_size', metavar='NUMBER', -            help='Specifies the number of digits in %(autonumber)s when it is present in output filename template or --autonumber option is given') +            help='Specifies the number of digits in %(autonumber)s when it is present in output filename template or --auto-number option is given')      filesystem.add_option('--restrict-filenames',              action='store_true', dest='restrictfilenames',              help='Restrict filenames to only ASCII characters, and avoid "&" and spaces in filenames', default=False) @@ -358,7 +360,7 @@ def parseOpts(overrideArguments=None):      filesystem.add_option('-w', '--no-overwrites',              action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)      filesystem.add_option('-c', '--continue', -            action='store_true', dest='continue_dl', help='resume partially downloaded files', default=True) +            action='store_true', dest='continue_dl', help='force resume of partially downloaded files. By default, youtube-dl will resume downloads if possible.', default=True)      filesystem.add_option('--no-continue',              action='store_false', dest='continue_dl',              help='do not resume partially downloaded files (restart from beginning)') @@ -601,8 +603,7 @@ def _real_main(argv=None):                       u' file! Use "%%(ext)s" instead of %r' %                       determine_ext(outtmpl, u'')) -    # YoutubeDL -    ydl = YoutubeDL({ +    ydl_opts = {          'usenetrc': opts.usenetrc,          'username': opts.username,          'password': opts.password, @@ -665,61 +666,63 @@ def _real_main(argv=None):          'youtube_print_sig_code': opts.youtube_print_sig_code,          'age_limit': opts.age_limit,          'download_archive': opts.download_archive, -        }) +    } -    if opts.verbose: -        write_string(u'[debug] youtube-dl version ' + __version__ + u'\n') -        try: -            sp = subprocess.Popen( -                ['git', 'rev-parse', '--short', 'HEAD'], -                stdout=subprocess.PIPE, stderr=subprocess.PIPE, -                cwd=os.path.dirname(os.path.abspath(__file__))) -            out, err = sp.communicate() -            out = out.decode().strip() -            if re.match('[0-9a-f]+', out): -                write_string(u'[debug] Git HEAD: ' + out + u'\n') -        except: +    with YoutubeDL(ydl_opts) as ydl: +        if opts.verbose: +            write_string(u'[debug] youtube-dl version ' + __version__ + u'\n')              try: -                sys.exc_clear() +                sp = subprocess.Popen( +                    ['git', 'rev-parse', '--short', 'HEAD'], +                    stdout=subprocess.PIPE, stderr=subprocess.PIPE, +                    cwd=os.path.dirname(os.path.abspath(__file__))) +                out, err = sp.communicate() +                out = out.decode().strip() +                if re.match('[0-9a-f]+', out): +                    write_string(u'[debug] Git HEAD: ' + out + u'\n')              except: -                pass -        write_string(u'[debug] Python version %s - %s' %(platform.python_version(), platform_name()) + u'\n') - -        proxy_map = {} -        for handler in opener.handlers: -            if hasattr(handler, 'proxies'): -                proxy_map.update(handler.proxies) -        write_string(u'[debug] Proxy map: ' + compat_str(proxy_map) + u'\n') - -    ydl.add_default_info_extractors() - -    # PostProcessors -    # Add the metadata pp first, the other pps will copy it -    if opts.addmetadata: -        ydl.add_post_processor(FFmpegMetadataPP()) -    if opts.extractaudio: -        ydl.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, nopostoverwrites=opts.nopostoverwrites)) -    if opts.recodevideo: -        ydl.add_post_processor(FFmpegVideoConvertor(preferedformat=opts.recodevideo)) -    if opts.embedsubtitles: -        ydl.add_post_processor(FFmpegEmbedSubtitlePP(subtitlesformat=opts.subtitlesformat)) - -    # Update version -    if opts.update_self: -        update_self(ydl.to_screen, opts.verbose) - -    # Maybe do nothing -    if len(all_urls) < 1: -        if not opts.update_self: -            parser.error(u'you must provide at least one URL') -        else: -            sys.exit() +                try: +                    sys.exc_clear() +                except: +                    pass +            write_string(u'[debug] Python version %s - %s' % +                         (platform.python_version(), platform_name()) + u'\n') + +            proxy_map = {} +            for handler in opener.handlers: +                if hasattr(handler, 'proxies'): +                    proxy_map.update(handler.proxies) +            write_string(u'[debug] Proxy map: ' + compat_str(proxy_map) + u'\n') + +        ydl.add_default_info_extractors() + +        # PostProcessors +        # Add the metadata pp first, the other pps will copy it +        if opts.addmetadata: +            ydl.add_post_processor(FFmpegMetadataPP()) +        if opts.extractaudio: +            ydl.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, nopostoverwrites=opts.nopostoverwrites)) +        if opts.recodevideo: +            ydl.add_post_processor(FFmpegVideoConvertor(preferedformat=opts.recodevideo)) +        if opts.embedsubtitles: +            ydl.add_post_processor(FFmpegEmbedSubtitlePP(subtitlesformat=opts.subtitlesformat)) + +        # Update version +        if opts.update_self: +            update_self(ydl.to_screen, opts.verbose) + +        # Maybe do nothing +        if len(all_urls) < 1: +            if not opts.update_self: +                parser.error(u'you must provide at least one URL') +            else: +                sys.exit() -    try: -        retcode = ydl.download(all_urls) -    except MaxDownloadsReached: -        ydl.to_screen(u'--max-download limit reached, aborting.') -        retcode = 101 +        try: +            retcode = ydl.download(all_urls) +        except MaxDownloadsReached: +            ydl.to_screen(u'--max-download limit reached, aborting.') +            retcode = 101      # Dump cookie jar if requested      if opts.cookiefile is not None: diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 0d933986f..2d1e3cdfd 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -9,6 +9,7 @@ from .arte import (      ArteTVFutureIE,  )  from .auengine import AUEngineIE +from .bambuser import BambuserIE, BambuserChannelIE  from .bandcamp import BandcampIE  from .bliptv import BlipTVIE, BlipTVUserIE  from .bloomberg import BloombergIE @@ -37,8 +38,10 @@ from .defense import DefenseGouvFrIE  from .ebaumsworld import EbaumsWorldIE  from .ehow import EHowIE  from .eighttracks import EightTracksIE +from .eitb import EitbIE  from .escapist import EscapistIE  from .exfm import ExfmIE +from .extremetube import ExtremeTubeIE  from .facebook import FacebookIE  from .faz import FazIE  from .fktv import ( @@ -54,6 +57,7 @@ from .francetv import (  )  from .freesound import FreesoundIE  from .funnyordie import FunnyOrDieIE +from .gamekings import GamekingsIE  from .gamespot import GameSpotIE  from .gametrailers import GametrailersIE  from .generic import GenericIE @@ -76,13 +80,15 @@ from .keezmovies import KeezMoviesIE  from .kickstarter import KickStarterIE  from .keek import KeekIE  from .liveleak import LiveLeakIE -from .livestream import LivestreamIE +from .livestream import LivestreamIE, LivestreamOriginalIE  from .metacafe import MetacafeIE  from .metacritic import MetacriticIE  from .mit import TechTVMITIE, MITIE  from .mixcloud import MixcloudIE +from .mofosex import MofosexIE  from .mtv import MTVIE  from .muzu import MuzuTVIE +from .myspace import MySpaceIE  from .myspass import MySpassIE  from .myvideo import MyVideoIE  from .naver import NaverIE @@ -110,7 +116,11 @@ from .slashdot import SlashdotIE  from .slideshare import SlideshareIE  from .sohu import SohuIE  from .soundcloud import SoundcloudIE, SoundcloudSetIE, SoundcloudUserIE -from .southparkstudios import SouthParkStudiosIE +from .southparkstudios import ( +    SouthParkStudiosIE, +    SouthparkDeIE, +) +from .space import SpaceIE  from .spankwire import SpankwireIE  from .spiegel import SpiegelIE  from .stanfordoc import StanfordOpenClassroomIE @@ -128,6 +138,7 @@ from .tube8 import Tube8IE  from .tudou import TudouIE  from .tumblr import TumblrIE  from .tutv import TutvIE +from .tvp import TvpIE  from .unistra import UnistraIE  from .ustream import UstreamIE, UstreamChannelIE  from .vbox7 import Vbox7IE @@ -141,6 +152,7 @@ from .videofyme import VideofyMeIE  from .videopremium import VideoPremiumIE  from .vimeo import VimeoIE, VimeoChannelIE  from .vine import VineIE +from .vk import VKIE  from .wat import WatIE  from .websurg import WeBSurgIE  from .weibo import WeiboIE @@ -149,6 +161,7 @@ from .worldstarhiphop import WorldStarHipHopIE  from .xhamster import XHamsterIE  from .xnxx import XNXXIE  from .xvideos import XVideosIE +from .xtube import XTubeIE  from .yahoo import YahooIE, YahooSearchIE  from .youjizz import YouJizzIE  from .youku import YoukuIE @@ -157,6 +170,7 @@ from .youtube import (      YoutubeIE,      YoutubePlaylistIE,      YoutubeSearchIE, +    YoutubeSearchDateIE,      YoutubeUserIE,      YoutubeChannelIE,      YoutubeShowIE, diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index d39b48951..44d0b5d70 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -10,6 +10,7 @@ from ..utils import (      unified_strdate,      determine_ext,      get_element_by_id, +    compat_str,  )  # There are different sources of video in arte.tv, the extraction process  @@ -68,7 +69,7 @@ class ArteTvIE(InfoExtractor):              lang = mobj.group('lang')              return self._extract_liveweb(url, name, lang) -        if re.search(self._LIVE_URL, video_id) is not None: +        if re.search(self._LIVE_URL, url) is not None:              raise ExtractorError(u'Arte live streams are not yet supported, sorry')              # self.extractLiveStream(url)              # return @@ -114,7 +115,7 @@ class ArteTvIE(InfoExtractor):          event_doc = config_doc.find('event')          url_node = event_doc.find('video').find('urlHd')          if url_node is None: -            url_node = video_doc.find('urlSd') +            url_node = event_doc.find('urlSd')          return {'id': video_id,                  'title': event_doc.find('name%s' % lang.capitalize()).text, @@ -158,7 +159,9 @@ class ArteTVPlus7IE(InfoExtractor):              'thumbnail': player_info.get('programImage') or player_info.get('VTU', {}).get('IUR'),          } -        formats = player_info['VSR'].values() +        all_formats = player_info['VSR'].values() +        # Some formats use the m3u8 protocol +        all_formats = list(filter(lambda f: f.get('videoFormat') != 'M3U8', all_formats))          def _match_lang(f):              if f.get('versionCode') is None:                  return True @@ -170,24 +173,39 @@ class ArteTVPlus7IE(InfoExtractor):              regexes = [r'VO?%s' % l, r'VO?.-ST%s' % l]              return any(re.match(r, f['versionCode']) for r in regexes)          # Some formats may not be in the same language as the url -        formats = filter(_match_lang, formats) -        # Some formats use the m3u8 protocol -        formats = filter(lambda f: f.get('videoFormat') != 'M3U8', formats) -        # We order the formats by quality +        formats = filter(_match_lang, all_formats)          formats = list(formats) # in python3 filter returns an iterator +        if not formats: +            # Some videos are only available in the 'Originalversion' +            # they aren't tagged as being in French or German +            if all(f['versionCode'] == 'VO' for f in all_formats): +                formats = all_formats +            else: +                raise ExtractorError(u'The formats list is empty') +          if re.match(r'[A-Z]Q', formats[0]['quality']) is not None: -            sort_key = lambda f: ['HQ', 'MQ', 'EQ', 'SQ'].index(f['quality']) +            def sort_key(f): +                return ['HQ', 'MQ', 'EQ', 'SQ'].index(f['quality'])          else: -            sort_key = lambda f: int(f.get('height',-1)) +            def sort_key(f): +                return ( +                    # Sort first by quality +                    int(f.get('height',-1)), +                    int(f.get('bitrate',-1)), +                    # The original version with subtitles has lower relevance +                    re.match(r'VO-ST(F|A)', f.get('versionCode', '')) is None, +                    # The version with sourds/mal subtitles has also lower relevance +                    re.match(r'VO?(F|A)-STM\1', f.get('versionCode', '')) is None, +                )          formats = sorted(formats, key=sort_key) -        # Prefer videos without subtitles in the same language -        formats = sorted(formats, key=lambda f: re.match(r'VO(F|A)-STM\1', f.get('versionCode', '')) is None) -        # Pick the best quality          def _format(format_info): -            quality = format_info['quality'] -            m_quality = re.match(r'\w*? - (\d*)p', quality) -            if m_quality is not None: -                quality = m_quality.group(1) +            quality = '' +            height = format_info.get('height') +            if height is not None: +                quality = compat_str(height) +            bitrate = format_info.get('bitrate') +            if bitrate is not None: +                quality += '-%d' % bitrate              if format_info.get('versionCode') is not None:                  format_id = u'%s-%s' % (quality, format_info['versionCode'])              else: @@ -196,7 +214,7 @@ class ArteTVPlus7IE(InfoExtractor):                  'format_id': format_id,                  'format_note': format_info.get('versionLibelle'),                  'width': format_info.get('width'), -                'height': format_info.get('height'), +                'height': height,              }              if format_info['mediaType'] == u'rtmp':                  info['url'] = format_info['streamer'] diff --git a/youtube_dl/extractor/bambuser.py b/youtube_dl/extractor/bambuser.py new file mode 100644 index 000000000..967568c4a --- /dev/null +++ b/youtube_dl/extractor/bambuser.py @@ -0,0 +1,81 @@ +import re +import json +import itertools + +from .common import InfoExtractor +from ..utils import ( +    compat_urllib_request, +) + + +class BambuserIE(InfoExtractor): +    IE_NAME = u'bambuser' +    _VALID_URL = r'https?://bambuser\.com/v/(?P<id>\d+)' +    _API_KEY = '005f64509e19a868399060af746a00aa' + +    _TEST = { +        u'url': u'http://bambuser.com/v/4050584', +        # MD5 seems to be flaky, see https://travis-ci.org/rg3/youtube-dl/jobs/14051016#L388 +        #u'md5': u'fba8f7693e48fd4e8641b3fd5539a641', +        u'info_dict': { +            u'id': u'4050584', +            u'ext': u'flv', +            u'title': u'Education engineering days - lightning talks', +            u'duration': 3741, +            u'uploader': u'pixelversity', +            u'uploader_id': u'344706', +        }, +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('id') +        info_url = ('http://player-c.api.bambuser.com/getVideo.json?' +            '&api_key=%s&vid=%s' % (self._API_KEY, video_id)) +        info_json = self._download_webpage(info_url, video_id) +        info = json.loads(info_json)['result'] + +        return { +            'id': video_id, +            'title': info['title'], +            'url': info['url'], +            'thumbnail': info.get('preview'), +            'duration': int(info['length']), +            'view_count': int(info['views_total']), +            'uploader': info['username'], +            'uploader_id': info['uid'], +        } + + +class BambuserChannelIE(InfoExtractor): +    IE_NAME = u'bambuser:channel' +    _VALID_URL = r'http://bambuser.com/channel/(?P<user>.*?)(?:/|#|\?|$)' +    # The maximum number we can get with each request +    _STEP = 50 + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        user = mobj.group('user') +        urls = [] +        last_id = '' +        for i in itertools.count(1): +            req_url = ('http://bambuser.com/xhr-api/index.php?username={user}' +                '&sort=created&access_mode=0%2C1%2C2&limit={count}' +                '&method=broadcast&format=json&vid_older_than={last}' +                ).format(user=user, count=self._STEP, last=last_id) +            req = compat_urllib_request.Request(req_url) +            # Without setting this header, we wouldn't get any result +            req.add_header('Referer', 'http://bambuser.com/channel/%s' % user) +            info_json = self._download_webpage(req, user, +                u'Downloading page %d' % i) +            results = json.loads(info_json)['result'] +            if len(results) == 0: +                break +            last_id = results[-1]['vid'] +            urls.extend(self.url_result(v['page'], 'Bambuser') for v in results) + +        return { +            '_type': 'playlist', +            'title': user, +            'entries': urls, +        } diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 1392f382a..d8c35465a 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -9,10 +9,13 @@ from ..utils import (      compat_urllib_parse,      find_xpath_attr,      compat_urlparse, +    compat_str, +    compat_urllib_request,      ExtractorError,  ) +  class BrightcoveIE(InfoExtractor):      _VALID_URL = r'https?://.*brightcove\.com/(services|viewer).*\?(?P<query>.*)'      _FEDERATED_URL_TEMPLATE = 'http://c.brightcove.com/services/viewer/htmlFederated?%s' @@ -23,7 +26,7 @@ class BrightcoveIE(InfoExtractor):              # From http://www.8tv.cat/8aldia/videos/xavier-sala-i-martin-aquesta-tarda-a-8-al-dia/              u'url': u'http://c.brightcove.com/services/viewer/htmlFederated?playerID=1654948606001&flashID=myExperience&%40videoPlayer=2371591881001',              u'file': u'2371591881001.mp4', -            u'md5': u'9e80619e0a94663f0bdc849b4566af19', +            u'md5': u'8eccab865181d29ec2958f32a6a754f5',              u'note': u'Test Brightcove downloads and detection in GenericIE',              u'info_dict': {                  u'title': u'Xavier Sala i Martín: “Un banc que no presta és un banc zombi que no serveix per a res”', @@ -41,6 +44,17 @@ class BrightcoveIE(InfoExtractor):                  u'uploader': u'Oracle',              },          }, +        { +            # From http://mashable.com/2013/10/26/thermoelectric-bracelet-lets-you-control-your-body-temperature/ +            u'url': u'http://c.brightcove.com/services/viewer/federated_f9?&playerID=1265504713001&publisherID=AQ%7E%7E%2CAAABBzUwv1E%7E%2CxP-xFHVUstiMFlNYfvF4G9yFnNaqCw_9&videoID=2750934548001', +            u'info_dict': { +                u'id': u'2750934548001', +                u'ext': u'mp4', +                u'title': u'This Bracelet Acts as a Personal Thermostat', +                u'description': u'md5:547b78c64f4112766ccf4e151c20b6a0', +                u'uploader': u'Mashable', +            }, +        },      ]      @classmethod @@ -68,24 +82,48 @@ class BrightcoveIE(InfoExtractor):          videoPlayer = find_xpath_attr(object_doc, './param', 'name', '@videoPlayer')          if videoPlayer is not None:              params['@videoPlayer'] = videoPlayer.attrib['value'] +        linkBase = find_xpath_attr(object_doc, './param', 'name', 'linkBaseURL') +        if linkBase is not None: +            params['linkBaseURL'] = linkBase.attrib['value']          data = compat_urllib_parse.urlencode(params)          return cls._FEDERATED_URL_TEMPLATE % data +    @classmethod +    def _extract_brightcove_url(cls, webpage): +        """Try to extract the brightcove url from the wepbage, returns None +        if it can't be found +        """ +        m_brightcove = re.search( +            r'<object[^>]+?class=([\'"])[^>]*?BrightcoveExperience.*?\1.+?</object>', +            webpage, re.DOTALL) +        if m_brightcove is not None: +            return cls._build_brighcove_url(m_brightcove.group()) +        else: +            return None +      def _real_extract(self, url): +        # Change the 'videoId' and others field to '@videoPlayer' +        url = re.sub(r'(?<=[?&])(videoI(d|D)|bctid)', '%40videoPlayer', url) +        # Change bckey (used by bcove.me urls) to playerKey +        url = re.sub(r'(?<=[?&])bckey', 'playerKey', url)          mobj = re.match(self._VALID_URL, url)          query_str = mobj.group('query')          query = compat_urlparse.parse_qs(query_str)          videoPlayer = query.get('@videoPlayer')          if videoPlayer: -            return self._get_video_info(videoPlayer[0], query_str) +            return self._get_video_info(videoPlayer[0], query_str, query)          else:              player_key = query['playerKey']              return self._get_playlist_info(player_key[0]) -    def _get_video_info(self, video_id, query): -        request_url = self._FEDERATED_URL_TEMPLATE % query -        webpage = self._download_webpage(request_url, video_id) +    def _get_video_info(self, video_id, query_str, query): +        request_url = self._FEDERATED_URL_TEMPLATE % query_str +        req = compat_urllib_request.Request(request_url) +        linkBase = query.get('linkBaseURL') +        if linkBase is not None: +            req.add_header('Referer', linkBase[0]) +        webpage = self._download_webpage(req, video_id)          self.report_extraction(video_id)          info = self._search_regex(r'var experienceJSON = ({.*?});', webpage, 'json') @@ -109,7 +147,7 @@ class BrightcoveIE(InfoExtractor):      def _extract_video_info(self, video_info):          info = { -            'id': video_info['id'], +            'id': compat_str(video_info['id']),              'title': video_info['displayName'],              'description': video_info.get('shortDescription'),              'thumbnail': video_info.get('videoStillURL') or video_info.get('thumbnailURL'), @@ -119,15 +157,14 @@ class BrightcoveIE(InfoExtractor):          renditions = video_info.get('renditions')          if renditions:              renditions = sorted(renditions, key=lambda r: r['size']) -            best_format = renditions[-1] -            info.update({ -                'url': best_format['defaultURL'], -                'ext': 'mp4', -            }) +            info['formats'] = [{ +                'url': rend['defaultURL'], +                'height': rend.get('frameHeight'), +                'width': rend.get('frameWidth'), +            } for rend in renditions]          elif video_info.get('FLVFullLengthURL') is not None:              info.update({                  'url': video_info['FLVFullLengthURL'], -                'ext': 'flv',              })          else:              raise ExtractorError(u'Unable to extract video url for %s' % info['id']) diff --git a/youtube_dl/extractor/canalc2.py b/youtube_dl/extractor/canalc2.py index e7f4fa9fd..3d8d7f9d2 100644 --- a/youtube_dl/extractor/canalc2.py +++ b/youtube_dl/extractor/canalc2.py @@ -6,7 +6,7 @@ from .common import InfoExtractor  class Canalc2IE(InfoExtractor):      IE_NAME = 'canalc2.tv' -    _VALID_URL = r'http://.*?\.canalc2\.tv/video\.asp\?idVideo=(\d+)&voir=oui' +    _VALID_URL = r'http://.*?\.canalc2\.tv/video\.asp\?.*?idVideo=(?P<id>\d+)'      _TEST = {          u'url': u'http://www.canalc2.tv/video.asp?idVideo=12163&voir=oui', @@ -18,7 +18,9 @@ class Canalc2IE(InfoExtractor):      }      def _real_extract(self, url): -        video_id = re.match(self._VALID_URL, url).group(1) +        video_id = re.match(self._VALID_URL, url).group('id') +        # We need to set the voir field for getting the file name +        url = 'http://www.canalc2.tv/video.asp?idVideo=%s&voir=oui' % video_id          webpage = self._download_webpage(url, video_id)          file_name = self._search_regex(              r"so\.addVariable\('file','(.*?)'\);", diff --git a/youtube_dl/extractor/cinemassacre.py b/youtube_dl/extractor/cinemassacre.py index 2fe1033f0..f0d08cebf 100644 --- a/youtube_dl/extractor/cinemassacre.py +++ b/youtube_dl/extractor/cinemassacre.py @@ -41,7 +41,7 @@ class CinemassacreIE(InfoExtractor):          webpage_url = u'http://' + mobj.group('url')          webpage = self._download_webpage(webpage_url, None) # Don't know video id yet          video_date = mobj.group('date_Y') + mobj.group('date_m') + mobj.group('date_d') -        mobj = re.search(r'src="(?P<embed_url>http://player\.screenwavemedia\.com/play/(?:embed|player)\.php\?id=(?:Cinemassacre-)?(?P<video_id>.+?))"', webpage) +        mobj = re.search(r'src="(?P<embed_url>http://player\.screenwavemedia\.com/play/[a-zA-Z]+\.php\?id=(?:Cinemassacre-)?(?P<video_id>.+?))"', webpage)          if not mobj:              raise ExtractorError(u'Can\'t extract embed url and video id')          playerdata_url = mobj.group(u'embed_url') @@ -65,6 +65,7 @@ class CinemassacreIE(InfoExtractor):              {                  'url': url,                  'play_path': 'mp4:' + sd_file, +                'rtmp_live': True, # workaround                  'ext': 'flv',                  'format': 'sd',                  'format_id': 'sd', @@ -72,6 +73,7 @@ class CinemassacreIE(InfoExtractor):              {                  'url': url,                  'play_path': 'mp4:' + hd_file, +                'rtmp_live': True, # workaround                  'ext': 'flv',                  'format': 'hd',                  'format_id': 'hd', diff --git a/youtube_dl/extractor/cnn.py b/youtube_dl/extractor/cnn.py index a79f881cd..34adf6dda 100644 --- a/youtube_dl/extractor/cnn.py +++ b/youtube_dl/extractor/cnn.py @@ -6,7 +6,7 @@ from ..utils import determine_ext  class CNNIE(InfoExtractor): -    _VALID_URL = r'''(?x)https?://(edition\.)?cnn\.com/video/(data/.+?|\?)/ +    _VALID_URL = r'''(?x)https?://((edition|www)\.)?cnn\.com/video/(data/.+?|\?)/          (?P<path>.+?/(?P<title>[^/]+?)(?:\.cnn|(?=&)))'''      _TESTS = [{ diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index ce349fe20..f787d0a3c 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -63,7 +63,7 @@ class InfoExtractor(object):                      * ext       Will be calculated from url if missing                      * format    A human-readable description of the format                                  ("mp4 container with h264/opus"). -                                Calculated from the format_id, width, height  +                                Calculated from the format_id, width, height.                                  and format_note fields if missing.                      * format_id A short description of the format                                  ("mp4_h264_opus" or "19") @@ -71,6 +71,13 @@ class InfoExtractor(object):                                  ("3D" or "DASH video")                      * width     Width of the video, if known                      * height    Height of the video, if known +                    * abr       Average audio bitrate in KBit/s +                    * acodec    Name of the audio codec in use +                    * vbr       Average video bitrate in KBit/s +                    * vcodec    Name of the video codec in use +    webpage_url:    The url to the video webpage, if given to youtube-dl it +                    should allow to get the same result again. (It will be set +                    by YoutubeDL if it's missing)      Unless mentioned otherwise, the fields should be Unicode strings. @@ -312,13 +319,21 @@ class InfoExtractor(object):      # Helper functions for extracting OpenGraph info      @staticmethod -    def _og_regex(prop): -        return r'<meta.+?property=[\'"]og:%s[\'"].+?content=(?:"(.+?)"|\'(.+?)\')' % re.escape(prop) +    def _og_regexes(prop): +        content_re = r'content=(?:"([^>]+?)"|\'(.+?)\')' +        property_re = r'property=[\'"]og:%s[\'"]' % re.escape(prop) +        template = r'<meta[^>]+?%s[^>]+?%s' +        return [ +            template % (property_re, content_re), +            template % (content_re, property_re), +        ]      def _og_search_property(self, prop, html, name=None, **kargs):          if name is None:              name = 'OpenGraph %s' % prop -        escaped = self._search_regex(self._og_regex(prop), html, name, flags=re.DOTALL, **kargs) +        escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs) +        if escaped is None: +            return None          return unescapeHTML(escaped)      def _og_search_thumbnail(self, html, **kargs): @@ -331,8 +346,8 @@ class InfoExtractor(object):          return self._og_search_property('title', html, **kargs)      def _og_search_video_url(self, html, name='video url', secure=True, **kargs): -        regexes = [self._og_regex('video')] -        if secure: regexes.insert(0, self._og_regex('video:secure_url')) +        regexes = self._og_regexes('video') +        if secure: regexes = self._og_regexes('video:secure_url') + regexes          return self._html_search_regex(regexes, html, name, **kargs)      def _rta_search(self, html): diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 4c0488245..71f5e03ee 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -21,6 +21,7 @@ class DailymotionBaseInfoExtractor(InfoExtractor):          """Build a request with the family filter disabled"""          request = compat_urllib_request.Request(url)          request.add_header('Cookie', 'family_filter=off') +        request.add_header('Cookie', 'ff=off')          return request  class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor): @@ -61,6 +62,18 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):              },              u'skip': u'VEVO is only available in some countries',          }, +        # age-restricted video +        { +            u'url': u'http://www.dailymotion.com/video/xyh2zz_leanna-decker-cyber-girl-of-the-year-desires-nude-playboy-plus_redband', +            u'file': u'xyh2zz.mp4', +            u'md5': u'0d667a7b9cebecc3c89ee93099c4159d', +            u'info_dict': { +                u'title': 'Leanna Decker - Cyber Girl Of The Year Desires Nude [Playboy Plus]', +                u'uploader': 'HotWaves1012', +                u'age_limit': 18, +            } + +        }      ]      def _real_extract(self, url): @@ -90,7 +103,8 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):          video_uploader = self._search_regex([r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>',                                               # Looking for official user                                               r'<(?:span|a) .*?rel="author".*?>([^<]+?)</'], -                                            webpage, 'video uploader') +                                            webpage, 'video uploader', fatal=False) +        age_limit = self._rta_search(webpage)          video_upload_date = None          mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage) @@ -127,22 +141,23 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):              raise ExtractorError(u'Unable to extract video URL')          # subtitles -        video_subtitles = self.extract_subtitles(video_id) +        video_subtitles = self.extract_subtitles(video_id, webpage)          if self._downloader.params.get('listsubtitles', False): -            self._list_available_subtitles(video_id) +            self._list_available_subtitles(video_id, webpage)              return -        return [{ +        return {              'id':       video_id,              'formats': formats,              'uploader': video_uploader,              'upload_date':  video_upload_date,              'title':    self._og_search_title(webpage),              'subtitles':    video_subtitles, -            'thumbnail': info['thumbnail_url'] -        }] +            'thumbnail': info['thumbnail_url'], +            'age_limit': age_limit, +        } -    def _get_available_subtitles(self, video_id): +    def _get_available_subtitles(self, video_id, webpage):          try:              sub_list = self._download_webpage(                  'https://api.dailymotion.com/video/%s/subtitles?fields=id,language,url' % video_id, @@ -171,7 +186,7 @@ class DailymotionPlaylistIE(DailymotionBaseInfoExtractor):              webpage = self._download_webpage(request,                                               id, u'Downloading page %s' % pagenum) -            playlist_el = get_element_by_attribute(u'class', u'video_list', webpage) +            playlist_el = get_element_by_attribute(u'class', u'row video_list', webpage)              video_ids.extend(re.findall(r'data-id="(.+?)"', playlist_el))              if re.search(self._MORE_PAGES_INDICATOR, webpage, re.DOTALL) is None: diff --git a/youtube_dl/extractor/depositfiles.py b/youtube_dl/extractor/depositfiles.py index d43348955..2c9fb5f2e 100644 --- a/youtube_dl/extractor/depositfiles.py +++ b/youtube_dl/extractor/depositfiles.py @@ -25,7 +25,7 @@ class DepositFilesIE(InfoExtractor):          url = 'http://depositfiles.com/en/files/' + file_id          # Retrieve file webpage with 'Free download' button pressed -        free_download_indication = { 'gateway_result' : '1' } +        free_download_indication = {'gateway_result' : '1'}          request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))          try:              self.report_download_webpage(file_id) diff --git a/youtube_dl/extractor/eitb.py b/youtube_dl/extractor/eitb.py new file mode 100644 index 000000000..4ba323148 --- /dev/null +++ b/youtube_dl/extractor/eitb.py @@ -0,0 +1,37 @@ +# encoding: utf-8 +import re + +from .common import InfoExtractor +from .brightcove import BrightcoveIE +from ..utils import ExtractorError + + +class EitbIE(InfoExtractor): +    IE_NAME = u'eitb.tv' +    _VALID_URL = r'https?://www\.eitb\.tv/(eu/bideoa|es/video)/[^/]+/(?P<playlist_id>\d+)/(?P<chapter_id>\d+)' + +    _TEST = { +        u'add_ie': ['Brightcove'], +        u'url': u'http://www.eitb.tv/es/video/60-minutos-60-minutos-2013-2014/2677100210001/2743577154001/lasa-y-zabala-30-anos/', +        u'md5': u'edf4436247185adee3ea18ce64c47998', +        u'info_dict': { +            u'id': u'2743577154001', +            u'ext': u'mp4', +            u'title': u'60 minutos (Lasa y Zabala, 30 años)', +            # All videos from eitb has this description in the brightcove info +            u'description': u'.', +            u'uploader': u'Euskal Telebista', +        }, +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        chapter_id = mobj.group('chapter_id') +        webpage = self._download_webpage(url, chapter_id) +        bc_url = BrightcoveIE._extract_brightcove_url(webpage) +        if bc_url is None: +            raise ExtractorError(u'Could not extract the Brightcove url') +        # The BrightcoveExperience object doesn't contain the video id, we set +        # it manually +        bc_url += '&%40videoPlayer={0}'.format(chapter_id) +        return self.url_result(bc_url, BrightcoveIE.ie_key()) diff --git a/youtube_dl/extractor/exfm.py b/youtube_dl/extractor/exfm.py index c74556579..a51d79b08 100644 --- a/youtube_dl/extractor/exfm.py +++ b/youtube_dl/extractor/exfm.py @@ -21,6 +21,7 @@ class ExfmIE(InfoExtractor):                  u'description': u'Test House \"Love Is Not Enough\" (Extended Mix) DeadJournalist Exclusive',              },              u'note': u'Soundcloud song', +            u'skip': u'The site is down too often',          },          {              u'url': u'http://ex.fm/song/wddt8', @@ -30,6 +31,7 @@ class ExfmIE(InfoExtractor):                  u'title': u'Safe and Sound',                  u'uploader': u'Capital Cities',              }, +            u'skip': u'The site is down too often',          },      ] diff --git a/youtube_dl/extractor/extremetube.py b/youtube_dl/extractor/extremetube.py new file mode 100644 index 000000000..1c20e4364 --- /dev/null +++ b/youtube_dl/extractor/extremetube.py @@ -0,0 +1,50 @@ +import os +import re + +from .common import InfoExtractor +from ..utils import ( +    compat_urllib_parse_urlparse, +    compat_urllib_request, +    compat_urllib_parse, +) + +class ExtremeTubeIE(InfoExtractor): +    _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>extremetube\.com/video/.+?(?P<videoid>[0-9]+))(?:[/?&]|$)' +    _TEST = { +        u'url': u'http://www.extremetube.com/video/music-video-14-british-euro-brit-european-cumshots-swallow-652431', +        u'file': u'652431.mp4', +        u'md5': u'1fb9228f5e3332ec8c057d6ac36f33e0', +        u'info_dict': { +            u"title": u"Music Video 14 british euro brit european cumshots swallow", +            u"uploader": u"unknown", +            u"age_limit": 18, +        } +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('videoid') +        url = 'http://www.' + mobj.group('url') + +        req = compat_urllib_request.Request(url) +        req.add_header('Cookie', 'age_verified=1') +        webpage = self._download_webpage(req, video_id) + +        video_title = self._html_search_regex(r'<h1 [^>]*?title="([^"]+)"[^>]*>\1<', webpage, u'title') +        uploader = self._html_search_regex(r'>Posted by:(?=<)(?:\s|<[^>]*>)*(.+?)\|', webpage, u'uploader', fatal=False) +        video_url = compat_urllib_parse.unquote(self._html_search_regex(r'video_url=(.+?)&', webpage, u'video_url')) +        path = compat_urllib_parse_urlparse(video_url).path +        extension = os.path.splitext(path)[1][1:] +        format = path.split('/')[5].split('_')[:2] +        format = "-".join(format) + +        return { +            'id': video_id, +            'title': video_title, +            'uploader': uploader, +            'url': video_url, +            'ext': extension, +            'format': format, +            'format_id': format, +            'age_limit': 18, +        } diff --git a/youtube_dl/extractor/gamekings.py b/youtube_dl/extractor/gamekings.py new file mode 100644 index 000000000..c91669b0e --- /dev/null +++ b/youtube_dl/extractor/gamekings.py @@ -0,0 +1,38 @@ +import re + +from .common import InfoExtractor + + +class GamekingsIE(InfoExtractor): +    _VALID_URL = r'http?://www\.gamekings\.tv/videos/(?P<name>[0-9a-z\-]+)' +    _TEST = { +        u"url": u"http://www.gamekings.tv/videos/phoenix-wright-ace-attorney-dual-destinies-review/", +        u'file': u'20130811.mp4', +        # MD5 is flaky, seems to change regularly +        #u'md5': u'2f32b1f7b80fdc5cb616efb4f387f8a3', +        u'info_dict': { +            u"title": u"Phoenix Wright: Ace Attorney \u2013 Dual Destinies Review", +            u"description": u"Melle en Steven hebben voor de review een week in de rechtbank doorbracht met Phoenix Wright: Ace Attorney - Dual Destinies.", +        } +    } + +    def _real_extract(self, url): + +        mobj = re.match(self._VALID_URL, url) +        name = mobj.group('name') +        webpage = self._download_webpage(url, name) +        video_url = self._og_search_video_url(webpage) + +        video = re.search(r'[0-9]+', video_url) +        video_id = video.group(0) + +        # Todo: add medium format +        video_url = video_url.replace(video_id, 'large/' + video_id) + +        return { +            'id': video_id, +            'ext': 'mp4', +            'url': video_url, +            'title': self._og_search_title(webpage), +            'description': self._og_search_description(webpage), +        } diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 2c8fcf5ae..c7552fddb 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -33,6 +33,7 @@ class GenericIE(InfoExtractor):          },          # embedded vimeo video          { +            u'add_ie': ['Vimeo'],              u'url': u'http://skillsmatter.com/podcast/home/move-semanticsperfect-forwarding-and-rvalue-references',              u'file': u'22444065.mp4',              u'md5': u'2903896e23df39722c33f015af0666e2', @@ -44,6 +45,7 @@ class GenericIE(InfoExtractor):          },          # bandcamp page with custom domain          { +            u'add_ie': ['Bandcamp'],              u'url': u'http://bronyrock.com/track/the-pony-mash',              u'file': u'3235767654.mp3',              u'info_dict': { @@ -52,6 +54,23 @@ class GenericIE(InfoExtractor):              },              u'skip': u'There is a limit of 200 free downloads / month for the test song',          }, +        # embedded brightcove video +        # it also tests brightcove videos that need to set the 'Referer' in the +        # http requests +        { +            u'add_ie': ['Brightcove'], +            u'url': u'http://www.bfmtv.com/video/bfmbusiness/cours-bourse/cours-bourse-l-analyse-technique-154522/', +            u'info_dict': { +                u'id': u'2765128793001', +                u'ext': u'mp4', +                u'title': u'Le cours de bourse : l’analyse technique', +                u'description': u'md5:7e9ad046e968cb2d1114004aba466fd9', +                u'uploader': u'BFM BUSINESS', +            }, +            u'params': { +                u'skip_download': True, +            }, +        },      ]      def report_download_webpage(self, video_id): @@ -144,10 +163,9 @@ class GenericIE(InfoExtractor):          self.report_extraction(video_id)          # Look for BrightCove: -        m_brightcove = re.search(r'<object[^>]+?class=([\'"])[^>]*?BrightcoveExperience.*?\1.+?</object>', webpage, re.DOTALL) -        if m_brightcove is not None: +        bc_url = BrightcoveIE._extract_brightcove_url(webpage) +        if bc_url is not None:              self.to_screen(u'Brightcove video detected.') -            bc_url = BrightcoveIE._build_brighcove_url(m_brightcove.group())              return self.url_result(bc_url, 'Brightcove')          # Look for embedded Vimeo player @@ -160,9 +178,9 @@ class GenericIE(InfoExtractor):          # Look for embedded YouTube player          mobj = re.search( -            r'<iframe[^>]+?src="(https?://(?:www\.)?youtube.com/embed/.+?)"', webpage) +            r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:www\.)?youtube.com/embed/.+?)\1', webpage)          if mobj: -            surl = unescapeHTML(mobj.group(1)) +            surl = unescapeHTML(mobj.group(u'url'))              return self.url_result(surl, 'Youtube')          # Look for Bandcamp pages with custom domain diff --git a/youtube_dl/extractor/hypem.py b/youtube_dl/extractor/hypem.py index ab2b59103..9bd06e7c7 100644 --- a/youtube_dl/extractor/hypem.py +++ b/youtube_dl/extractor/hypem.py @@ -30,7 +30,7 @@ class HypemIE(InfoExtractor):              raise ExtractorError(u'Invalid URL: %s' % url)          track_id = mobj.group(1) -        data = { 'ax': 1, 'ts': time.time() } +        data = {'ax': 1, 'ts': time.time()}          data_encoded = compat_urllib_parse.urlencode(data)          complete_url = url + "?" + data_encoded          request = compat_urllib_request.Request(complete_url) @@ -68,4 +68,4 @@ class HypemIE(InfoExtractor):              'ext':      "mp3",              'title':    title,              'artist':   artist, -        }]
\ No newline at end of file +        }] diff --git a/youtube_dl/extractor/kankan.py b/youtube_dl/extractor/kankan.py index 445d46501..50916f4a6 100644 --- a/youtube_dl/extractor/kankan.py +++ b/youtube_dl/extractor/kankan.py @@ -1,8 +1,10 @@  import re +import hashlib  from .common import InfoExtractor  from ..utils import determine_ext +_md5 = lambda s: hashlib.md5(s.encode('utf-8')).hexdigest()  class KankanIE(InfoExtractor):      _VALID_URL = r'https?://(?:.*?\.)?kankan\.com/.+?/(?P<id>\d+)\.shtml' @@ -30,7 +32,10 @@ class KankanIE(InfoExtractor):                                                   video_id, u'Downloading video url info')          ip = self._search_regex(r'ip:"(.+?)"', video_info_page, u'video url ip')          path = self._search_regex(r'path:"(.+?)"', video_info_page, u'video url path') -        video_url = 'http://%s%s' % (ip, path) +        param1 = self._search_regex(r'param1:(\d+)', video_info_page, u'param1') +        param2 = self._search_regex(r'param2:(\d+)', video_info_page, u'param2') +        key = _md5('xl_mp43651' + param1 + param2) +        video_url = 'http://%s%s?key=%s&key1=%s' % (ip, path, key, param2)          return {'id': video_id,                  'title': title, diff --git a/youtube_dl/extractor/keezmovies.py b/youtube_dl/extractor/keezmovies.py index 5e05900da..29658a7d6 100644 --- a/youtube_dl/extractor/keezmovies.py +++ b/youtube_dl/extractor/keezmovies.py @@ -12,7 +12,7 @@ from ..aes import (  )  class KeezMoviesIE(InfoExtractor): -    _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>keezmovies\.com/video/.+?(?P<videoid>[0-9]+))' +    _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>keezmovies\.com/video/.+?(?P<videoid>[0-9]+))(?:[/?&]|$)'      _TEST = {          u'url': u'http://www.keezmovies.com/video/petite-asian-lady-mai-playing-in-bathtub-1214711',          u'file': u'1214711.mp4', @@ -43,10 +43,10 @@ class KeezMoviesIE(InfoExtractor):          if webpage.find('encrypted=true')!=-1:              password = self._html_search_regex(r'video_title=(.+?)&', webpage, u'password')              video_url = aes_decrypt_text(video_url, password, 32).decode('utf-8') -        path = compat_urllib_parse_urlparse( video_url ).path -        extension = os.path.splitext( path )[1][1:] +        path = compat_urllib_parse_urlparse(video_url).path +        extension = os.path.splitext(path)[1][1:]          format = path.split('/')[4].split('_')[:2] -        format = "-".join( format ) +        format = "-".join(format)          age_limit = self._rta_search(webpage) diff --git a/youtube_dl/extractor/livestream.py b/youtube_dl/extractor/livestream.py index d04da98c8..1a3e0ae6b 100644 --- a/youtube_dl/extractor/livestream.py +++ b/youtube_dl/extractor/livestream.py @@ -1,16 +1,19 @@  import re  import json +import xml.etree.ElementTree  from .common import InfoExtractor  from ..utils import (      compat_urllib_parse_urlparse,      compat_urlparse,      get_meta_content, +    xpath_with_ns,      ExtractorError,  )  class LivestreamIE(InfoExtractor): +    IE_NAME = u'livestream'      _VALID_URL = r'http://new.livestream.com/.*?/(?P<event_name>.*?)(/videos/(?P<id>\d+))?/?$'      _TEST = {          u'url': u'http://new.livestream.com/CoheedandCambria/WebsterHall/videos/4719370', @@ -40,13 +43,9 @@ class LivestreamIE(InfoExtractor):          if video_id is None:              # This is an event page: -            player = get_meta_content('twitter:player', webpage) -            if player is None: -                raise ExtractorError('Couldn\'t extract event api url') -            api_url = player.replace('/player', '') -            api_url = re.sub(r'^(https?://)(new\.)', r'\1api.\2', api_url) -            info = json.loads(self._download_webpage(api_url, event_name, -                                                     u'Downloading event info')) +            config_json = self._search_regex(r'window.config = ({.*?});', +                webpage, u'window config') +            info = json.loads(config_json)['event']              videos = [self._extract_video_info(video_data['data'])                  for video_data in info['feed']['data'] if video_data['type'] == u'video']              return self.playlist_result(videos, info['id'], info['full_name']) @@ -58,3 +57,44 @@ class LivestreamIE(InfoExtractor):              info = json.loads(self._download_webpage(api_url, video_id,                                                       u'Downloading video info'))              return self._extract_video_info(info) + + +# The original version of Livestream uses a different system +class LivestreamOriginalIE(InfoExtractor): +    IE_NAME = u'livestream:original' +    _VALID_URL = r'https?://www\.livestream\.com/(?P<user>[^/]+)/video\?.*?clipId=(?P<id>.*?)(&|$)' +    _TEST = { +        u'url': u'http://www.livestream.com/dealbook/video?clipId=pla_8aa4a3f1-ba15-46a4-893b-902210e138fb', +        u'info_dict': { +            u'id': u'pla_8aa4a3f1-ba15-46a4-893b-902210e138fb', +            u'ext': u'flv', +            u'title': u'Spark 1 (BitCoin) with Cameron Winklevoss & Tyler Winklevoss of Winklevoss Capital', +        }, +        u'params': { +            # rtmp +            u'skip_download': True, +        }, +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('id') +        user = mobj.group('user') +        api_url = 'http://x{0}x.api.channel.livestream.com/2.0/clipdetails?extendedInfo=true&id={1}'.format(user, video_id) + +        api_response = self._download_webpage(api_url, video_id) +        info = xml.etree.ElementTree.fromstring(api_response.encode('utf-8')) +        item = info.find('channel').find('item') +        ns = {'media': 'http://search.yahoo.com/mrss'} +        thumbnail_url = item.find(xpath_with_ns('media:thumbnail', ns)).attrib['url'] +        # Remove the extension and number from the path (like 1.jpg) +        path = self._search_regex(r'(user-files/.+)_.*?\.jpg$', thumbnail_url, u'path') + +        return { +            'id': video_id, +            'title': item.find('title').text, +            'url': 'rtmp://extondemand.livestream.com/ondemand', +            'play_path': 'mp4:trans/dv15/mogulus-{0}.mp4'.format(path), +            'ext': 'flv', +            'thumbnail': thumbnail_url, +        } diff --git a/youtube_dl/extractor/metacafe.py b/youtube_dl/extractor/metacafe.py index 234b9e80f..91480ba87 100644 --- a/youtube_dl/extractor/metacafe.py +++ b/youtube_dl/extractor/metacafe.py @@ -20,7 +20,9 @@ class MetacafeIE(InfoExtractor):      _DISCLAIMER = 'http://www.metacafe.com/family_filter/'      _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'      IE_NAME = u'metacafe' -    _TESTS = [{ +    _TESTS = [ +    # Youtube video +    {          u"add_ie": ["Youtube"],          u"url":  u"http://metacafe.com/watch/yt-_aUehQsCQtM/the_electric_company_short_i_pbs_kids_go/",          u"file":  u"_aUehQsCQtM.mp4", @@ -32,15 +34,42 @@ class MetacafeIE(InfoExtractor):              u"uploader_id": u"PBS"          }      }, +    # Normal metacafe video +    { +        u'url': u'http://www.metacafe.com/watch/11121940/news_stuff_you_wont_do_with_your_playstation_4/', +        u'md5': u'6e0bca200eaad2552e6915ed6fd4d9ad', +        u'info_dict': { +            u'id': u'11121940', +            u'ext': u'mp4', +            u'title': u'News: Stuff You Won\'t Do with Your PlayStation 4', +            u'uploader': u'ign', +            u'description': u'Sony released a massive FAQ on the PlayStation Blog detailing the PS4\'s capabilities and limitations.', +        }, +    }, +    # AnyClip video      {          u"url": u"http://www.metacafe.com/watch/an-dVVXnuY7Jh77J/the_andromeda_strain_1971_stop_the_bomb_part_3/",          u"file": u"an-dVVXnuY7Jh77J.mp4",          u"info_dict": {              u"title": u"The Andromeda Strain (1971): Stop the Bomb Part 3",              u"uploader": u"anyclip", -            u"description": u"md5:38c711dd98f5bb87acf973d573442e67" -        } -    }] +            u"description": u"md5:38c711dd98f5bb87acf973d573442e67", +        }, +    }, +    # age-restricted video +    { +        u'url': u'http://www.metacafe.com/watch/5186653/bbc_internal_christmas_tape_79_uncensored_outtakes_etc/', +        u'md5': u'98dde7c1a35d02178e8ab7560fe8bd09', +        u'info_dict': { +            u'id': u'5186653', +            u'ext': u'mp4', +            u'title': u'BBC INTERNAL Christmas Tape \'79 - UNCENSORED Outtakes, Etc.', +            u'uploader': u'Dwayne Pipe', +            u'description': u'md5:950bf4c581e2c059911fa3ffbe377e4b', +            u'age_limit': 18, +        }, +    }, +    ]      def report_disclaimer(self): @@ -62,6 +91,7 @@ class MetacafeIE(InfoExtractor):              'submit': "Continue - I'm over 18",              }          request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form)) +        request.add_header('Content-Type', 'application/x-www-form-urlencoded')          try:              self.report_age_confirmation()              compat_urllib_request.urlopen(request).read() @@ -83,7 +113,12 @@ class MetacafeIE(InfoExtractor):          # Retrieve video webpage to extract further information          req = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id) -        req.headers['Cookie'] = 'flashVersion=0;' + +        # AnyClip videos require the flashversion cookie so that we get the link +        # to the mp4 file +        mobj_an = re.match(r'^an-(.*?)$', video_id) +        if mobj_an: +            req.headers['Cookie'] = 'flashVersion=0;'          webpage = self._download_webpage(req, video_id)          # Extract URL, uploader and title from webpage @@ -125,6 +160,11 @@ class MetacafeIE(InfoExtractor):                  r'submitter=(.*?);|googletag\.pubads\(\)\.setTargeting\("(?:channel|submiter)","([^"]+)"\);',                  webpage, u'uploader nickname', fatal=False) +        if re.search(r'"contentRating":"restricted"', webpage) is not None: +            age_limit = 18 +        else: +            age_limit = 0 +          return {              '_type':    'video',              'id':       video_id, @@ -134,4 +174,5 @@ class MetacafeIE(InfoExtractor):              'upload_date':  None,              'title':    video_title,              'ext':      video_ext, +            'age_limit': age_limit,          } diff --git a/youtube_dl/extractor/mofosex.py b/youtube_dl/extractor/mofosex.py new file mode 100644 index 000000000..b9430b09b --- /dev/null +++ b/youtube_dl/extractor/mofosex.py @@ -0,0 +1,49 @@ +import os +import re + +from .common import InfoExtractor +from ..utils import ( +    compat_urllib_parse_urlparse, +    compat_urllib_request, +    compat_urllib_parse, +) + +class MofosexIE(InfoExtractor): +    _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>mofosex\.com/videos/(?P<videoid>[0-9]+)/.*?\.html)' +    _TEST = { +        u'url': u'http://www.mofosex.com/videos/5018/japanese-teen-music-video.html', +        u'file': u'5018.mp4', +        u'md5': u'1b2eb47ac33cc75d4a80e3026b613c5a', +        u'info_dict': { +            u"title": u"Japanese Teen Music Video", +            u"age_limit": 18, +        } +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('videoid') +        url = 'http://www.' + mobj.group('url') + +        req = compat_urllib_request.Request(url) +        req.add_header('Cookie', 'age_verified=1') +        webpage = self._download_webpage(req, video_id) + +        video_title = self._html_search_regex(r'<h1>(.+?)<', webpage, u'title') +        video_url = compat_urllib_parse.unquote(self._html_search_regex(r'flashvars.video_url = \'([^\']+)', webpage, u'video_url')) +        path = compat_urllib_parse_urlparse(video_url).path +        extension = os.path.splitext(path)[1][1:] +        format = path.split('/')[5].split('_')[:2] +        format = "-".join(format) + +        age_limit = self._rta_search(webpage) + +        return { +            'id': video_id, +            'title': video_title, +            'url': video_url, +            'ext': extension, +            'format': format, +            'format_id': format, +            'age_limit': age_limit, +        } diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index e520e2bb4..3df7f9b85 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -26,6 +26,7 @@ class MTVIE(InfoExtractor):              },          },          { +            u'add_ie': ['Vevo'],              u'url': u'http://www.mtv.com/videos/taylor-swift/916187/everything-has-changed-ft-ed-sheeran.jhtml',              u'file': u'USCJY1331283.mp4',              u'md5': u'73b4e7fcadd88929292fe52c3ced8caf', @@ -47,7 +48,7 @@ class MTVIE(InfoExtractor):      def _transform_rtmp_url(rtmp_video_url):          m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp\..+?/.*)$', rtmp_video_url)          if not m: -            raise ExtractorError(u'Cannot transform RTMP url') +            return rtmp_video_url          base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'          return base + m.group('finalid') @@ -80,6 +81,8 @@ class MTVIE(InfoExtractor):          video_id = self._id_from_uri(uri)          self.report_extraction(video_id)          mediagen_url = itemdoc.find('%s/%s' % (_media_xml_tag('group'), _media_xml_tag('content'))).attrib['url'] +        # Remove the templates, like &device={device} +        mediagen_url = re.sub(r'&[^=]*?={.*?}(?=(&|$))', u'', mediagen_url)          if 'acceptMethods' not in mediagen_url:              mediagen_url += '&acceptMethods=fms'          mediagen_page = self._download_webpage(mediagen_url, video_id, diff --git a/youtube_dl/extractor/myspace.py b/youtube_dl/extractor/myspace.py new file mode 100644 index 000000000..050f54a5a --- /dev/null +++ b/youtube_dl/extractor/myspace.py @@ -0,0 +1,48 @@ +import re +import json + +from .common import InfoExtractor +from ..utils import ( +    compat_str, +) + + +class MySpaceIE(InfoExtractor): +    _VALID_URL = r'https?://myspace\.com/([^/]+)/video/[^/]+/(?P<id>\d+)' + +    _TEST = { +        u'url': u'https://myspace.com/coldplay/video/viva-la-vida/100008689', +        u'info_dict': { +            u'id': u'100008689', +            u'ext': u'flv', +            u'title': u'Viva La Vida', +            u'description': u'The official Viva La Vida video, directed by Hype Williams', +            u'uploader': u'Coldplay', +            u'uploader_id': u'coldplay', +        }, +        u'params': { +            # rtmp download +            u'skip_download': True, +        }, +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('id') +        webpage = self._download_webpage(url, video_id) +        context = json.loads(self._search_regex(r'context = ({.*?});', webpage, +            u'context')) +        video = context['video'] +        rtmp_url, play_path = video['streamUrl'].split(';', 1) + +        return { +            'id': compat_str(video['mediaId']), +            'title': video['title'], +            'url': rtmp_url, +            'play_path': play_path, +            'ext': 'flv', +            'description': video['description'], +            'thumbnail': video['imageUrl'], +            'uploader': video['artistName'], +            'uploader_id': video['artistUsername'], +        } diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 5e2454f1b..75cf4bb9f 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -47,10 +47,10 @@ class PornHubIE(InfoExtractor):          formats = []          for video_url in video_urls: -            path = compat_urllib_parse_urlparse( video_url ).path -            extension = os.path.splitext( path )[1][1:] +            path = compat_urllib_parse_urlparse(video_url).path +            extension = os.path.splitext(path)[1][1:]              format = path.split('/')[5].split('_')[:2] -            format = "-".join( format ) +            format = "-".join(format)              formats.append({                  'url': video_url,                  'ext': extension, diff --git a/youtube_dl/extractor/redtube.py b/youtube_dl/extractor/redtube.py index 994778e16..3bbda128e 100644 --- a/youtube_dl/extractor/redtube.py +++ b/youtube_dl/extractor/redtube.py @@ -8,7 +8,9 @@ class RedTubeIE(InfoExtractor):      _TEST = {          u'url': u'http://www.redtube.com/66418',          u'file': u'66418.mp4', -        u'md5': u'7b8c22b5e7098a3e1c09709df1126d2d', +        # md5 varies from time to time, as in +        # https://travis-ci.org/rg3/youtube-dl/jobs/14052463#L295 +        #u'md5': u'7b8c22b5e7098a3e1c09709df1126d2d',          u'info_dict': {              u"title": u"Sucked on a toilet",              u"age_limit": 18, diff --git a/youtube_dl/extractor/rtlnow.py b/youtube_dl/extractor/rtlnow.py index 9ac7c3be8..2f238de35 100644 --- a/youtube_dl/extractor/rtlnow.py +++ b/youtube_dl/extractor/rtlnow.py @@ -63,18 +63,6 @@ class RTLnowIE(InfoExtractor):          },      },      { -        u'url': u'http://www.rtlnitronow.de/recht-ordnung/stadtpolizei-frankfurt-gerichtsvollzieher-leipzig.php?film_id=129679&player=1&season=1', -        u'file': u'129679.flv', -        u'info_dict': { -            u'upload_date': u'20131016',  -            u'title': u'Recht & Ordnung - Stadtpolizei Frankfurt/ Gerichtsvollzieher...', -            u'description': u'Stadtpolizei Frankfurt/ Gerichtsvollzieher Leipzig', -        }, -        u'params': { -            u'skip_download': True, -        }, -    }, -    {          u'url': u'http://www.n-tvnow.de/top-gear/episode-1-2013-01-01-00-00-00.php?film_id=124903&player=1&season=10',          u'file': u'124903.flv',          u'info_dict': { diff --git a/youtube_dl/extractor/slashdot.py b/youtube_dl/extractor/slashdot.py index 2cba53076..f5003c7f9 100644 --- a/youtube_dl/extractor/slashdot.py +++ b/youtube_dl/extractor/slashdot.py @@ -7,6 +7,7 @@ class SlashdotIE(InfoExtractor):      _VALID_URL = r'https?://tv.slashdot.org/video/\?embed=(?P<id>.*?)(&|$)'      _TEST = { +        u'add_ie': ['Ooyala'],          u'url': u'http://tv.slashdot.org/video/?embed=JscHMzZDplD0p-yNLOzTfzC3Q3xzJaUz',          u'file': u'JscHMzZDplD0p-yNLOzTfzC3Q3xzJaUz.mp4',          u'md5': u'd2222e7a4a4c1541b3e0cf732fb26735', diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 29cd5617c..83e1f055f 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -29,17 +29,34 @@ class SoundcloudIE(InfoExtractor):                      )                      '''      IE_NAME = u'soundcloud' -    _TEST = { -        u'url': u'http://soundcloud.com/ethmusic/lostin-powers-she-so-heavy', -        u'file': u'62986583.mp3', -        u'md5': u'ebef0a451b909710ed1d7787dddbf0d7', -        u'info_dict': { -            u"upload_date": u"20121011",  -            u"description": u"No Downloads untill we record the finished version this weekend, i was too pumped n i had to post it , earl is prolly gonna b hella p.o'd",  -            u"uploader": u"E.T. ExTerrestrial Music",  -            u"title": u"Lostin Powers - She so Heavy (SneakPreview) Adrian Ackers Blueprint 1" -        } -    } +    _TESTS = [ +        { +            u'url': u'http://soundcloud.com/ethmusic/lostin-powers-she-so-heavy', +            u'file': u'62986583.mp3', +            u'md5': u'ebef0a451b909710ed1d7787dddbf0d7', +            u'info_dict': { +                u"upload_date": u"20121011",  +                u"description": u"No Downloads untill we record the finished version this weekend, i was too pumped n i had to post it , earl is prolly gonna b hella p.o'd",  +                u"uploader": u"E.T. ExTerrestrial Music",  +                u"title": u"Lostin Powers - She so Heavy (SneakPreview) Adrian Ackers Blueprint 1" +            } +        }, +        # not streamable song +        { +            u'url': u'https://soundcloud.com/the-concept-band/goldrushed-mastered?in=the-concept-band/sets/the-royal-concept-ep', +            u'info_dict': { +                u'id': u'47127627', +                u'ext': u'mp3', +                u'title': u'Goldrushed', +                u'uploader': u'The Royal Concept', +                u'upload_date': u'20120521', +            }, +            u'params': { +                # rtmp +                u'skip_download': True, +            }, +        }, +    ]      _CLIENT_ID = 'b45b1aa10f1ac2941910a7f0d10f8e28' @@ -56,24 +73,39 @@ class SoundcloudIE(InfoExtractor):          return 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=' + cls._CLIENT_ID      def _extract_info_dict(self, info, full_title=None, quiet=False): -        video_id = info['id'] -        name = full_title or video_id +        track_id = compat_str(info['id']) +        name = full_title or track_id          if quiet == False:              self.report_extraction(name)          thumbnail = info['artwork_url']          if thumbnail is not None:              thumbnail = thumbnail.replace('-large', '-t500x500') -        return { -            'id':       info['id'], +        result = { +            'id':       track_id,              'url':      info['stream_url'] + '?client_id=' + self._CLIENT_ID,              'uploader': info['user']['username'],              'upload_date': unified_strdate(info['created_at']),              'title':    info['title'], -            'ext':      u'mp3', +            'ext':      info.get('original_format', u'mp3'),              'description': info['description'],              'thumbnail': thumbnail,          } +        if info.get('downloadable', False): +            result['url'] = 'https://api.soundcloud.com/tracks/{0}/download?client_id={1}'.format(track_id, self._CLIENT_ID) +        if not info.get('streamable', False): +            # We have to get the rtmp url +            stream_json = self._download_webpage( +                'http://api.soundcloud.com/i1/tracks/{0}/streams?client_id={1}'.format(track_id, self._CLIENT_ID), +                track_id, u'Downloading track url') +            rtmp_url = json.loads(stream_json)['rtmp_mp3_128_url'] +            # The url doesn't have an rtmp app, we have to extract the playpath +            url, path = rtmp_url.split('mp3:', 1) +            result.update({ +                'url': url, +                'play_path': 'mp3:' + path, +            }) +        return result      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url, flags=re.VERBOSE) @@ -106,70 +138,8 @@ class SoundcloudIE(InfoExtractor):  class SoundcloudSetIE(SoundcloudIE):      _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)(?:[?].*)?$'      IE_NAME = u'soundcloud:set' -    _TEST = { -        u"url":"https://soundcloud.com/the-concept-band/sets/the-royal-concept-ep", -        u"playlist": [ -            { -                u"file":"30510138.mp3", -                u"md5":"f9136bf103901728f29e419d2c70f55d", -                u"info_dict": { -                    u"upload_date": u"20111213", -                    u"description": u"The Royal Concept from Stockholm\r\nFilip / Povel / David / Magnus\r\nwww.royalconceptband.com", -                    u"uploader": u"The Royal Concept", -                    u"title": u"D-D-Dance" -                } -            }, -            { -                u"file":"47127625.mp3", -                u"md5":"09b6758a018470570f8fd423c9453dd8", -                u"info_dict": { -                    u"upload_date": u"20120521", -                    u"description": u"The Royal Concept from Stockholm\r\nFilip / Povel / David / Magnus\r\nwww.royalconceptband.com", -                    u"uploader": u"The Royal Concept", -                    u"title": u"The Royal Concept - Gimme Twice" -                } -            }, -            { -                u"file":"47127627.mp3", -                u"md5":"154abd4e418cea19c3b901f1e1306d9c", -                u"info_dict": { -                    u"upload_date": u"20120521", -                    u"uploader": u"The Royal Concept", -                    u"title": u"Goldrushed" -                } -            }, -            { -                u"file":"47127629.mp3", -                u"md5":"2f5471edc79ad3f33a683153e96a79c1", -                u"info_dict": { -                    u"upload_date": u"20120521", -                    u"description": u"The Royal Concept from Stockholm\r\nFilip / Povel / David / Magnus\r\nwww.royalconceptband.com", -                    u"uploader": u"The Royal Concept", -                    u"title": u"In the End" -                } -            }, -            { -                u"file":"47127631.mp3", -                u"md5":"f9ba87aa940af7213f98949254f1c6e2", -                u"info_dict": { -                    u"upload_date": u"20120521", -                    u"description": u"The Royal Concept from Stockholm\r\nFilip / David / Povel / Magnus\r\nwww.theroyalconceptband.com", -                    u"uploader": u"The Royal Concept", -                    u"title": u"Knocked Up" -                } -            }, -            { -                u"file":"75206121.mp3", -                u"md5":"f9d1fe9406717e302980c30de4af9353", -                u"info_dict": { -                    u"upload_date": u"20130116", -                    u"description": u"The unreleased track World on Fire premiered on the CW's hit show Arrow (8pm/7pm central).  \r\nAs a gift to our fans we would like to offer you a free download of the track!  ", -                    u"uploader": u"The Royal Concept", -                    u"title": u"World On Fire" -                } -            } -        ] -    } +    # it's in tests/test_playlists.py +    _TESTS = []      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url) @@ -208,7 +178,7 @@ class SoundcloudUserIE(SoundcloudIE):      IE_NAME = u'soundcloud:user'      # it's in tests/test_playlists.py -    _TEST = None +    _TESTS = []      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url) diff --git a/youtube_dl/extractor/southparkstudios.py b/youtube_dl/extractor/southparkstudios.py index b1e96b679..a711531e6 100644 --- a/youtube_dl/extractor/southparkstudios.py +++ b/youtube_dl/extractor/southparkstudios.py @@ -5,21 +5,19 @@ from .mtv import MTVIE, _media_xml_tag  class SouthParkStudiosIE(MTVIE):      IE_NAME = u'southparkstudios.com' -    _VALID_URL = r'https?://www\.southparkstudios\.com/(clips|full-episodes)/(?P<id>.+?)(\?|#|$)' +    _VALID_URL = r'(https?://)?(www\.)?(?P<url>southparkstudios\.com/(clips|full-episodes)/(?P<id>.+?)(\?|#|$))'      _FEED_URL = 'http://www.southparkstudios.com/feeds/video-player/mrss' -    _TEST = { +    # Overwrite MTVIE properties we don't want +    _TESTS = [{          u'url': u'http://www.southparkstudios.com/clips/104437/bat-daded#tab=featured',          u'file': u'a7bff6c2-ed00-11e0-aca6-0026b9414f30.mp4',          u'info_dict': {              u'title': u'Bat Daded',              u'description': u'Randy disqualifies South Park by getting into a fight with Bat Dad.',          }, -    } - -    # Overwrite MTVIE properties we don't want -    _TESTS = [] +    }]      def _get_thumbnail_url(self, uri, itemdoc):          search_path = '%s/%s' % (_media_xml_tag('group'), _media_xml_tag('thumbnail')) @@ -31,8 +29,23 @@ class SouthParkStudiosIE(MTVIE):      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url) +        url = u'http://www.' + mobj.group(u'url')          video_id = mobj.group('id')          webpage = self._download_webpage(url, video_id)          mgid = self._search_regex(r'swfobject.embedSWF\(".*?(mgid:.*?)"',                                    webpage, u'mgid')          return self._get_videos_info(mgid) + +class SouthparkDeIE(SouthParkStudiosIE): +    IE_NAME = u'southpark.de' +    _VALID_URL = r'(https?://)?(www\.)?(?P<url>southpark\.de/(clips|alle-episoden)/(?P<id>.+?)(\?|#|$))' +    _FEED_URL = 'http://www.southpark.de/feeds/video-player/mrss/' + +    _TESTS = [{ +        u'url': u'http://www.southpark.de/clips/uygssh/the-government-wont-respect-my-privacy#tab=featured', +        u'file': u'85487c96-b3b9-4e39-9127-ad88583d9bf2.mp4', +        u'info_dict': { +            u'title': u'The Government Won\'t Respect My Privacy', +            u'description': u'Cartman explains the benefits of "Shitter" to Stan, Kyle and Craig.', +        }, +    }] diff --git a/youtube_dl/extractor/space.py b/youtube_dl/extractor/space.py new file mode 100644 index 000000000..0d32a0688 --- /dev/null +++ b/youtube_dl/extractor/space.py @@ -0,0 +1,35 @@ +import re + +from .common import InfoExtractor +from .brightcove import BrightcoveIE +from ..utils import RegexNotFoundError, ExtractorError + + +class SpaceIE(InfoExtractor): +    _VALID_URL = r'https?://www\.space\.com/\d+-(?P<title>[^/\.\?]*?)-video.html' +    _TEST = { +        u'add_ie': ['Brightcove'], +        u'url': u'http://www.space.com/23373-huge-martian-landforms-detail-revealed-by-european-probe-video.html', +        u'info_dict': { +            u'id': u'2780937028001', +            u'ext': u'mp4', +            u'title': u'Huge Martian Landforms\' Detail Revealed By European Probe | Video', +            u'description': u'md5:db81cf7f3122f95ed234b631a6ea1e61', +            u'uploader': u'TechMedia Networks', +        }, +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        title = mobj.group('title') +        webpage = self._download_webpage(url, title) +        try: +            # Some videos require the playerKey field, which isn't define in +            # the BrightcoveExperience object +            brightcove_url = self._og_search_video_url(webpage) +        except RegexNotFoundError: +            # Other videos works fine with the info from the object +            brightcove_url = BrightcoveIE._extract_brightcove_url(webpage) +        if brightcove_url is None: +            raise ExtractorError(u'The webpage does not contain a video', expected=True) +        return self.url_result(brightcove_url, BrightcoveIE.ie_key()) diff --git a/youtube_dl/extractor/spankwire.py b/youtube_dl/extractor/spankwire.py index 32df0a7fb..97f9c268a 100644 --- a/youtube_dl/extractor/spankwire.py +++ b/youtube_dl/extractor/spankwire.py @@ -49,10 +49,10 @@ class SpankwireIE(InfoExtractor):          formats = []          for video_url in video_urls: -            path = compat_urllib_parse_urlparse( video_url ).path -            extension = os.path.splitext( path )[1][1:] +            path = compat_urllib_parse_urlparse(video_url).path +            extension = os.path.splitext(path)[1][1:]              format = path.split('/')[4].split('_')[:2] -            format = "-".join( format ) +            format = "-".join(format)              formats.append({                  'url': video_url,                  'ext': extension, diff --git a/youtube_dl/extractor/spiegel.py b/youtube_dl/extractor/spiegel.py index 13c86401c..6dc2eda6d 100644 --- a/youtube_dl/extractor/spiegel.py +++ b/youtube_dl/extractor/spiegel.py @@ -2,18 +2,27 @@ import re  import xml.etree.ElementTree  from .common import InfoExtractor +from ..utils import determine_ext  class SpiegelIE(InfoExtractor):      _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$' -    _TEST = { +    _TESTS = [{          u'url': u'http://www.spiegel.de/video/vulkan-tungurahua-in-ecuador-ist-wieder-aktiv-video-1259285.html',          u'file': u'1259285.mp4',          u'md5': u'2c2754212136f35fb4b19767d242f66e',          u'info_dict': {              u"title": u"Vulkanausbruch in Ecuador: Der \"Feuerschlund\" ist wieder aktiv"          } -    } +    }, +    { +        u'url': u'http://www.spiegel.de/video/schach-wm-videoanalyse-des-fuenften-spiels-video-1309159.html', +        u'file': u'1309159.mp4', +        u'md5': u'f2cdf638d7aa47654e251e1aee360af1', +        u'info_dict': { +            u'title': u'Schach-WM in der Videoanalyse: Carlsen nutzt die Fehlgriffe des Titelverteidigers' +        } +    }]      def _real_extract(self, url):          m = re.match(self._VALID_URL, url) @@ -21,25 +30,38 @@ class SpiegelIE(InfoExtractor):          webpage = self._download_webpage(url, video_id) -        video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>', -            webpage, u'title') +        video_title = self._html_search_regex( +            r'<div class="module-title">(.*?)</div>', webpage, u'title')          xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml' -        xml_code = self._download_webpage(xml_url, video_id, -                    note=u'Downloading XML', errnote=u'Failed to download XML') +        xml_code = self._download_webpage( +            xml_url, video_id, +            note=u'Downloading XML', errnote=u'Failed to download XML')          idoc = xml.etree.ElementTree.fromstring(xml_code) -        last_type = idoc[-1] -        filename = last_type.findall('./filename')[0].text -        duration = float(last_type.findall('./duration')[0].text) -        video_url = 'http://video2.spiegel.de/flash/' + filename -        video_ext = filename.rpartition('.')[2] +        formats = [ +            { +                'format_id': n.tag.rpartition('type')[2], +                'url': u'http://video2.spiegel.de/flash/' + n.find('./filename').text, +                'width': int(n.find('./width').text), +                'height': int(n.find('./height').text), +                'abr': int(n.find('./audiobitrate').text), +                'vbr': int(n.find('./videobitrate').text), +                'vcodec': n.find('./codec').text, +                'acodec': 'MP4A', +            } +            for n in list(idoc) +            # Blacklist type 6, it's extremely LQ and not available on the same server +            if n.tag.startswith('type') and n.tag != 'type6' +        ] +        formats.sort(key=lambda f: f['vbr']) +        duration = float(idoc[0].findall('./duration')[0].text) +          info = {              'id': video_id, -            'url': video_url, -            'ext': video_ext,              'title': video_title,              'duration': duration, +            'formats': formats,          } -        return [info] +        return info diff --git a/youtube_dl/extractor/subtitles.py b/youtube_dl/extractor/subtitles.py index 90de7de3a..4b4c5235d 100644 --- a/youtube_dl/extractor/subtitles.py +++ b/youtube_dl/extractor/subtitles.py @@ -12,9 +12,9 @@ class SubtitlesInfoExtractor(InfoExtractor):          return any([self._downloader.params.get('writesubtitles', False),                      self._downloader.params.get('writeautomaticsub')]) -    def _list_available_subtitles(self, video_id, webpage=None): +    def _list_available_subtitles(self, video_id, webpage):          """ outputs the available subtitles for the video """ -        sub_lang_list = self._get_available_subtitles(video_id) +        sub_lang_list = self._get_available_subtitles(video_id, webpage)          auto_captions_list = self._get_available_automatic_caption(video_id, webpage)          sub_lang = ",".join(list(sub_lang_list.keys()))          self.to_screen(u'%s: Available subtitles for video: %s' % @@ -23,7 +23,7 @@ class SubtitlesInfoExtractor(InfoExtractor):          self.to_screen(u'%s: Available automatic captions for video: %s' %                         (video_id, auto_lang)) -    def extract_subtitles(self, video_id, video_webpage=None): +    def extract_subtitles(self, video_id, webpage):          """          returns {sub_lang: sub} ,{} if subtitles not found or None if the          subtitles aren't requested. @@ -32,9 +32,9 @@ class SubtitlesInfoExtractor(InfoExtractor):              return None          available_subs_list = {}          if self._downloader.params.get('writeautomaticsub', False): -            available_subs_list.update(self._get_available_automatic_caption(video_id, video_webpage)) +            available_subs_list.update(self._get_available_automatic_caption(video_id, webpage))          if self._downloader.params.get('writesubtitles', False): -            available_subs_list.update(self._get_available_subtitles(video_id)) +            available_subs_list.update(self._get_available_subtitles(video_id, webpage))          if not available_subs_list:  # error, it didn't get the available subtitles              return {} @@ -74,7 +74,7 @@ class SubtitlesInfoExtractor(InfoExtractor):              return          return sub -    def _get_available_subtitles(self, video_id): +    def _get_available_subtitles(self, video_id, webpage):          """          returns {sub_lang: url} or {} if not available          Must be redefined by the subclasses diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py index c910110ca..bc48620f0 100644 --- a/youtube_dl/extractor/teamcoco.py +++ b/youtube_dl/extractor/teamcoco.py @@ -1,4 +1,5 @@  import re +import xml.etree.ElementTree  from .common import InfoExtractor  from ..utils import ( @@ -11,7 +12,7 @@ class TeamcocoIE(InfoExtractor):      _TEST = {          u'url': u'http://teamcoco.com/video/louis-ck-interview-george-w-bush',          u'file': u'19705.mp4', -        u'md5': u'27b6f7527da5acf534b15f21b032656e', +        u'md5': u'cde9ba0fa3506f5f017ce11ead928f9a',          u'info_dict': {              u"description": u"Louis C.K. got starstruck by George W. Bush, so what? Part one.",               u"title": u"Louis C.K. Interview Pt. 1 11/3/11" @@ -31,16 +32,40 @@ class TeamcocoIE(InfoExtractor):          self.report_extraction(video_id)          data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id -        data = self._download_webpage(data_url, video_id, 'Downloading data webpage') +        data_xml = self._download_webpage(data_url, video_id, 'Downloading data webpage') +        data = xml.etree.ElementTree.fromstring(data_xml.encode('utf-8')) -        video_url = self._html_search_regex(r'<file [^>]*type="high".*?>(.*?)</file>', -            data, u'video URL') -        return [{ +        qualities = ['500k', '480p', '1000k', '720p', '1080p'] +        formats = [] +        for file in data.findall('files/file'): +            if file.attrib.get('playmode') == 'all': +                # it just duplicates one of the entries +                break +            file_url = file.text +            m_format = re.search(r'(\d+(k|p))\.mp4', file_url) +            if m_format is not None: +                format_id = m_format.group(1) +            else: +                format_id = file.attrib['bitrate'] +            formats.append({ +                'url': file_url, +                'ext': 'mp4', +                'format_id': format_id, +            }) +        def sort_key(f): +            try: +                return qualities.index(f['format_id']) +            except ValueError: +                return -1 +        formats.sort(key=sort_key) +        if not formats: +            raise RegexNotFoundError(u'Unable to extract video URL') + +        return {              'id':          video_id, -            'url':         video_url, -            'ext':         'mp4', +            'formats': formats,              'title':       self._og_search_title(webpage),              'thumbnail':   self._og_search_thumbnail(webpage),              'description': self._og_search_description(webpage), -        }] +        } diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py index dfa1176a3..2e497c86e 100644 --- a/youtube_dl/extractor/ted.py +++ b/youtube_dl/extractor/ted.py @@ -1,10 +1,14 @@  import json  import re -from .common import InfoExtractor +from .subtitles import SubtitlesInfoExtractor +from ..utils import ( +    compat_str, +    RegexNotFoundError, +) -class TEDIE(InfoExtractor): +class TEDIE(SubtitlesInfoExtractor):      _VALID_URL=r'''http://www\.ted\.com/                     (                          ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist @@ -32,33 +36,32 @@ class TEDIE(InfoExtractor):      def _real_extract(self, url):          m=re.match(self._VALID_URL, url, re.VERBOSE)          if m.group('type_talk'): -            return [self._talk_info(url)] +            return self._talk_info(url)          else :              playlist_id=m.group('playlist_id')              name=m.group('name')              self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))              return [self._playlist_videos_info(url,name,playlist_id)] -    def _playlist_videos_info(self,url,name,playlist_id=0): + +    def _playlist_videos_info(self, url, name, playlist_id):          '''Returns the videos of the playlist''' -        video_RE=r''' -                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)" -                     ([.\s]*?)data-playlist_item_id="(\d+)" -                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)" -                     ''' -        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>' -        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage') -        m_videos=re.finditer(video_RE,webpage,re.VERBOSE) -        m_names=re.finditer(video_name_RE,webpage) + +        webpage = self._download_webpage( +            url, playlist_id, u'Downloading playlist webpage') +        matches = re.finditer( +            r'<p\s+class="talk-title[^"]*"><a\s+href="(?P<talk_url>/talks/[^"]+\.html)">[^<]*</a></p>', +            webpage)          playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',                                                   webpage, 'playlist title') -        playlist_entries = [] -        for m_video, m_name in zip(m_videos,m_names): -            talk_url='http://www.ted.com%s' % m_name.group('talk_url') -            playlist_entries.append(self.url_result(talk_url, 'TED')) -        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title) +        playlist_entries = [ +            self.url_result(u'http://www.ted.com' + m.group('talk_url'), 'TED') +            for m in matches +        ] +        return self.playlist_result( +            playlist_entries, playlist_id=playlist_id, playlist_title=playlist_title)      def _talk_info(self, url, video_id=0):          """Return the video for the talk in the url""" @@ -81,16 +84,35 @@ class TEDIE(InfoExtractor):              'ext': 'mp4',              'url': stream['file'],              'format': stream['id'] -            } for stream in info['htmlStreams']] -        info = { -            'id': info['id'], +        } for stream in info['htmlStreams']] + +        video_id = info['id'] + +        # subtitles +        video_subtitles = self.extract_subtitles(video_id, webpage) +        if self._downloader.params.get('listsubtitles', False): +            self._list_available_subtitles(video_id, webpage) +            return + +        return { +            'id': video_id,              'title': title,              'thumbnail': thumbnail,              'description': desc, +            'subtitles': video_subtitles,              'formats': formats,          } -        # TODO: Remove when #980 has been merged -        info.update(info['formats'][-1]) - -        return info +    def _get_available_subtitles(self, video_id, webpage): +        try: +            options = self._search_regex(r'(?:<select name="subtitles_language_select" id="subtitles_language_select">)(.*?)(?:</select>)', webpage, 'subtitles_language_select', flags=re.DOTALL) +            languages = re.findall(r'(?:<option value=")(\S+)"', options) +            if languages: +                sub_lang_list = {} +                for l in languages: +                    url = 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/srt' % (video_id, l) +                    sub_lang_list[l] = url +                return sub_lang_list +        except RegexNotFoundError as err: +            self._downloader.report_warning(u'video doesn\'t have subtitles') +        return {} diff --git a/youtube_dl/extractor/tube8.py b/youtube_dl/extractor/tube8.py index aea9d9a24..d4b7603c7 100644 --- a/youtube_dl/extractor/tube8.py +++ b/youtube_dl/extractor/tube8.py @@ -46,10 +46,10 @@ class Tube8IE(InfoExtractor):          if webpage.find('"encrypted":true')!=-1:              password = self._html_search_regex(r'"video_title":"([^"]+)', webpage, u'password')              video_url = aes_decrypt_text(video_url, password, 32).decode('utf-8') -        path = compat_urllib_parse_urlparse( video_url ).path -        extension = os.path.splitext( path )[1][1:] +        path = compat_urllib_parse_urlparse(video_url).path +        extension = os.path.splitext(path)[1][1:]          format = path.split('/')[4].split('_')[:2] -        format = "-".join( format ) +        format = "-".join(format)          return {              'id': video_id, diff --git a/youtube_dl/extractor/tvp.py b/youtube_dl/extractor/tvp.py new file mode 100644 index 000000000..bfed9dd04 --- /dev/null +++ b/youtube_dl/extractor/tvp.py @@ -0,0 +1,42 @@ +import json +import re + +from .common import InfoExtractor + + +class TvpIE(InfoExtractor): +    IE_NAME = u'tvp.pl' +    _VALID_URL = r'https?://www\.tvp\.pl/.*?wideo/(?P<date>\d+)/(?P<id>\d+)' + +    _TEST = { +        u'url': u'http://www.tvp.pl/warszawa/magazyny/campusnews/wideo/31102013/12878238', +        u'md5': u'148408967a6a468953c0a75cbdaf0d7a', +        u'file': u'12878238.wmv', +        u'info_dict': { +            u'title': u'31.10.2013 - Odcinek 2', +            u'description': u'31.10.2013 - Odcinek 2', +        }, +        u'skip': u'Download has to use same server IP as extraction. Therefore, a good (load-balancing) DNS resolver will make the download fail.' +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('id') +        webpage = self._download_webpage(url, video_id) +        json_url = 'http://www.tvp.pl/pub/stat/videofileinfo?video_id=%s' % video_id +        json_params = self._download_webpage( +            json_url, video_id, u"Downloading video metadata") + +        params = json.loads(json_params) +        self.report_extraction(video_id) +        video_url = params['video_url'] + +        title = self._og_search_title(webpage, fatal=True) +        return { +            'id': video_id, +            'title': title, +            'ext': 'wmv', +            'url': video_url, +            'description': self._og_search_description(webpage), +            'thumbnail': self._og_search_thumbnail(webpage), +        } diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py index 1c1cc418d..4378b1780 100644 --- a/youtube_dl/extractor/vevo.py +++ b/youtube_dl/extractor/vevo.py @@ -5,7 +5,7 @@ import datetime  from .common import InfoExtractor  from ..utils import ( -    determine_ext, +    compat_HTTPError,      ExtractorError,  ) @@ -16,26 +16,22 @@ class VevoIE(InfoExtractor):      (currently used by MTVIE)      """      _VALID_URL = r'((http://www.vevo.com/watch/.*?/.*?/)|(vevo:))(?P<id>.*?)(\?|$)' -    _TEST = { +    _TESTS = [{          u'url': u'http://www.vevo.com/watch/hurts/somebody-to-die-for/GB1101300280',          u'file': u'GB1101300280.mp4', +        u"md5": u"06bea460acb744eab74a9d7dcb4bfd61",          u'info_dict': {              u"upload_date": u"20130624",              u"uploader": u"Hurts",              u"title": u"Somebody to Die For", -            u'duration': 230, +            u"duration": 230, +            u"width": 1920, +            u"height": 1080,          } -    } +    }] +    _SMIL_BASE_URL = 'http://smil.lvl3.vevo.com/' -    def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        video_id = mobj.group('id') - -        json_url = 'http://videoplayer.vevo.com/VideoService/AuthenticateVideo?isrc=%s' % video_id -        info_json = self._download_webpage(json_url, video_id, u'Downloading json info') - -        self.report_extraction(video_id) -        video_info = json.loads(info_json)['video'] +    def _formats_from_json(self, video_info):          last_version = {'version': -1}          for version in video_info['videoVersions']:              # These are the HTTP downloads, other types are for different manifests @@ -50,17 +46,75 @@ class VevoIE(InfoExtractor):          # Already sorted from worst to best quality          for rend in renditions.findall('rendition'):              attr = rend.attrib -            f_url = attr['url'] +            format_note = '%(videoCodec)s@%(videoBitrate)4sk, %(audioCodec)s@%(audioBitrate)3sk' % attr              formats.append({ -                'url': f_url, -                'ext': determine_ext(f_url), +                'url': attr['url'], +                'format_id': attr['name'], +                'format_note': format_note,                  'height': int(attr['frameheight']),                  'width': int(attr['frameWidth']),              }) +        return formats + +    def _formats_from_smil(self, smil_xml): +        formats = [] +        smil_doc = xml.etree.ElementTree.fromstring(smil_xml.encode('utf-8')) +        els = smil_doc.findall('.//{http://www.w3.org/2001/SMIL20/Language}video') +        for el in els: +            src = el.attrib['src'] +            m = re.match(r'''(?xi) +                (?P<ext>[a-z0-9]+): +                (?P<path> +                    [/a-z0-9]+     # The directory and main part of the URL +                    _(?P<cbr>[0-9]+)k +                    _(?P<width>[0-9]+)x(?P<height>[0-9]+) +                    _(?P<vcodec>[a-z0-9]+) +                    _(?P<vbr>[0-9]+) +                    _(?P<acodec>[a-z0-9]+) +                    _(?P<abr>[0-9]+) +                    \.[a-z0-9]+  # File extension +                )''', src) +            if not m: +                continue -        date_epoch = int(self._search_regex( -            r'/Date\((\d+)\)/', video_info['launchDate'], u'launch date'))/1000 -        upload_date = datetime.datetime.fromtimestamp(date_epoch) +            format_url = self._SMIL_BASE_URL + m.group('path') +            formats.append({ +                'url': format_url, +                'format_id': u'SMIL_' + m.group('cbr'), +                'vcodec': m.group('vcodec'), +                'acodec': m.group('acodec'), +                'vbr': int(m.group('vbr')), +                'abr': int(m.group('abr')), +                'ext': m.group('ext'), +                'width': int(m.group('width')), +                'height': int(m.group('height')), +            }) +        return formats + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('id') + +        json_url = 'http://videoplayer.vevo.com/VideoService/AuthenticateVideo?isrc=%s' % video_id +        info_json = self._download_webpage(json_url, video_id, u'Downloading json info') +        video_info = json.loads(info_json)['video'] + +        formats = self._formats_from_json(video_info) +        try: +            smil_url = '%s/Video/V2/VFILE/%s/%sr.smil' % ( +                self._SMIL_BASE_URL, video_id, video_id.lower()) +            smil_xml = self._download_webpage(smil_url, video_id, +                                              u'Downloading SMIL info') +            formats.extend(self._formats_from_smil(smil_xml)) +        except ExtractorError as ee: +            if not isinstance(ee.cause, compat_HTTPError): +                raise +            self._downloader.report_warning( +                u'Cannot download SMIL information, falling back to JSON ..') + +        timestamp_ms = int(self._search_regex( +            r'/Date\((\d+)\)/', video_info['launchDate'], u'launch date')) +        upload_date = datetime.datetime.fromtimestamp(timestamp_ms // 1000)          info = {              'id': video_id,              'title': video_info['title'], @@ -71,7 +125,4 @@ class VevoIE(InfoExtractor):              'duration': video_info['duration'],          } -        # TODO: Remove when #980 has been merged -        info.update(formats[-1]) -          return info diff --git a/youtube_dl/extractor/viddler.py b/youtube_dl/extractor/viddler.py index 12c84a985..826804af3 100644 --- a/youtube_dl/extractor/viddler.py +++ b/youtube_dl/extractor/viddler.py @@ -8,7 +8,7 @@ from ..utils import (  class ViddlerIE(InfoExtractor): -    _VALID_URL = r'(?P<domain>https?://(?:www\.)?viddler.com)/(?:v|embed|player)/(?P<id>[0-9]+)' +    _VALID_URL = r'(?P<domain>https?://(?:www\.)?viddler.com)/(?:v|embed|player)/(?P<id>[a-z0-9]+)'      _TEST = {          u"url": u"http://www.viddler.com/v/43903784",          u'file': u'43903784.mp4', diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index b4dbcd2ee..d465bf20b 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -20,14 +20,14 @@ class VimeoIE(InfoExtractor):      """Information extractor for vimeo.com."""      # _VALID_URL matches Vimeo URLs -    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)/?(?:[?].*)?(?:#.*)?$' +    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|(?P<player>player))\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)/?(?:[?].*)?(?:#.*)?$'      _NETRC_MACHINE = 'vimeo'      IE_NAME = u'vimeo'      _TESTS = [          {              u'url': u'http://vimeo.com/56015672#at=0',              u'file': u'56015672.mp4', -            u'md5': u'ae7a1d8b183758a0506b0622f37dfa14', +            u'md5': u'8879b6cc097e987f02484baf890129e5',              u'info_dict': {                  u"upload_date": u"20121220",                   u"description": u"This is a test case for youtube-dl.\nFor more information, see github.com/rg3/youtube-dl\nTest chars: \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550",  @@ -128,11 +128,9 @@ class VimeoIE(InfoExtractor):              raise ExtractorError(u'Invalid URL: %s' % url)          video_id = mobj.group('id') -        if not mobj.group('proto'): -            url = 'https://' + url -        elif mobj.group('pro'): +        if mobj.group('pro') or mobj.group('player'):              url = 'http://player.vimeo.com/video/' + video_id -        elif mobj.group('direct_link'): +        else:              url = 'https://vimeo.com/' + video_id          # Retrieve video webpage to extract further information @@ -205,7 +203,7 @@ class VimeoIE(InfoExtractor):          # Vimeo specific: extract video codec and quality information          # First consider quality, then codecs, then take everything          codecs = [('vp6', 'flv'), ('vp8', 'flv'), ('h264', 'mp4')] -        files = { 'hd': [], 'sd': [], 'other': []} +        files = {'hd': [], 'sd': [], 'other': []}          config_files = config["video"].get("files") or config["request"].get("files")          for codec_name, codec_extension in codecs:              for quality in config_files.get(codec_name, []): @@ -234,7 +232,7 @@ class VimeoIE(InfoExtractor):          if len(formats) == 0:              raise ExtractorError(u'No known codec found') -        return [{ +        return {              'id':       video_id,              'uploader': video_uploader,              'uploader_id': video_uploader_id, @@ -243,7 +241,8 @@ class VimeoIE(InfoExtractor):              'thumbnail':    video_thumbnail,              'description':  video_description,              'formats': formats, -        }] +            'webpage_url': url, +        }  class VimeoChannelIE(InfoExtractor): diff --git a/youtube_dl/extractor/vine.py b/youtube_dl/extractor/vine.py index c4ec1f06f..651ba317d 100644 --- a/youtube_dl/extractor/vine.py +++ b/youtube_dl/extractor/vine.py @@ -27,7 +27,7 @@ class VineIE(InfoExtractor):          video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',              webpage, u'video URL') -        uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>', +        uploader = self._html_search_regex(r'<p class="username">(.*?)</p>',              webpage, u'uploader', fatal=False, flags=re.DOTALL)          return [{ diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py new file mode 100644 index 000000000..90d8a6d07 --- /dev/null +++ b/youtube_dl/extractor/vk.py @@ -0,0 +1,45 @@ +# encoding: utf-8 +import re +import json + +from .common import InfoExtractor +from ..utils import ( +    compat_str, +    unescapeHTML, +) + + +class VKIE(InfoExtractor): +    IE_NAME = u'vk.com' +    _VALID_URL = r'https?://vk\.com/(?:videos.*?\?.*?z=)?video(?P<id>.*?)(?:\?|%2F|$)' + +    _TEST = { +        u'url': u'http://vk.com/videos-77521?z=video-77521_162222515%2Fclub77521', +        u'md5': u'0deae91935c54e00003c2a00646315f0', +        u'info_dict': { +            u'id': u'162222515', +            u'ext': u'flv', +            u'title': u'ProtivoGunz - Хуёвая песня', +            u'uploader': u'Noize MC', +        }, +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('id') +        info_url = 'http://vk.com/al_video.php?act=show&al=1&video=%s' % video_id +        info_page = self._download_webpage(info_url, video_id) +        m_yt = re.search(r'src="(http://www.youtube.com/.*?)"', info_page) +        if m_yt is not None: +            self.to_screen(u'Youtube video detected') +            return self.url_result(m_yt.group(1), 'Youtube') +        vars_json = self._search_regex(r'var vars = ({.*?});', info_page, u'vars') +        vars = json.loads(vars_json) + +        return { +            'id': compat_str(vars['vid']), +            'url': vars['url240'], +            'title': unescapeHTML(vars['md_title']), +            'thumbnail': vars['jpg'], +            'uploader': vars['md_author'], +        } diff --git a/youtube_dl/extractor/weibo.py b/youtube_dl/extractor/weibo.py index 0757495bd..fa784ab99 100644 --- a/youtube_dl/extractor/weibo.py +++ b/youtube_dl/extractor/weibo.py @@ -13,6 +13,7 @@ class WeiboIE(InfoExtractor):      _VALID_URL = r'https?://video\.weibo\.com/v/weishipin/t_(?P<id>.+?)\.htm'      _TEST = { +        u'add_ie': ['Sina'],          u'url': u'http://video.weibo.com/v/weishipin/t_zjUw2kZ.htm',          u'file': u'98322879.flv',          u'info_dict': { diff --git a/youtube_dl/extractor/xnxx.py b/youtube_dl/extractor/xnxx.py index 8a0eb1afd..1177a4b14 100644 --- a/youtube_dl/extractor/xnxx.py +++ b/youtube_dl/extractor/xnxx.py @@ -9,7 +9,7 @@ from ..utils import (  class XNXXIE(InfoExtractor): -    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)' +    _VALID_URL = r'^(?:https?://)?(?:video|www)\.xnxx\.com/video([0-9]+)/(.*)'      VIDEO_URL_RE = r'flv_url=(.*?)&'      VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'      VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&' diff --git a/youtube_dl/extractor/xtube.py b/youtube_dl/extractor/xtube.py new file mode 100644 index 000000000..03ad88bed --- /dev/null +++ b/youtube_dl/extractor/xtube.py @@ -0,0 +1,55 @@ +import os +import re + +from .common import InfoExtractor +from ..utils import ( +    compat_urllib_parse_urlparse, +    compat_urllib_request, +    compat_urllib_parse, +) + +class XTubeIE(InfoExtractor): +    _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>xtube\.com/watch\.php\?v=(?P<videoid>[^/?&]+))' +    _TEST = { +        u'url': u'http://www.xtube.com/watch.php?v=kVTUy_G222_', +        u'file': u'kVTUy_G222_.mp4', +        u'md5': u'092fbdd3cbe292c920ef6fc6a8a9cdab', +        u'info_dict': { +            u"title": u"strange erotica", +            u"description": u"surreal gay themed erotica...almost an ET kind of thing", +            u"uploader": u"greenshowers", +            u"age_limit": 18, +        } +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('videoid') +        url = 'http://www.' + mobj.group('url') + +        req = compat_urllib_request.Request(url) +        req.add_header('Cookie', 'age_verified=1') +        webpage = self._download_webpage(req, video_id) + +        video_title = self._html_search_regex(r'<div class="p_5px[^>]*>([^<]+)', webpage, u'title') +        video_uploader = self._html_search_regex(r'so_s\.addVariable\("owner_u", "([^"]+)', webpage, u'uploader', fatal=False) +        video_description = self._html_search_regex(r'<p class="video_description">([^<]+)', webpage, u'description', default=None) +        video_url= self._html_search_regex(r'var videoMp4 = "([^"]+)', webpage, u'video_url').replace('\\/', '/') +        path = compat_urllib_parse_urlparse(video_url).path +        extension = os.path.splitext(path)[1][1:] +        format = path.split('/')[5].split('_')[:2] +        format[0] += 'p' +        format[1] += 'k' +        format = "-".join(format) + +        return { +            'id': video_id, +            'title': video_title, +            'uploader': video_uploader, +            'description': video_description, +            'url': video_url, +            'ext': extension, +            'format': format, +            'format_id': format, +            'age_limit': 18, +        } diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py index 464b498f5..34e6afb20 100644 --- a/youtube_dl/extractor/yahoo.py +++ b/youtube_dl/extractor/yahoo.py @@ -132,7 +132,7 @@ class YahooSearchIE(SearchInfoExtractor):                  mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)                  e = self.url_result('http://' + mobj.group('url'), 'Yahoo')                  res['entries'].append(e) -            if (pagenum * 30 +i >= n) or (m[u'last'] >= (m[u'total'] -1 )): +            if (pagenum * 30 +i >= n) or (m[u'last'] >= (m[u'total'] -1)):                  break          return res diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py index 9d88c17f5..a8fd40c83 100644 --- a/youtube_dl/extractor/youku.py +++ b/youtube_dl/extractor/youku.py @@ -18,7 +18,7 @@ class YoukuIE(InfoExtractor):          u"url": u"http://v.youku.com/v_show/id_XNDgyMDQ2NTQw.html",          u"file": u"XNDgyMDQ2NTQw_part00.flv",          u"md5": u"ffe3f2e435663dc2d1eea34faeff5b5b", -        u"params": { u"test": False }, +        u"params": {u"test": False},          u"info_dict": {              u"title": u"youtube-dl test video \"'/\\ä↭𝕐"          } @@ -37,8 +37,8 @@ class YoukuIE(InfoExtractor):          source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")          seed = float(seed)          for i in range(len(source)): -            seed  =  (seed * 211 + 30031 ) % 65536 -            index  =  math.floor(seed / 65536 * len(source) ) +            seed  =  (seed * 211 + 30031) % 65536 +            index  =  math.floor(seed / 65536 * len(source))              mixed.append(source[int(index)])              source.remove(source[int(index)])          #return ''.join(mixed) diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py index e46a9b4d6..bd0f2cae0 100644 --- a/youtube_dl/extractor/youporn.py +++ b/youtube_dl/extractor/youporn.py @@ -81,14 +81,14 @@ class YouPornIE(InfoExtractor):              # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0              # A path looks like this:              # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4 -            video_url = unescapeHTML( link ) -            path = compat_urllib_parse_urlparse( video_url ).path -            extension = os.path.splitext( path )[1][1:] +            video_url = unescapeHTML(link) +            path = compat_urllib_parse_urlparse(video_url).path +            extension = os.path.splitext(path)[1][1:]              format = path.split('/')[4].split('_')[:2]              # size = format[0]              # bitrate = format[1] -            format = "-".join( format ) +            format = "-".join(format)              # title = u'%s-%s-%s' % (video_title, size, bitrate)              formats.append({ diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index d05d0a8c1..1aa549740 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -74,14 +74,8 @@ class YoutubeBaseInfoExtractor(InfoExtractor):              self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))              return False -        galx = None -        dsh = None -        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page) -        if match: -          galx = match.group(1) -        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page) -        if match: -          dsh = match.group(1) +        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"', +                                  login_page, u'Login GALX parameter')          # Log in          login_form_strs = { @@ -95,7 +89,6 @@ class YoutubeBaseInfoExtractor(InfoExtractor):                  u'checkConnection': u'',                  u'checkedDomains': u'youtube',                  u'dnConn': u'', -                u'dsh': dsh,                  u'pstMsg': u'0',                  u'rmShown': u'1',                  u'secTok': u'', @@ -347,18 +340,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):              }          },          { -            u"url":  u"http://www.youtube.com/watch?v=1ltcDfZMA3U", -            u"file":  u"1ltcDfZMA3U.mp4", -            u"note": u"Test VEVO video (#897)", -            u"info_dict": { -                u"upload_date": u"20070518", -                u"title": u"Maps - It Will Find You", -                u"description": u"Music video by Maps performing It Will Find You.", -                u"uploader": u"MuteUSA", -                u"uploader_id": u"MuteUSA" -            } -        }, -        {              u"url":  u"http://www.youtube.com/watch?v=UxxajLWwzqY",              u"file":  u"UxxajLWwzqY.mp4",              u"note": u"Test generic use_cipher_signature video (#897)", @@ -1038,6 +1019,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):          """Turn the encrypted s field into a working signature"""          if player_url is not None: +            if player_url.startswith(u'//'): +                player_url = u'https:' + player_url              try:                  player_id = (player_url, len(s))                  if player_id not in self._player_cache: @@ -1101,7 +1084,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):          else:              raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s))) -    def _get_available_subtitles(self, video_id): +    def _get_available_subtitles(self, video_id, webpage):          try:              sub_list = self._download_webpage(                  'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id, @@ -1117,8 +1100,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):              params = compat_urllib_parse.urlencode({                  'lang': lang,                  'v': video_id, -                'fmt': self._downloader.params.get('subtitlesformat'), -                'name': l[0], +                'fmt': self._downloader.params.get('subtitlesformat', 'srt'), +                'name': l[0].encode('utf-8'),              })              url = u'http://www.youtube.com/api/timedtext?' + params              sub_lang_list[lang] = url @@ -1130,7 +1113,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):      def _get_available_automatic_caption(self, video_id, webpage):          """We need the webpage for getting the captions url, pass it as an             argument to speed up the process.""" -        sub_format = self._downloader.params.get('subtitlesformat') +        sub_format = self._downloader.params.get('subtitlesformat', 'srt')          self.to_screen(u'%s: Looking for automatic captions' % video_id)          mobj = re.search(r';ytplayer.config = ({.*?});', webpage)          err_msg = u'Couldn\'t find automatic captions for %s' % video_id @@ -1318,6 +1301,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):              else:                  raise ExtractorError(u'"token" parameter not in video info for unknown reason') +        if 'view_count' in video_info: +            view_count = int(video_info['view_count'][0]) +        else: +            view_count = None +          # Check for "rental" videos          if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:              raise ExtractorError(u'"rental" videos not supported') @@ -1504,7 +1492,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):                  'subtitles':    video_subtitles,                  'duration':     video_duration,                  'age_limit':    18 if age_gate else 0, -                'annotations':  video_annotations +                'annotations':  video_annotations, +                'webpage_url': 'https://www.youtube.com/watch?v=%s' % video_id, +                'view_count': view_count,              })          return results @@ -1590,7 +1580,6 @@ class YoutubePlaylistIE(InfoExtractor):  class YoutubeChannelIE(InfoExtractor):      IE_DESC = u'YouTube.com channels'      _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)" -    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'      _MORE_PAGES_INDICATOR = 'yt-uix-load-more'      _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'      IE_NAME = u'youtube:channel' @@ -1611,29 +1600,30 @@ class YoutubeChannelIE(InfoExtractor):          # Download channel page          channel_id = mobj.group(1)          video_ids = [] -        pagenum = 1 - -        url = self._TEMPLATE_URL % (channel_id, pagenum) -        page = self._download_webpage(url, channel_id, -                                      u'Downloading page #%s' % pagenum) - -        # Extract video identifiers -        ids_in_page = self.extract_videos_from_page(page) -        video_ids.extend(ids_in_page) +        url = 'https://www.youtube.com/channel/%s/videos' % channel_id +        channel_page = self._download_webpage(url, channel_id) +        if re.search(r'channel-header-autogenerated-label', channel_page) is not None: +            autogenerated = True +        else: +            autogenerated = False -        # Download any subsequent channel pages using the json-based channel_ajax query -        if self._MORE_PAGES_INDICATOR in page: +        if autogenerated: +            # The videos are contained in a single page +            # the ajax pages can't be used, they are empty +            video_ids = self.extract_videos_from_page(channel_page) +        else: +            # Download all channel pages using the json-based channel_ajax query              for pagenum in itertools.count(1):                  url = self._MORE_PAGES_URL % (pagenum, channel_id)                  page = self._download_webpage(url, channel_id,                                                u'Downloading page #%s' % pagenum) - +                      page = json.loads(page) - +                      ids_in_page = self.extract_videos_from_page(page['content_html'])                  video_ids.extend(ids_in_page) - -                if self._MORE_PAGES_INDICATOR  not in page['load_more_widget_html']: +     +                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:                      break          self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids))) @@ -1750,6 +1740,10 @@ class YoutubeSearchIE(SearchInfoExtractor):          videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]          return self.playlist_result(videos, query) +class YoutubeSearchDateIE(YoutubeSearchIE): +    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published' +    _SEARCH_KEY = 'ytsearchdate' +    IE_DESC = u'YouTube.com searches, newest videos first'  class YoutubeShowIE(InfoExtractor):      IE_DESC = u'YouTube.com (multi-season) shows' diff --git a/youtube_dl/update.py b/youtube_dl/update.py index 0689a4891..f41b4785a 100644 --- a/youtube_dl/update.py +++ b/youtube_dl/update.py @@ -2,11 +2,15 @@ import io  import json  import traceback  import hashlib +import os  import subprocess  import sys  from zipimport import zipimporter -from .utils import * +from .utils import ( +    compat_str, +    compat_urllib_request, +)  from .version import __version__  def rsa_verify(message, signature, key): diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 048afc8e7..110058c79 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.10.28' +__version__ = '2013.11.17' | 
