40 files changed, 1136 insertions, 240 deletions
diff --git a/Makefile b/Makefile
@@ -13,13 +13,13 @@ PYTHON=/usr/bin/env python
 
 # set SYSCONFDIR to /etc if PREFIX=/usr or PREFIX=/usr/local
 ifeq ($(PREFIX),/usr)
-    SYSCONFDIR=/etc
+	SYSCONFDIR=/etc
 else
-    ifeq ($(PREFIX),/usr/local)
-        SYSCONFDIR=/etc
-    else
-        SYSCONFDIR=$(PREFIX)/etc
-    endif
+	ifeq ($(PREFIX),/usr/local)
+		SYSCONFDIR=/etc
+	else
+		SYSCONFDIR=$(PREFIX)/etc
+	endif
 endif
 
 install: youtube-dl youtube-dl.1 youtube-dl.bash-completion
@@ -71,6 +71,7 @@ youtube-dl.tar.gz: youtube-dl README.md README.txt youtube-dl.1 youtube-dl.bash-
 		--exclude '*~' \
 		--exclude '__pycache' \
 		--exclude '.git' \
+		--exclude 'testdata' \
 		-- \
 		bin devscripts test youtube_dl \
 		CHANGELOG LICENSE README.md README.txt \
diff --git a/README.md b/README.md
@@ -57,9 +57,10 @@ which means you can modify it, redistribute it or use it however you like.
                                file. Record all downloaded videos in it.
 
 ## Download Options:
-    -r, --rate-limit LIMIT     maximum download rate (e.g. 50k or 44.6m)
+    -r, --rate-limit LIMIT     maximum download rate in bytes per second (e.g.
+                               50K or 4.2M)
     -R, --retries RETRIES      number of retries (default is 10)
-    --buffer-size SIZE         size of download buffer (e.g. 1024 or 16k)
+    --buffer-size SIZE         size of download buffer (e.g. 1024 or 16K)
                                (default is 1024)
     --no-resize-buffer         do not automatically adjust the buffer size. By
                                default, the buffer size is automatically resized
@@ -100,6 +101,7 @@ which means you can modify it, redistribute it or use it however you like.
                                file modification time
     --write-description        write video description to a .description file
     --write-info-json          write video metadata to a .info.json file
+    --write-annotations        write video annotations to a .annotation file
    --write-thumbnail          write thumbnail image to disk
 
 ## Verbosity / Simulation Options:
@@ -166,6 +168,7 @@ which means you can modify it, redistribute it or use it however you like.
                                processed files are overwritten by default
     --embed-subs               embed subtitles in the video (only for mp4
                                videos)
+    --add-metadata             add metadata to the files
 
 # CONFIGURATION
diff --git a/test/__init__.py b/test/__init__.py
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/test/__init__.py
diff --git a/test/helper.py b/test/helper.py
index ad1b74dd3..79a0ede48 100644
--- a/test/helper.py
+++ b/test/helper.py
@@ -1,22 +1,27 @@
 import errno
 import io
+import hashlib
 import json
 import os.path
 import re
 import types
 
 import youtube_dl.extractor
-from youtube_dl import YoutubeDL, YoutubeDLHandler
-from youtube_dl.utils import (
-    compat_cookiejar,
-    compat_urllib_request,
-)
+from youtube_dl import YoutubeDL
 
-youtube_dl._setup_opener(timeout=10)
 
-PARAMETERS_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), "parameters.json")
-with io.open(PARAMETERS_FILE, encoding='utf-8') as pf:
-    parameters = json.load(pf)
+def global_setup():
+    youtube_dl._setup_opener(timeout=10)
+
+
+def get_params(override=None):
+    PARAMETERS_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)),
+                                   "parameters.json")
+    with io.open(PARAMETERS_FILE, encoding='utf-8') as pf:
+        parameters = json.load(pf)
+    if override:
+        parameters.update(override)
+    return parameters
 
 
 def try_rm(filename):
@@ -32,7 +37,7 @@ class FakeYDL(YoutubeDL):
     def __init__(self):
         # Different instances of the downloader can't share the same dictionary
         # some test set the "sublang" parameter, which would break the md5 checks.
-        params = dict(parameters)
+        params = get_params()
         super(FakeYDL, self).__init__(params)
         self.result = []
@@ -62,3 +67,6 @@ def get_testcases():
         for t in getattr(ie, '_TESTS', []):
             t['name'] = type(ie).__name__[:-len('IE')]
             yield t
+
+
+md5 = lambda s: hashlib.md5(s.encode('utf-8')).hexdigest()
diff --git a/test/test_age_restriction.py b/test/test_age_restriction.py
index 943f9a315..d500c6edc 100644
--- a/test/test_age_restriction.py
+++ b/test/test_age_restriction.py
@@ -1,14 +1,16 @@
 #!/usr/bin/env python
+# Allow direct execution
+import os
 import sys
 import unittest
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from test.helper import global_setup, try_rm
+global_setup()
 
-# Allow direct execution
-import os
-sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
 from youtube_dl import YoutubeDL
-from helper import try_rm
 
 
 def _download_restricted(url, filename, age):
diff --git a/test/test_all_urls.py b/test/test_all_urls.py
index ff1c86efe..56e5f80e1 100644
--- a/test/test_all_urls.py
+++ b/test/test_all_urls.py
@@ -1,14 +1,20 @@
 #!/usr/bin/env python
+# Allow direct execution
+import os
 import sys
 import unittest
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
-# Allow direct execution
-import os
-sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-from youtube_dl.extractor import YoutubeIE, YoutubePlaylistIE, YoutubeChannelIE, JustinTVIE, gen_extractors
-from helper import get_testcases
+from test.helper import get_testcases
+
+from youtube_dl.extractor import (
+    gen_extractors,
+    JustinTVIE,
+    YoutubeIE,
+)
+
 
 class TestAllURLsMatching(unittest.TestCase):
     def setUp(self):
diff --git a/test/test_dailymotion_subtitles.py b/test/test_dailymotion_subtitles.py
index ed2ad311d..c596415c4 100644
--- a/test/test_dailymotion_subtitles.py
+++ b/test/test_dailymotion_subtitles.py
@@ -1,18 +1,16 @@
 #!/usr/bin/env python
+# Allow direct execution
+import os
 import sys
 import unittest
-import hashlib
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
-# Allow direct execution
-import os
-sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+from test.helper import FakeYDL, global_setup, md5
+global_setup()
 
-from youtube_dl.extractor import DailymotionIE
-from youtube_dl.utils import *
-from helper import FakeYDL
-md5 = lambda s: hashlib.md5(s.encode('utf-8')).hexdigest()
+from youtube_dl.extractor import DailymotionIE
 
 
 class TestDailymotionSubtitles(unittest.TestCase):
     def setUp(self):
diff --git a/test/test_download.py b/test/test_download.py
index fdf59bb5c..b9a9be11d 100644
--- a/test/test_download.py
+++ b/test/test_download.py
@@ -1,26 +1,31 @@
 #!/usr/bin/env python
+# Allow direct execution
+import os
+import sys
+import unittest
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from test.helper import get_params, get_testcases, global_setup, try_rm, md5
+global_setup()
+
+
 import hashlib
 import io
-import os
 import json
-import unittest
-import sys
 import socket
-import binascii
-
-# Allow direct execution
-sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
 import youtube_dl.YoutubeDL
-from youtube_dl.utils import *
-
-PARAMETERS_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), "parameters.json")
+from youtube_dl.utils import (
+    compat_str,
+    compat_urllib_error,
+    DownloadError,
+    ExtractorError,
+    UnavailableVideoError,
+)
 
 RETRIES = 3
-md5 = lambda s: hashlib.md5(s.encode('utf-8')).hexdigest()
-
 
 class YoutubeDL(youtube_dl.YoutubeDL):
     def __init__(self, *args, **kwargs):
         self.to_stderr = self.to_screen
@@ -37,18 +42,12 @@ def _file_md5(fn):
     with open(fn, 'rb') as f:
         return hashlib.md5(f.read()).hexdigest()
 
-import helper  # Set up remaining global configuration
-from helper import get_testcases, try_rm
 defs = get_testcases()
 
-with io.open(PARAMETERS_FILE, encoding='utf-8') as pf:
-    parameters = json.load(pf)
-
 
 class TestDownload(unittest.TestCase):
     maxDiff = None
     def setUp(self):
-        self.parameters = parameters
         self.defs = defs
 
 ### Dynamically generate tests
@@ -68,8 +67,7 @@ def generator(test_case):
             print_skipping(test_case['skip'])
             return
 
-        params = self.parameters.copy()
-        params.update(test_case.get('params', {}))
+        params = get_params(test_case.get('params', {}))
 
         ydl = YoutubeDL(params)
         ydl.add_default_info_extractors()
diff --git a/test/test_playlists.py b/test/test_playlists.py
index de8bd298a..d6a8d56df 100644
--- a/test/test_playlists.py
+++ b/test/test_playlists.py
@@ -1,13 +1,16 @@
 #!/usr/bin/env python
 # encoding: utf-8
 
-import sys
-import unittest
-import json
 
 # Allow direct execution
 import os
-sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+import sys
+import unittest
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from test.helper import FakeYDL, global_setup
+global_setup()
+
 
 from youtube_dl.extractor import (
     DailymotionPlaylistIE,
@@ -18,9 +21,7 @@ from youtube_dl.extractor import (
     LivestreamIE,
     NHLVideocenterIE,
 )
-from youtube_dl.utils import *
-from helper import FakeYDL
 
 
 class TestPlaylists(unittest.TestCase):
     def assertIsPlaylist(self, info):
diff --git a/test/test_utils.py b/test/test_utils.py
index ff2e9885b..f3fbff042 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -1,14 +1,15 @@
 #!/usr/bin/env python
+# coding: utf-8
 
-# Various small unit tests
-
+# Allow direct execution
+import os
 import sys
 import unittest
-import xml.etree.ElementTree
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
-# Allow direct execution
-import os
-sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+# Various small unit tests
+import xml.etree.ElementTree
 
 #from youtube_dl.utils import htmlentity_transform
 from youtube_dl.utils import (
@@ -20,6 +21,9 @@ from youtube_dl.utils import (
     unified_strdate,
     find_xpath_attr,
     get_meta_content,
+    xpath_with_ns,
+    smuggle_url,
+    unsmuggle_url,
 )
 
 if sys.version_info < (3, 0):
@@ -141,5 +145,31 @@ class TestUtil(unittest.TestCase):
         self.assertEqual(get_meta('description'), u'foo & bar')
         self.assertEqual(get_meta('author'), 'Plato')
 
+    def test_xpath_with_ns(self):
+        testxml = u'''<root xmlns:media="http://example.com/">
+            <media:song>
+                <media:author>The Author</media:author>
+                <url>http://server.com/download.mp3</url>
+            </media:song>
+        </root>'''
+        doc = xml.etree.ElementTree.fromstring(testxml)
+        find = lambda p: doc.find(xpath_with_ns(p, {'media': 'http://example.com/'}))
+        self.assertTrue(find('media:song') is not None)
+        self.assertEqual(find('media:song/media:author').text, u'The Author')
+        self.assertEqual(find('media:song/url').text, u'http://server.com/download.mp3')
+
+    def test_smuggle_url(self):
+        data = {u"ö": u"ö", u"abc": [3]}
+        url = 'https://foo.bar/baz?x=y#a'
+        smug_url = smuggle_url(url, data)
+        unsmug_url, unsmug_data = unsmuggle_url(smug_url)
+        self.assertEqual(url, unsmug_url)
+        self.assertEqual(data, unsmug_data)
+
+        res_url, res_data = unsmuggle_url(url)
+        self.assertEqual(res_url, url)
+        self.assertEqual(res_data, None)
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/test/test_write_annotations.py b/test/test_write_annotations.py
new file mode 100644
index 000000000..35defb895
--- /dev/null
+++ b/test/test_write_annotations.py
@@ -0,0 +1,80 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+# Allow direct execution
+import os
+import sys
+import unittest
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from test.helper import get_params, global_setup, try_rm
+global_setup()
+
+
+import io
+
+import xml.etree.ElementTree
+
+import youtube_dl.YoutubeDL
+import youtube_dl.extractor
+
+
+class YoutubeDL(youtube_dl.YoutubeDL):
+    def __init__(self, *args, **kwargs):
+        super(YoutubeDL, self).__init__(*args, **kwargs)
+        self.to_stderr = self.to_screen
+
+params = get_params({
+    'writeannotations': True,
+    'skip_download': True,
+    'writeinfojson': False,
+    'format': 'flv',
+})
+
+
+TEST_ID = 'gr51aVj-mLg'
+ANNOTATIONS_FILE = TEST_ID + '.flv.annotations.xml'
+EXPECTED_ANNOTATIONS = ['Speech bubble', 'Note', 'Title', 'Spotlight', 'Label']
+
+class TestAnnotations(unittest.TestCase):
+    def setUp(self):
+        # Clear old files
+        self.tearDown()
+
+
+    def test_info_json(self):
+        expected = list(EXPECTED_ANNOTATIONS)  # Two annotations could have the same text.
+        ie = youtube_dl.extractor.YoutubeIE()
+        ydl = YoutubeDL(params)
+        ydl.add_info_extractor(ie)
+        ydl.download([TEST_ID])
+        self.assertTrue(os.path.exists(ANNOTATIONS_FILE))
+        annoxml = None
+        with io.open(ANNOTATIONS_FILE, 'r', encoding='utf-8') as annof:
+                annoxml = xml.etree.ElementTree.parse(annof)
+        self.assertTrue(annoxml is not None, 'Failed to parse annotations XML')
+        root = annoxml.getroot()
+        self.assertEqual(root.tag, 'document')
+        annotationsTag = root.find('annotations')
+        self.assertEqual(annotationsTag.tag, 'annotations')
+        annotations = annotationsTag.findall('annotation')
+
+        # Not all the annotations have TEXT children and the annotations are returned unsorted.
+        for a in annotations:
+                self.assertEqual(a.tag, 'annotation')
+                if a.get('type') == 'text':
+                        textTag = a.find('TEXT')
+                        text = textTag.text
+                        self.assertTrue(text in expected)  # assertIn only added in Python 2.7
+                        # Remove the first occurrence; there could be more than one annotation with the same text
+                        expected.remove(text)
+        # We should have seen (and removed) all the expected annotation texts.
+        self.assertEqual(len(expected), 0, 'Not all expected annotations were found.')
+
+
+    def tearDown(self):
+        try_rm(ANNOTATIONS_FILE)
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/test/test_write_info_json.py b/test/test_write_info_json.py
index de6d5180f..a5b6f6972 100644
--- a/test/test_write_info_json.py
+++ b/test/test_write_info_json.py
@@ -1,37 +1,34 @@
 #!/usr/bin/env python
 # coding: utf-8
 
-import json
+# Allow direct execution
 import os
 import sys
 import unittest
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
-# Allow direct execution
-sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+from test.helper import get_params, global_setup
+global_setup()
+
+
+import io
+import json
 
 import youtube_dl.YoutubeDL
 import youtube_dl.extractor
-from youtube_dl.utils import *
-
-PARAMETERS_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), "parameters.json")
-
-# General configuration (from __init__, not very elegant...)
-jar = compat_cookiejar.CookieJar()
-cookie_processor = compat_urllib_request.HTTPCookieProcessor(jar)
-proxy_handler = compat_urllib_request.ProxyHandler()
-opener = compat_urllib_request.build_opener(proxy_handler, cookie_processor, YoutubeDLHandler())
-compat_urllib_request.install_opener(opener)
 
 
 class YoutubeDL(youtube_dl.YoutubeDL):
     def __init__(self, *args, **kwargs):
         super(YoutubeDL, self).__init__(*args, **kwargs)
         self.to_stderr = self.to_screen
 
-with io.open(PARAMETERS_FILE, encoding='utf-8') as pf:
-    params = json.load(pf)
-params['writeinfojson'] = True
-params['skip_download'] = True
-params['writedescription'] = True
+params = get_params({
+    'writeinfojson': True,
+    'skip_download': True,
+    'writedescription': True,
+})
+
 
 TEST_ID = 'BaW_jenozKc'
 INFO_JSON_FILE = TEST_ID + '.mp4.info.json'
@@ -42,6 +39,7 @@ This is a test video for youtube-dl.
For more information, contact phihag@phihag.de .''' +  class TestInfoJSON(unittest.TestCase):      def setUp(self):          # Clear old files diff --git a/test/test_youtube_lists.py b/test/test_youtube_lists.py index 53e65816d..4b7a7847b 100644 --- a/test/test_youtube_lists.py +++ b/test/test_youtube_lists.py @@ -1,20 +1,26 @@  #!/usr/bin/env python +# Allow direct execution +import os  import sys  import unittest -import json +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from test.helper import FakeYDL, global_setup +global_setup() -# Allow direct execution -import os -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from youtube_dl.extractor import YoutubeUserIE, YoutubePlaylistIE, YoutubeIE, YoutubeChannelIE, YoutubeShowIE -from youtube_dl.utils import * +from youtube_dl.extractor import ( +    YoutubeUserIE, +    YoutubePlaylistIE, +    YoutubeIE, +    YoutubeChannelIE, +    YoutubeShowIE, +) -from helper import FakeYDL  class TestYoutubeLists(unittest.TestCase): -    def assertIsPlaylist(self,info): +    def assertIsPlaylist(self, info):          """Make sure the info has '_type' set to 'playlist'"""          self.assertEqual(info['_type'], 'playlist') @@ -100,7 +106,7 @@ class TestYoutubeLists(unittest.TestCase):          dl = FakeYDL()          ie = YoutubeShowIE(dl)          result = ie.extract('http://www.youtube.com/show/airdisasters') -        self.assertTrue(len(result) >= 4) +        self.assertTrue(len(result) >= 3)  if __name__ == '__main__':      unittest.main() diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index 5007d9a16..5e1ff5eb0 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -1,14 +1,18 @@  #!/usr/bin/env python -import io -import re -import string +# Allow direct execution +import os  import sys  import unittest +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -# Allow direct execution -import os -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from test.helper import global_setup +global_setup() + + +import io +import re +import string  from youtube_dl.extractor import YoutubeIE  from youtube_dl.utils import compat_str, compat_urlretrieve diff --git a/test/test_youtube_subtitles.py b/test/test_youtube_subtitles.py index f9b0c1ad0..00430a338 100644 --- a/test/test_youtube_subtitles.py +++ b/test/test_youtube_subtitles.py @@ -1,69 +1,79 @@  #!/usr/bin/env python +# Allow direct execution +import os  import sys  import unittest -import hashlib +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from test.helper import FakeYDL, global_setup, md5 +global_setup() -# Allow direct execution -import os -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))  from youtube_dl.extractor import YoutubeIE -from youtube_dl.utils import * -from helper import FakeYDL -md5 = lambda s: hashlib.md5(s.encode('utf-8')).hexdigest()  class TestYoutubeSubtitles(unittest.TestCase):      def setUp(self):          self.DL = FakeYDL()          self.url = 'QRS8MkLhQmM' +      def getInfoDict(self):          IE = YoutubeIE(self.DL)          info_dict = IE.extract(self.url)          return info_dict +      def getSubtitles(self):          info_dict = self.getInfoDict() -        return info_dict[0]['subtitles']         +        return info_dict[0]['subtitles'] +      def test_youtube_no_writesubtitles(self):          self.DL.params['writesubtitles'] = 
False          subtitles = self.getSubtitles()          self.assertEqual(subtitles, None) +      def test_youtube_subtitles(self):          self.DL.params['writesubtitles'] = True          subtitles = self.getSubtitles()          self.assertEqual(md5(subtitles['en']), '4cd9278a35ba2305f47354ee13472260') +      def test_youtube_subtitles_lang(self):          self.DL.params['writesubtitles'] = True          self.DL.params['subtitleslangs'] = ['it']          subtitles = self.getSubtitles()          self.assertEqual(md5(subtitles['it']), '164a51f16f260476a05b50fe4c2f161d') +      def test_youtube_allsubtitles(self):          self.DL.params['writesubtitles'] = True          self.DL.params['allsubtitles'] = True          subtitles = self.getSubtitles()          self.assertEqual(len(subtitles.keys()), 13) +      def test_youtube_subtitles_sbv_format(self):          self.DL.params['writesubtitles'] = True          self.DL.params['subtitlesformat'] = 'sbv'          subtitles = self.getSubtitles()          self.assertEqual(md5(subtitles['en']), '13aeaa0c245a8bed9a451cb643e3ad8b') +      def test_youtube_subtitles_vtt_format(self):          self.DL.params['writesubtitles'] = True          self.DL.params['subtitlesformat'] = 'vtt'          subtitles = self.getSubtitles()          self.assertEqual(md5(subtitles['en']), '356cdc577fde0c6783b9b822e7206ff7') +      def test_youtube_list_subtitles(self):          self.DL.expect_warning(u'Video doesn\'t have automatic captions')          self.DL.params['listsubtitles'] = True          info_dict = self.getInfoDict()          self.assertEqual(info_dict, None) +      def test_youtube_automatic_captions(self):          self.url = '8YoUxe5ncPo'          self.DL.params['writeautomaticsub'] = True          self.DL.params['subtitleslangs'] = ['it']          subtitles = self.getSubtitles()          self.assertTrue(subtitles['it'] is not None) +      def test_youtube_nosubtitles(self):          self.DL.expect_warning(u'video doesn\'t have subtitles')          self.url = 'sAjKT8FhjI8' @@ -71,6 +81,7 @@ class TestYoutubeSubtitles(unittest.TestCase):          self.DL.params['allsubtitles'] = True          subtitles = self.getSubtitles()          self.assertEqual(len(subtitles), 0) +      def test_youtube_multiple_langs(self):          self.url = 'QRS8MkLhQmM'          self.DL.params['writesubtitles'] = True @@ -1,5 +1,8 @@  [tox]  envlist = py26,py27,py33  [testenv] -deps = nose -commands = nosetests --with-coverage --cover-package=youtube_dl --cover-html --verbose test +deps = +   nose +   coverage +commands = nosetests --verbose {posargs:test}  # --with-coverage --cover-package=youtube_dl --cover-html +                                               # test.test_download:TestDownload.test_NowVideo diff --git a/youtube_dl/PostProcessor.py b/youtube_dl/PostProcessor.py index fbf8a7f98..13b56ede5 100644 --- a/youtube_dl/PostProcessor.py +++ b/youtube_dl/PostProcessor.py @@ -2,9 +2,15 @@ import os  import subprocess  import sys  import time -import datetime -from .utils import * + +from .utils import ( +    compat_subprocess_get_DEVNULL, +    encodeFilename, +    PostProcessingError, +    shell_quote, +    subtitles_filename, +)  class PostProcessor(object): @@ -83,6 +89,8 @@ class FFmpegPostProcessor(PostProcessor):                 + opts +                 [encodeFilename(self._ffmpeg_filename_argument(out_path))]) +        if self._downloader.params.get('verbose', False): +            self._downloader.to_screen(u'[debug] ffmpeg command line: %s' % shell_quote(cmd))         
 p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)          stdout,stderr = p.communicate()          if p.returncode != 0: @@ -178,7 +186,8 @@ class FFmpegExtractAudioPP(FFmpegPostProcessor):              extension = self._preferredcodec              more_opts = []              if self._preferredquality is not None: -                if int(self._preferredquality) < 10: +                # The opus codec doesn't support the -aq option +                if int(self._preferredquality) < 10 and extension != 'opus':                      more_opts += [self._exes['avconv'] and '-q:a' or '-aq', self._preferredquality]                  else:                      more_opts += [self._exes['avconv'] and '-b:a' or '-ab', self._preferredquality + 'k'] diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index a32e50772..f22a8bd0e 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -71,6 +71,7 @@ class YoutubeDL(object):      logtostderr:       Log messages to stderr instead of stdout.      writedescription:  Write the video description to a .description file      writeinfojson:     Write the video description to a .info.json file +    writeannotations:  Write the video annotations to a .annotations.xml file      writethumbnail:    Write the thumbnail image to a file      writesubtitles:    Write the video subtitles to a file      writeautomaticsub: Write the automatic subtitles to a file @@ -258,6 +259,10 @@ class YoutubeDL(object):          """ Report that the metadata file has been written """          self.to_screen(u'[info] Video description metadata as JSON to: ' + infofn) +    def report_writeannotations(self, annofn): +        """ Report that the annotations file has been written. """ +        self.to_screen(u'[info] Writing video annotations to: ' + annofn) +      def report_file_already_downloaded(self, file_name):          """Report file has already been fully downloaded."""          try: @@ -599,6 +604,18 @@ class YoutubeDL(object):                  self.report_error(u'Cannot write description file ' + descfn)                  return +        if self.params.get('writeannotations', False): +            try: +               annofn = filename + u'.annotations.xml' +               self.report_writeannotations(annofn) +               with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile: +                   annofile.write(info_dict['annotations']) +            except (KeyError, TypeError): +                self.report_warning(u'There are no annotations to write.') +            except (OSError, IOError): +                 self.report_error(u'Cannot write annotations file: ' + annofn) +                 return +          subtitles_are_requested = any([self.params.get('writesubtitles', False),                                         self.params.get('writeautomaticsub')]) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index bc8e97250..cd642ce3b 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -31,6 +31,7 @@ __authors__  = (      'Huarong Huo',      'Ismael Mejía',      'Steffan \'Ruirize\' James', +    'Andras Elso',  )  __license__ = 'Public Domain' @@ -46,17 +47,43 @@ import shlex  import socket  import subprocess  import sys -import warnings +import traceback  import platform -from .utils import * +from .utils import ( +    compat_cookiejar, +    compat_print, +    compat_str, +    compat_urllib_request, +    DateRange, +    decodeOption, +    determine_ext, +    DownloadError, +    get_cachedir, +   
 make_HTTPS_handler, +    MaxDownloadsReached, +    platform_name, +    preferredencoding, +    SameFileError, +    std_headers, +    write_string, +    YoutubeDLHandler, +)  from .update import update_self  from .version import __version__ -from .FileDownloader import * +from .FileDownloader import ( +    FileDownloader, +)  from .extractor import gen_extractors  from .YoutubeDL import YoutubeDL -from .PostProcessor import * +from .PostProcessor import ( +    FFmpegMetadataPP, +    FFmpegVideoConvertor, +    FFmpegExtractAudioPP, +    FFmpegEmbedSubtitlePP, +) +  def parseOpts(overrideArguments=None):      def _readOptions(filename_bytes): @@ -240,11 +267,11 @@ def parseOpts(overrideArguments=None):              help='languages of the subtitles to download (optional) separated by commas, use IETF language tags like \'en,pt\'')      downloader.add_option('-r', '--rate-limit', -            dest='ratelimit', metavar='LIMIT', help='maximum download rate (e.g. 50k or 44.6m)') +            dest='ratelimit', metavar='LIMIT', help='maximum download rate in bytes per second (e.g. 50K or 4.2M)')      downloader.add_option('-R', '--retries',              dest='retries', metavar='RETRIES', help='number of retries (default is %default)', default=10)      downloader.add_option('--buffer-size', -            dest='buffersize', metavar='SIZE', help='size of download buffer (e.g. 1024 or 16k) (default is %default)', default="1024") +            dest='buffersize', metavar='SIZE', help='size of download buffer (e.g. 1024 or 16K) (default is %default)', default="1024")      downloader.add_option('--no-resize-buffer',              action='store_true', dest='noresizebuffer',              help='do not automatically adjust the buffer size. By default, the buffer size is automatically resized from an initial value of SIZE.', default=False) @@ -339,6 +366,9 @@ def parseOpts(overrideArguments=None):      filesystem.add_option('--write-info-json',              action='store_true', dest='writeinfojson',              help='write video metadata to a .info.json file', default=False) +    filesystem.add_option('--write-annotations', +            action='store_true', dest='writeannotations', +            help='write video annotations to a .annotation file', default=False)      filesystem.add_option('--write-thumbnail',              action='store_true', dest='writethumbnail',              help='write thumbnail image to disk', default=False) @@ -601,6 +631,7 @@ def _real_main(argv=None):          'nopart': opts.nopart,          'updatetime': opts.updatetime,          'writedescription': opts.writedescription, +        'writeannotations': opts.writeannotations,          'writeinfojson': opts.writeinfojson,          'writethumbnail': opts.writethumbnail,          'writesubtitles': opts.writesubtitles, @@ -684,7 +715,7 @@ def _real_main(argv=None):      if opts.cookiefile is not None:          try:              jar.save() -        except (IOError, OSError) as err: +        except (IOError, OSError):              sys.exit(u'ERROR: unable to save cookie jar')      sys.exit(retcode) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 688196869..db69af361 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -2,7 +2,12 @@ from .appletrailers import AppleTrailersIE  from .addanime import AddAnimeIE  from .archiveorg import ArchiveOrgIE  from .ard import ARDIE -from .arte import ArteTvIE +from .arte import ( +    ArteTvIE, +    ArteTVPlus7IE, +    ArteTVCreativeIE, 
+    ArteTVFutureIE, +)  from .auengine import AUEngineIE  from .bandcamp import BandcampIE  from .bliptv import BlipTVIE, BlipTVUserIE @@ -12,6 +17,7 @@ from .brightcove import BrightcoveIE  from .c56 import C56IE  from .canalplus import CanalplusIE  from .canalc2 import Canalc2IE +from .cinemassacre import CinemassacreIE  from .cnn import CNNIE  from .collegehumor import CollegeHumorIE  from .comedycentral import ComedyCentralIE @@ -61,6 +67,7 @@ from .ign import IGNIE, OneUPIE  from .ina import InaIE  from .infoq import InfoQIE  from .instagram import InstagramIE +from .internetvideoarchive import InternetVideoArchiveIE  from .jeuxvideo import JeuxVideoIE  from .jukebox import JukeboxIE  from .justintv import JustinTVIE @@ -82,6 +89,7 @@ from .nba import NBAIE  from .nbc import NBCNewsIE  from .newgrounds import NewgroundsIE  from .nhl import NHLIE, NHLVideocenterIE +from .nowvideo import NowVideoIE  from .ooyala import OoyalaIE  from .orf import ORFIE  from .pbs import PBSIE @@ -91,8 +99,10 @@ from .rbmaradio import RBMARadioIE  from .redtube import RedTubeIE  from .ringtv import RingTVIE  from .ro220 import Ro220IE +from .rottentomatoes import RottenTomatoesIE  from .roxwel import RoxwelIE  from .rtlnow import RTLnowIE +from .rutube import RutubeIE  from .sina import SinaIE  from .slashdot import SlashdotIE  from .slideshare import SlideshareIE @@ -103,7 +113,9 @@ from .spiegel import SpiegelIE  from .stanfordoc import StanfordOpenClassroomIE  from .statigram import StatigramIE  from .steam import SteamIE +from .sztvhu import SztvHuIE  from .teamcoco import TeamcocoIE +from .techtalks import TechTalksIE  from .ted import TEDIE  from .tf1 import TF1IE  from .thisav import ThisAVIE @@ -120,10 +132,13 @@ from .veoh import VeohIE  from .vevo import VevoIE  from .vice import ViceIE  from .viddler import ViddlerIE +from .videodetective import VideoDetectiveIE  from .videofyme import VideofyMeIE +from .videopremium import VideoPremiumIE  from .vimeo import VimeoIE, VimeoChannelIE  from .vine import VineIE  from .wat import WatIE +from .websurg import WeBSurgIE  from .weibo import WeiboIE  from .wimp import WimpIE  from .worldstarhiphop import WorldStarHipHopIE diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 4707d7cca..5ee8a67b1 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -1,3 +1,4 @@ +# encoding: utf-8  import re  import json  import xml.etree.ElementTree @@ -7,15 +8,15 @@ from ..utils import (      ExtractorError,      find_xpath_attr,      unified_strdate, +    determine_ext, +    get_element_by_id,  ) +# There are different sources of video in arte.tv, the extraction process  +# is different for each one. The videos usually expire in 7 days, so we can't +# add tests. +  class ArteTvIE(InfoExtractor): -    """ -    There are two sources of video in arte.tv: videos.arte.tv and -    www.arte.tv/guide, the extraction process is different for each one. -    The videos expire in 7 days, so we can't add tests. -    """ -    _EMISSION_URL = r'(?:http://)?www\.arte.tv/guide/(?P<lang>fr|de)/(?:(?:sendungen|emissions)/)?(?P<id>.*?)/(?P<name>.*?)(\?.*)?'      
_VIDEOS_URL = r'(?:http://)?videos.arte.tv/(?P<lang>fr|de)/.*-(?P<id>.*?).html'      _LIVEWEB_URL = r'(?:http://)?liveweb.arte.tv/(?P<lang>fr|de)/(?P<subpage>.+?)/(?P<name>.+)'      _LIVE_URL = r'index-[0-9]+\.html$' @@ -24,7 +25,7 @@ class ArteTvIE(InfoExtractor):      @classmethod      def suitable(cls, url): -        return any(re.match(regex, url) for regex in (cls._EMISSION_URL, cls._VIDEOS_URL, cls._LIVEWEB_URL)) +        return any(re.match(regex, url) for regex in (cls._VIDEOS_URL, cls._LIVEWEB_URL))      # TODO implement Live Stream      # from ..utils import compat_urllib_parse @@ -55,14 +56,6 @@ class ArteTvIE(InfoExtractor):      #     video_url = u'%s/%s' % (info.get('url'), info.get('path'))      def _real_extract(self, url): -        mobj = re.match(self._EMISSION_URL, url) -        if mobj is not None: -            lang = mobj.group('lang') -            # This is not a real id, it can be for example AJT for the news -            # http://www.arte.tv/guide/fr/emissions/AJT/arte-journal -            video_id = mobj.group('id') -            return self._extract_emission(url, video_id, lang) -          mobj = re.match(self._VIDEOS_URL, url)          if mobj is not None:              id = mobj.group('id') @@ -80,59 +73,6 @@ class ArteTvIE(InfoExtractor):              # self.extractLiveStream(url)              # return -    def _extract_emission(self, url, video_id, lang): -        """Extract from www.arte.tv/guide""" -        webpage = self._download_webpage(url, video_id) -        json_url = self._html_search_regex(r'arte_vp_url="(.*?)"', webpage, 'json url') - -        json_info = self._download_webpage(json_url, video_id, 'Downloading info json') -        self.report_extraction(video_id) -        info = json.loads(json_info) -        player_info = info['videoJsonPlayer'] - -        info_dict = {'id': player_info['VID'], -                     'title': player_info['VTI'], -                     'description': player_info.get('VDE'), -                     'upload_date': unified_strdate(player_info['VDA'].split(' ')[0]), -                     'thumbnail': player_info['programImage'], -                     'ext': 'flv', -                     } - -        formats = player_info['VSR'].values() -        def _match_lang(f): -            # Return true if that format is in the language of the url -            if lang == 'fr': -                l = 'F' -            elif lang == 'de': -                l = 'A' -            regexes = [r'VO?%s' % l, r'VO?.-ST%s' % l] -            return any(re.match(r, f['versionCode']) for r in regexes) -        # Some formats may not be in the same language as the url -        formats = filter(_match_lang, formats) -        # Some formats use the m3u8 protocol -        formats = filter(lambda f: f['videoFormat'] != 'M3U8', formats) -        # We order the formats by quality -        formats = sorted(formats, key=lambda f: int(f['height'])) -        # Prefer videos without subtitles in the same language -        formats = sorted(formats, key=lambda f: re.match(r'VO(F|A)-STM\1', f['versionCode']) is None) -        # Pick the best quality -        def _format(format_info): -            info = {'ext': 'flv', -                    'width': format_info.get('width'), -                    'height': format_info.get('height'), -                    } -            if format_info['mediaType'] == u'rtmp': -                info['url'] = format_info['streamer'] -                info['play_path'] = 'mp4:' + format_info['url'] -            else: -                
info_dict['url'] = format_info['url'] -            return info -        info_dict['formats'] = [_format(f) for f in formats] -        # TODO: Remove when #980 has been merged  -        info_dict.update(info_dict['formats'][-1]) - -        return info_dict -      def _extract_video(self, url, video_id, lang):          """Extract from videos.arte.tv"""          ref_xml_url = url.replace('/videos/', '/do_delegate/videos/') @@ -182,3 +122,110 @@ class ArteTvIE(InfoExtractor):                  'ext': 'flv',                  'thumbnail': self._og_search_thumbnail(webpage),                  } + + +class ArteTVPlus7IE(InfoExtractor): +    IE_NAME = u'arte.tv:+7' +    _VALID_URL = r'https?://www\.arte.tv/guide/(?P<lang>fr|de)/(?:(?:sendungen|emissions)/)?(?P<id>.*?)/(?P<name>.*?)(\?.*)?' + +    @classmethod +    def _extract_url_info(cls, url): +        mobj = re.match(cls._VALID_URL, url) +        lang = mobj.group('lang') +        # This is not a real id, it can be for example AJT for the news +        # http://www.arte.tv/guide/fr/emissions/AJT/arte-journal +        video_id = mobj.group('id') +        return video_id, lang + +    def _real_extract(self, url): +        video_id, lang = self._extract_url_info(url) +        webpage = self._download_webpage(url, video_id) +        return self._extract_from_webpage(webpage, video_id, lang) + +    def _extract_from_webpage(self, webpage, video_id, lang): +        json_url = self._html_search_regex(r'arte_vp_url="(.*?)"', webpage, 'json url') + +        json_info = self._download_webpage(json_url, video_id, 'Downloading info json') +        self.report_extraction(video_id) +        info = json.loads(json_info) +        player_info = info['videoJsonPlayer'] + +        info_dict = { +            'id': player_info['VID'], +            'title': player_info['VTI'], +            'description': player_info.get('VDE'), +            'upload_date': unified_strdate(player_info.get('VDA', '').split(' ')[0]), +            'thumbnail': player_info.get('programImage') or player_info.get('VTU', {}).get('IUR'), +        } + +        formats = player_info['VSR'].values() +        def _match_lang(f): +            if f.get('versionCode') is None: +                return True +            # Return true if that format is in the language of the url +            if lang == 'fr': +                l = 'F' +            elif lang == 'de': +                l = 'A' +            regexes = [r'VO?%s' % l, r'VO?.-ST%s' % l] +            return any(re.match(r, f['versionCode']) for r in regexes) +        # Some formats may not be in the same language as the url +        formats = filter(_match_lang, formats) +        # Some formats use the m3u8 protocol +        formats = filter(lambda f: f.get('videoFormat') != 'M3U8', formats) +        # We order the formats by quality +        formats = sorted(formats, key=lambda f: int(f.get('height',-1))) +        # Prefer videos without subtitles in the same language +        formats = sorted(formats, key=lambda f: re.match(r'VO(F|A)-STM\1', f.get('versionCode', '')) is None) +        # Pick the best quality +        def _format(format_info): +            info = { +                'width': format_info.get('width'), +                'height': format_info.get('height'), +            } +            if format_info['mediaType'] == u'rtmp': +                info['url'] = format_info['streamer'] +                info['play_path'] = 'mp4:' + format_info['url'] +                info['ext'] = 'flv' +            else: +                info['url'] = 
format_info['url'] +                info['ext'] = determine_ext(info['url']) +            return info +        info_dict['formats'] = [_format(f) for f in formats] +        # TODO: Remove when #980 has been merged  +        info_dict.update(info_dict['formats'][-1]) + +        return info_dict + + +# It also uses the arte_vp_url url from the webpage to extract the information +class ArteTVCreativeIE(ArteTVPlus7IE): +    IE_NAME = u'arte.tv:creative' +    _VALID_URL = r'https?://creative\.arte\.tv/(?P<lang>fr|de)/magazine?/(?P<id>.+)' + +    _TEST = { +        u'url': u'http://creative.arte.tv/de/magazin/agentur-amateur-corporate-design', +        u'file': u'050489-002.mp4', +        u'info_dict': { +            u'title': u'Agentur Amateur #2 - Corporate Design', +        }, +    } + + +class ArteTVFutureIE(ArteTVPlus7IE): +    IE_NAME = u'arte.tv:future' +    _VALID_URL = r'https?://future\.arte\.tv/(?P<lang>fr|de)/(thema|sujet)/.*?#article-anchor-(?P<id>\d+)' + +    _TEST = { +        u'url': u'http://future.arte.tv/fr/sujet/info-sciences#article-anchor-7081', +        u'file': u'050940-003.mp4', +        u'info_dict': { +            u'title': u'Les champignons au secours de la planète', +        }, +    } + +    def _real_extract(self, url): +        anchor_id, lang = self._extract_url_info(url) +        webpage = self._download_webpage(url, anchor_id) +        row = get_element_by_id(anchor_id, webpage) +        return self._extract_from_webpage(row, anchor_id, lang) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 745212f2f..1392f382a 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -53,6 +53,8 @@ class BrightcoveIE(InfoExtractor):          # Fix up some stupid HTML, see https://github.com/rg3/youtube-dl/issues/1553          object_str = re.sub(r'(<param name="[^"]+" value="[^"]+")>',                              lambda m: m.group(1) + '/>', object_str) +        # Fix up some stupid XML, see https://github.com/rg3/youtube-dl/issues/1608 +        object_str = object_str.replace(u'<--', u'<!--')          object_doc = xml.etree.ElementTree.fromstring(object_str)          assert u'BrightcoveExperience' in object_doc.attrib['class'] @@ -96,7 +98,10 @@ class BrightcoveIE(InfoExtractor):          playlist_info = self._download_webpage(self._PLAYLIST_URL_TEMPLATE % player_key,                                                 player_key, u'Downloading playlist information') -        playlist_info = json.loads(playlist_info)['videoList'] +        json_data = json.loads(playlist_info) +        if 'videoList' not in json_data: +            raise ExtractorError(u'Empty playlist') +        playlist_info = json_data['videoList']          videos = [self._extract_video_info(video_info) for video_info in playlist_info['mediaCollectionDTO']['videoDTOs']]          return self.playlist_result(videos, playlist_id=playlist_info['id'], diff --git a/youtube_dl/extractor/cinemassacre.py b/youtube_dl/extractor/cinemassacre.py new file mode 100644 index 000000000..6925b96c2 --- /dev/null +++ b/youtube_dl/extractor/cinemassacre.py @@ -0,0 +1,91 @@ +# encoding: utf-8 +import re + +from .common import InfoExtractor +from ..utils import ( +    ExtractorError, +) + + +class CinemassacreIE(InfoExtractor): +    _VALID_URL = r'(?:http://)?(?:www\.)?(?P<url>cinemassacre\.com/(?P<date_Y>[0-9]{4})/(?P<date_m>[0-9]{2})/(?P<date_d>[0-9]{2})/.+?)(?:[/?].*)?' 
+    _TESTS = [{ +        u'url': u'http://cinemassacre.com/2012/11/10/avgn-the-movie-trailer/', +        u'file': u'19911.flv', +        u'info_dict': { +            u'upload_date': u'20121110', +            u'title': u'“Angry Video Game Nerd: The Movie” – Trailer', +            u'description': u'md5:fb87405fcb42a331742a0dce2708560b', +        }, +        u'params': { +            # rtmp download +            u'skip_download': True, +        }, +    }, +    { +        u'url': u'http://cinemassacre.com/2013/10/02/the-mummys-hand-1940', +        u'file': u'521be8ef82b16.flv', +        u'info_dict': { +            u'upload_date': u'20131002', +            u'title': u'The Mummy’s Hand (1940)', +        }, +        u'params': { +            # rtmp download +            u'skip_download': True, +        }, +    }] + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) + +        webpage_url = u'http://' + mobj.group('url') +        webpage = self._download_webpage(webpage_url, None) # Don't know video id yet +        video_date = mobj.group('date_Y') + mobj.group('date_m') + mobj.group('date_d') +        mobj = re.search(r'src="(?P<embed_url>http://player\.screenwavemedia\.com/play/(?:embed|player)\.php\?id=(?:Cinemassacre-)?(?P<video_id>.+?))"', webpage) +        if not mobj: +            raise ExtractorError(u'Can\'t extract embed url and video id') +        playerdata_url = mobj.group(u'embed_url') +        video_id = mobj.group(u'video_id') + +        video_title = self._html_search_regex(r'<title>(?P<title>.+?)\|', +            webpage, u'title') +        video_description = self._html_search_regex(r'<div class="entry-content">(?P<description>.+?)</div>', +            webpage, u'description', flags=re.DOTALL, fatal=False) +        if len(video_description) == 0: +            video_description = None + +        playerdata = self._download_webpage(playerdata_url, video_id) +        base_url = self._html_search_regex(r'\'streamer\': \'(?P<base_url>rtmp://.*?)/(?:vod|Cinemassacre)\'', +            playerdata, u'base_url') +        base_url += '/Cinemassacre/' +        # Important: The file names in playerdata are not used by the player and even wrong for some videos +        sd_file = 'Cinemassacre-%s_high.mp4' % video_id +        hd_file = 'Cinemassacre-%s.mp4' % video_id +        video_thumbnail = 'http://image.screenwavemedia.com/Cinemassacre/Cinemassacre-%s_thumb_640x360.jpg' % video_id + +        formats = [ +            { +                'url': base_url + sd_file, +                'ext': 'flv', +                'format': 'sd', +                'format_id': 'sd', +            }, +            { +                'url': base_url + hd_file, +                'ext': 'flv', +                'format': 'hd', +                'format_id': 'hd', +            }, +        ] + +        info = { +            'id': video_id, +            'title': video_title, +            'formats': formats, +            'description': video_description, +            'upload_date': video_date, +            'thumbnail': video_thumbnail, +        } +        # TODO: Remove when #980 has been merged +        info.update(formats[-1]) +        return info diff --git a/youtube_dl/extractor/gamespot.py b/youtube_dl/extractor/gamespot.py index 5edbf678a..098768361 100644 --- a/youtube_dl/extractor/gamespot.py +++ b/youtube_dl/extractor/gamespot.py @@ -1,56 +1,59 @@  import re -import xml.etree.ElementTree +import json  from .common import InfoExtractor  from ..utils import ( -    unified_strdate,      
compat_urllib_parse, +    compat_urlparse, +    unescapeHTML, +    get_meta_content,  ) +  class GameSpotIE(InfoExtractor): -    _WORKING = False      _VALID_URL = r'(?:http://)?(?:www\.)?gamespot\.com/.*-(?P<page_id>\d+)/?'      _TEST = {          u"url": u"http://www.gamespot.com/arma-iii/videos/arma-iii-community-guide-sitrep-i-6410818/", -        u"file": u"6410818.mp4", +        u"file": u"gs-2300-6410818.mp4",          u"md5": u"b2a30deaa8654fcccd43713a6b6a4825",          u"info_dict": {              u"title": u"Arma 3 - Community Guide: SITREP I", -            u"upload_date": u"20130627",  +            u'description': u'Check out this video where some of the basics of Arma 3 is explained.',          }      } -      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url) -        page_id = mobj.group('page_id') +        page_id = video_id = mobj.group('page_id')          webpage = self._download_webpage(url, page_id) -        video_id = self._html_search_regex([r'"og:video" content=".*?\?id=(\d+)"', -                                            r'http://www\.gamespot\.com/videoembed/(\d+)'], -                                           webpage, 'video id') -        data = compat_urllib_parse.urlencode({'id': video_id, 'newplayer': '1'}) -        info_url = 'http://www.gamespot.com/pages/video_player/xml.php?' + data -        info_xml = self._download_webpage(info_url, video_id) -        doc = xml.etree.ElementTree.fromstring(info_xml) -        clip_el = doc.find('./playList/clip') +        data_video_json = self._search_regex(r'data-video=\'(.*?)\'', webpage, u'data video') +        data_video = json.loads(unescapeHTML(data_video_json)) -        http_urls = [{'url': node.find('filePath').text, -                      'rate': int(node.find('rate').text)} -            for node in clip_el.find('./httpURI')] -        best_quality = sorted(http_urls, key=lambda f: f['rate'])[-1] -        video_url = best_quality['url'] -        title = clip_el.find('./title').text -        ext = video_url.rpartition('.')[2] -        thumbnail_url = clip_el.find('./screenGrabURI').text -        view_count = int(clip_el.find('./views').text) -        upload_date = unified_strdate(clip_el.find('./postDate').text) +        # Transform the manifest url to a link to the mp4 files +        # they are used in mobile devices. 
+        f4m_url = data_video['videoStreams']['f4m_stream'] +        f4m_path = compat_urlparse.urlparse(f4m_url).path +        QUALITIES_RE = r'((,\d+)+,?)' +        qualities = self._search_regex(QUALITIES_RE, f4m_path, u'qualities').strip(',').split(',') +        http_path = f4m_path[1:].split('/', 1)[1] +        http_template = re.sub(QUALITIES_RE, r'%s', http_path) +        http_template = http_template.replace('.csmil/manifest.f4m', '') +        http_template = compat_urlparse.urljoin('http://video.gamespotcdn.com/', http_template) +        formats = [] +        for q in qualities: +            formats.append({ +                'url': http_template % q, +                'ext': 'mp4', +                'format_id': q, +            }) -        return [{ -            'id'          : video_id, -            'url'         : video_url, -            'ext'         : ext, -            'title'       : title, -            'thumbnail'   : thumbnail_url, -            'upload_date' : upload_date, -            'view_count'  : view_count, -        }] +        info = { +            'id': data_video['guid'], +            'title': compat_urllib_parse.unquote(data_video['title']), +            'formats': formats, +            'description': get_meta_content('description', webpage), +            'thumbnail': self._og_search_thumbnail(webpage), +        } +        # TODO: Remove when #980 has been merged +        info.update(formats[-1]) +        return info diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 7060c6f92..89805250c 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -11,6 +11,8 @@ from ..utils import (      compat_urlparse,      ExtractorError, +    smuggle_url, +    unescapeHTML,  )  from .brightcove import BrightcoveIE @@ -29,6 +31,17 @@ class GenericIE(InfoExtractor):                  u"title": u"R\u00e9gis plante sa Jeep"              }          }, +        # embedded vimeo video +        { +            u'url': u'http://skillsmatter.com/podcast/home/move-semanticsperfect-forwarding-and-rvalue-references', +            u'file': u'22444065.mp4', +            u'md5': u'2903896e23df39722c33f015af0666e2', +            u'info_dict': { +                u'title': u'ACCU 2011: Move Semantics,Perfect Forwarding, and Rvalue references- Scott Meyers- 13/04/2011', +                u"uploader_id": u"skillsmatter", +                u"uploader": u"Skills Matter", +            } +        }      ]      def report_download_webpage(self, video_id): @@ -121,12 +134,20 @@ class GenericIE(InfoExtractor):          self.report_extraction(video_id)          # Look for BrightCove: -        m_brightcove = re.search(r'<object.+?class=([\'"]).*?BrightcoveExperience.*?\1.+?</object>', webpage, re.DOTALL) +        m_brightcove = re.search(r'<object[^>]+?class=([\'"])[^>]*?BrightcoveExperience.*?\1.+?</object>', webpage, re.DOTALL)          if m_brightcove is not None:              self.to_screen(u'Brightcove video detected.')              bc_url = BrightcoveIE._build_brighcove_url(m_brightcove.group())              return self.url_result(bc_url, 'Brightcove') +        # Look for embedded Vimeo player +        mobj = re.search( +            r'<iframe\s+src="(https?://player.vimeo.com/video/.*?)"', webpage) +        if mobj: +            player_url = unescapeHTML(mobj.group(1)) +            surl = smuggle_url(player_url, {'Referer': url}) +            return self.url_result(surl, 'Vimeo') +          # Start with something easy: JW Player in SWFObject       
   mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)          if mobj is None: diff --git a/youtube_dl/extractor/internetvideoarchive.py b/youtube_dl/extractor/internetvideoarchive.py new file mode 100644 index 000000000..5986459d6 --- /dev/null +++ b/youtube_dl/extractor/internetvideoarchive.py @@ -0,0 +1,87 @@ +import re +import xml.etree.ElementTree + +from .common import InfoExtractor +from ..utils import ( +    compat_urlparse, +    compat_urllib_parse, +    xpath_with_ns, +    determine_ext, +) + + +class InternetVideoArchiveIE(InfoExtractor): +    _VALID_URL = r'https?://video\.internetvideoarchive\.net/flash/players/.*?\?.*?publishedid.*?' + +    _TEST = { +        u'url': u'http://video.internetvideoarchive.net/flash/players/flashconfiguration.aspx?customerid=69249&publishedid=452693&playerid=247', +        u'file': u'452693.mp4', +        u'info_dict': { +            u'title': u'SKYFALL', +            u'description': u'In SKYFALL, Bond\'s loyalty to M is tested as her past comes back to haunt her. As MI6 comes under attack, 007 must track down and destroy the threat, no matter how personal the cost.', +            u'duration': 156, +        }, +    } + +    @staticmethod +    def _build_url(query): +        return 'http://video.internetvideoarchive.net/flash/players/flashconfiguration.aspx?' + query + +    @staticmethod +    def _clean_query(query): +        NEEDED_ARGS = ['publishedid', 'customerid'] +        query_dic = compat_urlparse.parse_qs(query) +        cleaned_dic = dict((k,v[0]) for (k,v) in query_dic.items() if k in NEEDED_ARGS) +        # Other player ids return m3u8 urls +        cleaned_dic['playerid'] = '247' +        cleaned_dic['videokbrate'] = '100000' +        return compat_urllib_parse.urlencode(cleaned_dic) + +    def _real_extract(self, url): +        query = compat_urlparse.urlparse(url).query +        query_dic = compat_urlparse.parse_qs(query) +        video_id = query_dic['publishedid'][0] +        url = self._build_url(query) + +        flashconfiguration_xml = self._download_webpage(url, video_id, +            u'Downloading flash configuration') +        flashconfiguration = xml.etree.ElementTree.fromstring(flashconfiguration_xml.encode('utf-8')) +        file_url = flashconfiguration.find('file').text +        file_url = file_url.replace('/playlist.aspx', '/mrssplaylist.aspx') +        # Replace some of the parameters in the query to get the best quality +        # and http links (no m3u8 manifests) +        file_url = re.sub(r'(?<=\?)(.+)$', +            lambda m: self._clean_query(m.group()), +            file_url) +        info_xml = self._download_webpage(file_url, video_id, +            u'Downloading video info') +        info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8')) +        item = info.find('channel/item') + +        def _bp(p): +            return xpath_with_ns(p, +                {'media': 'http://search.yahoo.com/mrss/', +                'jwplayer': 'http://developer.longtailvideo.com/trac/wiki/FlashFormats'}) +        formats = [] +        for content in item.findall(_bp('media:group/media:content')): +            attr = content.attrib +            f_url = attr['url'] +            formats.append({ +                'url': f_url, +                'ext': determine_ext(f_url), +                'width': int(attr['width']), +                'bitrate': int(attr['bitrate']), +            }) +        formats = sorted(formats, key=lambda f: f['bitrate']) + +        info = { +            'id': video_id, +   
diff --git a/youtube_dl/extractor/nowvideo.py b/youtube_dl/extractor/nowvideo.py
new file mode 100644
index 000000000..ab52ad401
--- /dev/null
+++ b/youtube_dl/extractor/nowvideo.py
@@ -0,0 +1,43 @@
+import re
+
+from .common import InfoExtractor
+from ..utils import compat_urlparse
+
+
+class NowVideoIE(InfoExtractor):
+    _VALID_URL = r'(?:https?://)?(?:www\.)?nowvideo\.ch/video/(?P<id>\w+)'
+    _TEST = {
+        u'url': u'http://www.nowvideo.ch/video/0mw0yow7b6dxa',
+        u'file': u'0mw0yow7b6dxa.flv',
+        u'md5': u'f8fbbc8add72bd95b7850c6a02fc8817',
+        u'info_dict': {
+            u"title": u"youtubedl test video _BaW_jenozKc.mp4"
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+
+        video_id = mobj.group('id')
+        webpage_url = 'http://www.nowvideo.ch/video/' + video_id
+        webpage = self._download_webpage(webpage_url, video_id)
+
+        self.report_extraction(video_id)
+
+        video_title = self._html_search_regex(r'<h4>(.*)</h4>',
+            webpage, u'video title')
+
+        video_key = self._search_regex(r'var fkzd="(.*)";',
+            webpage, u'video key')
+
+        api_call = "http://www.nowvideo.ch/api/player.api.php?file={0}&numOfErrors=0&cid=1&key={1}".format(video_id, video_key)
+        api_response = self._download_webpage(api_call, video_id,
+            u'Downloading API page')
+        video_url = compat_urlparse.parse_qs(api_response)[u'url'][0]
+
+        return [{
+            'id':        video_id,
+            'url':       video_url,
+            'ext':       'flv',
+            'title':     video_title,
+        }]
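
The NowVideo extractor above resolves the final file location through player.api.php; the response is a url-encoded key/value string rather than JSON, so compat_urlparse.parse_qs is enough to pull out the url field. A small sketch with an invented sample response (only the url field is relied on by the extractor):

    from urllib.parse import parse_qs  # compat_urlparse.parse_qs in the extractor

    api_response = 'url=http%3A%2F%2Ff11.nowvideo.ch%2Fdl%2Fvideo.flv&title=test&error=0'  # invented sample
    video_url = parse_qs(api_response)['url'][0]
    print(video_url)  # http://f11.nowvideo.ch/dl/video.flv
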
diff --git a/youtube_dl/extractor/rottentomatoes.py b/youtube_dl/extractor/rottentomatoes.py
new file mode 100644
index 000000000..c79c39413
--- /dev/null
+++ b/youtube_dl/extractor/rottentomatoes.py
@@ -0,0 +1,16 @@
+from .videodetective import VideoDetectiveIE
+
+
+# It just uses the same method as videodetective.com,
+# the internetvideoarchive.com is extracted from the og:video property
+class RottenTomatoesIE(VideoDetectiveIE):
+    _VALID_URL = r'https?://www\.rottentomatoes\.com/m/[^/]+/trailers/(?P<id>\d+)'
+
+    _TEST = {
+        u'url': u'http://www.rottentomatoes.com/m/toy_story_3/trailers/11028566/',
+        u'file': '613340.mp4',
+        u'info_dict': {
+            u'title': u'TOY STORY 3',
+            u'description': u'From the creators of the beloved TOY STORY films, comes a story that will reunite the gang in a whole new way.',
+        },
+    }
diff --git a/youtube_dl/extractor/rutube.py b/youtube_dl/extractor/rutube.py
new file mode 100644
index 000000000..a18034fe2
--- /dev/null
+++ b/youtube_dl/extractor/rutube.py
@@ -0,0 +1,58 @@
+# encoding: utf-8
+import re
+import json
+
+from .common import InfoExtractor
+from ..utils import (
+    compat_urlparse,
+    compat_str,
+    ExtractorError,
+)
+
+
+class RutubeIE(InfoExtractor):
+    _VALID_URL = r'https?://rutube.ru/video/(?P<long_id>\w+)'
+
+    _TEST = {
+        u'url': u'http://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/',
+        u'file': u'3eac3b4561676c17df9132a9a1e62e3e.mp4',
+        u'info_dict': {
+            u'title': u'Раненный кенгуру забежал в аптеку',
+            u'uploader': u'NTDRussian',
+            u'uploader_id': u'29790',
+        },
+        u'params': {
+            # It requires ffmpeg (m3u8 download)
+            u'skip_download': True,
+        },
+    }
+
+    def _get_api_response(self, short_id, subpath):
+        api_url = 'http://rutube.ru/api/play/%s/%s/?format=json' % (subpath, short_id)
+        response_json = self._download_webpage(api_url, short_id,
+            u'Downloading %s json' % subpath)
+        return json.loads(response_json)
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        long_id = mobj.group('long_id')
+        webpage = self._download_webpage(url, long_id)
+        og_video = self._og_search_video_url(webpage)
+        short_id = compat_urlparse.urlparse(og_video).path[1:]
+        options = self._get_api_response(short_id, 'options')
+        trackinfo = self._get_api_response(short_id, 'trackinfo')
+        # Some videos don't have the author field
+        author = trackinfo.get('author') or {}
+        m3u8_url = trackinfo['video_balancer'].get('m3u8')
+        if m3u8_url is None:
+            raise ExtractorError(u'Couldn\'t find m3u8 manifest url')
+
+        return {
+            'id': trackinfo['id'],
+            'title': trackinfo['title'],
+            'url': m3u8_url,
+            'ext': 'mp4',
+            'thumbnail': options['thumbnail_url'],
+            'uploader': author.get('name'),
+            'uploader_id': compat_str(author['id']) if author else None,
+        }
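
Rutube's trackinfo response does not always carry an author object, which is why the code above substitutes an empty dict before reading the uploader fields. A tiny illustration with made-up payloads (only the keys the extractor touches are shown):

    # Made-up trackinfo payloads; real responses carry many more fields.
    with_author = {'author': {'name': 'NTDRussian', 'id': 29790}}
    without_author = {}

    for trackinfo in (with_author, without_author):
        author = trackinfo.get('author') or {}
        uploader = author.get('name')
        uploader_id = str(author['id']) if author else None  # compat_str in the extractor
        print(uploader, uploader_id)
    # NTDRussian 29790
    # None None
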
diff --git a/youtube_dl/extractor/sztvhu.py b/youtube_dl/extractor/sztvhu.py
new file mode 100644
index 000000000..81fa35c4b
--- /dev/null
+++ b/youtube_dl/extractor/sztvhu.py
@@ -0,0 +1,44 @@
+# -*- coding: utf-8 -*-
+
+import re
+
+from .common import InfoExtractor
+from ..utils import determine_ext
+
+
+class SztvHuIE(InfoExtractor):
+    _VALID_URL = r'(?:http://)?(?:(?:www\.)?sztv\.hu|www\.tvszombathely\.hu)/(?:[^/]+)/.+-(?P<id>[0-9]+)'
+    _TEST = {
+        u'url': u'http://sztv.hu/hirek/cserkeszek-nepszerusitettek-a-kornyezettudatos-eletmodot-a-savaria-teren-20130909',
+        u'file': u'20130909.mp4',
+        u'md5': u'a6df607b11fb07d0e9f2ad94613375cb',
+        u'info_dict': {
+            u"title": u"Cserkészek népszerűsítették a környezettudatos életmódot a Savaria téren",
+            u"description": u'A zöld nap játékos ismeretterjesztő programjait a Magyar Cserkész Szövetség szervezte, akik az ország nyolc városában adják át tudásukat az érdeklődőknek. A PET...',
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        webpage = self._download_webpage(url, video_id)
+        video_file = self._search_regex(
+            r'file: "...:(.*?)",', webpage, 'video file')
+        title = self._html_search_regex(
+            r'<meta name="title" content="([^"]*?) - [^-]*? - [^-]*?"',
+            webpage, 'video title')
+        description = self._html_search_regex(
+            r'<meta name="description" content="([^"]*)"/>',
+            webpage, 'video description', fatal=False)
+        thumbnail = self._og_search_thumbnail(webpage)
+
+        video_url = 'http://media.sztv.hu/vod/' + video_file
+
+        return {
+            'id': video_id,
+            'url': video_url,
+            'title': title,
+            'ext': determine_ext(video_url),
+            'description': description,
+            'thumbnail': thumbnail,
+        }
diff --git a/youtube_dl/extractor/techtalks.py b/youtube_dl/extractor/techtalks.py
new file mode 100644
index 000000000..a55f236cb
--- /dev/null
+++ b/youtube_dl/extractor/techtalks.py
@@ -0,0 +1,65 @@
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    get_element_by_attribute,
+    clean_html,
+)
+
+
+class TechTalksIE(InfoExtractor):
+    _VALID_URL = r'https?://techtalks\.tv/talks/[^/]*/(?P<id>\d+)/'
+
+    _TEST = {
+        u'url': u'http://techtalks.tv/talks/learning-topic-models-going-beyond-svd/57758/',
+        u'playlist': [
+            {
+                u'file': u'57758.flv',
+                u'info_dict': {
+                    u'title': u'Learning Topic Models --- Going beyond SVD',
+                },
+            },
+            {
+                u'file': u'57758-slides.flv',
+                u'info_dict': {
+                    u'title': u'Learning Topic Models --- Going beyond SVD',
+                },
+            },
+        ],
+        u'params': {
+            # rtmp download
+            u'skip_download': True,
+        },
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        talk_id = mobj.group('id')
+        webpage = self._download_webpage(url, talk_id)
+        rtmp_url = self._search_regex(r'netConnectionUrl: \'(.*?)\'', webpage,
+            u'rtmp url')
+        play_path = self._search_regex(r'href=\'(.*?)\' [^>]*id="flowplayer_presenter"',
+            webpage, u'presenter play path')
+        title = clean_html(get_element_by_attribute('class', 'title', webpage))
+        video_info = {
+                'id': talk_id,
+                'title': title,
+                'url': rtmp_url,
+                'play_path': play_path,
+                'ext': 'flv',
+            }
+        m_slides = re.search(r'<a class="slides" href=\'(.*?)\'', webpage)
+        if m_slides is None:
+            return video_info
+        else:
+            return [
+                video_info,
+                # The slides video
+                {
+                    'id': talk_id + '-slides',
+                    'title': title,
+                    'url': rtmp_url,
+                    'play_path': m_slides.group(1),
+                    'ext': 'flv',
+                },
+            ]
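
TechTalks takes the talk title by grabbing the element with class="title" and stripping its markup rather than using a one-off regex. A rough sketch of that helper combination on an invented snippet, assuming youtube_dl is importable and that get_element_by_attribute returns the element's inner HTML, which is what the extractor relies on:

    from youtube_dl.utils import clean_html, get_element_by_attribute

    snippet = '<div class="title"><a href="/talks/57758/">Learning Topic Models --- Going beyond SVD</a></div>'
    title = clean_html(get_element_by_attribute('class', 'title', snippet))
    print(title)  # Learning Topic Models --- Going beyond SVD
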
diff --git a/youtube_dl/extractor/tudou.py b/youtube_dl/extractor/tudou.py
index 1405b73f7..79679a14a 100644
--- a/youtube_dl/extractor/tudou.py
+++ b/youtube_dl/extractor/tudou.py
@@ -7,15 +7,25 @@ from .common import InfoExtractor
 
 
 class TudouIE(InfoExtractor):
-    _VALID_URL = r'(?:http://)?(?:www\.)?tudou\.com/(?:listplay|programs)/(?:view|(.+?))/(?:([^/]+)|([^/]+))(?:\.html)?'
-    _TEST = {
+    _VALID_URL = r'(?:http://)?(?:www\.)?tudou\.com/(?:listplay|programs|albumplay)/(?:view|(.+?))/(?:([^/]+)|([^/]+))(?:\.html)?'
+    _TESTS = [{
         u'url': u'http://www.tudou.com/listplay/zzdE77v6Mmo/2xN2duXMxmw.html',
         u'file': u'159448201.f4v',
         u'md5': u'140a49ed444bd22f93330985d8475fcb',
         u'info_dict': {
             u"title": u"卡马乔国足开大脚长传冲吊集锦"
         }
-    }
+    },
+    {
+        u'url': u'http://www.tudou.com/albumplay/TenTw_JgiPM/PzsAs5usU9A.html',
+        u'file': u'todo.mp4',
+        u'md5': u'todo.mp4',
+        u'info_dict': {
+            u'title': u'todo.mp4',
+        },
+        u'add_ie': [u'Youku'],
+        u'skip': u'Only works from China'
+    }]
 
     def _url_for_id(self, id, quality = None):
         info_url = "http://v2.tudou.com/f?id="+str(id)
@@ -29,14 +39,18 @@ class TudouIE(InfoExtractor):
         mobj = re.match(self._VALID_URL, url)
         video_id = mobj.group(2)
         webpage = self._download_webpage(url, video_id)
-        title = re.search(",kw:\"(.+)\"",webpage)
-        if title is None:
-            title = re.search(",kw: \'(.+)\'",webpage)
-        title = title.group(1)
-        thumbnail_url = re.search(",pic: \'(.+?)\'",webpage)
-        if thumbnail_url is None:
-            thumbnail_url = re.search(",pic:\"(.+?)\"",webpage)
-        thumbnail_url = thumbnail_url.group(1)
+
+        m = re.search(r'vcode:\s*[\'"](.+?)[\'"]', webpage)
+        if m and m.group(1):
+            return {
+                '_type': 'url',
+                'url': u'youku:' + m.group(1),
+                'ie_key': 'Youku'
+            }
+
+        title = self._search_regex(r",kw:['\"](.+?)[\"']", webpage, u'title')
+        thumbnail_url = self._search_regex(
+            r",pic:\s*[\"'](.+?)[\"']", webpage, u'thumbnail URL', fatal=False)
 
         segs_json = self._search_regex(r'segs: \'(.*)\'', webpage, 'segments')
         segments = json.loads(segs_json)
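
With the Tudou change above, pages that expose a vcode are no longer downloaded directly; the extractor returns a bare url result pointing at a youku: pseudo-URL, which the widened YoukuIE pattern later in this changeset accepts. A small sketch of both halves, with an example vcode standing in for m.group(1):

    import re

    # _VALID_URL as updated for YoukuIE further down in this changeset
    YOUKU_VALID_URL = r'(?:(?:http://)?(?:v|player)\.youku\.com/(?:v_show/id_|player\.php/sid/)|youku:)(?P<ID>[A-Za-z0-9]+)(?:\.html|/v\.swf|)'

    vcode = 'XNDgyMDQ2NTQw'  # example value only
    result = {'_type': 'url', 'url': 'youku:' + vcode, 'ie_key': 'Youku'}

    # The delegated pseudo-URL matches the new pattern:
    print(re.match(YOUKU_VALID_URL, result['url']).group('ID'))  # XNDgyMDQ2NTQw
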
diff --git a/youtube_dl/extractor/videodetective.py b/youtube_dl/extractor/videodetective.py
new file mode 100644
index 000000000..d89f84094
--- /dev/null
+++ b/youtube_dl/extractor/videodetective.py
@@ -0,0 +1,30 @@
+import re
+
+from .common import InfoExtractor
+from .internetvideoarchive import InternetVideoArchiveIE
+from ..utils import (
+    compat_urlparse,
+)
+
+
+class VideoDetectiveIE(InfoExtractor):
+    _VALID_URL = r'https?://www\.videodetective\.com/[^/]+/[^/]+/(?P<id>\d+)'
+
+    _TEST = {
+        u'url': u'http://www.videodetective.com/movies/kick-ass-2/194487',
+        u'file': u'194487.mp4',
+        u'info_dict': {
+            u'title': u'KICK-ASS 2',
+            u'description': u'md5:65ba37ad619165afac7d432eaded6013',
+            u'duration': 138,
+        },
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        webpage = self._download_webpage(url, video_id)
+        og_video = self._og_search_video_url(webpage)
+        query = compat_urlparse.urlparse(og_video).query
+        return self.url_result(InternetVideoArchiveIE._build_url(query),
+            ie=InternetVideoArchiveIE.ie_key())
diff --git a/youtube_dl/extractor/videopremium.py b/youtube_dl/extractor/videopremium.py
new file mode 100644
index 000000000..65f39b982
--- /dev/null
+++ b/youtube_dl/extractor/videopremium.py
@@ -0,0 +1,40 @@
+import re
+import random
+
+from .common import InfoExtractor
+
+
+class VideoPremiumIE(InfoExtractor):
+    _VALID_URL = r'(?:https?://)?(?:www\.)?videopremium\.tv/(?P<id>\w+)(?:/.*)?'
+    _TEST = {
+        u'url': u'http://videopremium.tv/4w7oadjsf156',
+        u'file': u'4w7oadjsf156.f4v',
+        u'info_dict': {
+            u"title": u"youtube-dl_test_video____a_________-BaW_jenozKc.mp4.mp4"
+        },
+        u'params': {
+            u'skip_download': True,
+        },
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+
+        video_id = mobj.group('id')
+        webpage_url = 'http://videopremium.tv/' + video_id
+        webpage = self._download_webpage(webpage_url, video_id)
+
+        self.report_extraction(video_id)
+
+        video_title = self._html_search_regex(r'<h2(?:.*?)>\s*(.+?)\s*<',
+            webpage, u'video title')
+
+        return [{
+            'id':          video_id,
+            'url':         "rtmp://e%d.md.iplay.md/play" % random.randint(1, 16),
+            'play_path':   "mp4:%s.f4v" % video_id,
+            'page_url':    "http://videopremium.tv/" + video_id,
+            'player_url':  "http://videopremium.tv/uplayer/uppod.swf",
+            'ext':         'f4v',
+            'title':       video_title,
+        }]
diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py
index cea29f035..2de56ac81 100644
--- a/youtube_dl/extractor/vimeo.py
+++ b/youtube_dl/extractor/vimeo.py
@@ -11,6 +11,7 @@ from ..utils import (
     get_element_by_attribute,
     ExtractorError,
     std_headers,
+    unsmuggle_url,
 )
 
 class VimeoIE(InfoExtractor):
@@ -53,7 +54,7 @@ class VimeoIE(InfoExtractor):
                 u'title': u'Kathy Sierra: Building the minimum Badass User, Business of Software',
                 u'uploader': u'The BLN & Business of Software',
             },
-        },
+        }
     ]
 
     def _login(self):
@@ -98,6 +99,12 @@ class VimeoIE(InfoExtractor):
         self._login()
 
     def _real_extract(self, url, new_video=True):
+        url, data = unsmuggle_url(url)
+        headers = std_headers
+        if data is not None:
+            headers = headers.copy()
+            headers.update(data)
+
         # Extract ID from URL
         mobj = re.match(self._VALID_URL, url)
         if mobj is None:
@@ -112,7 +119,7 @@ class VimeoIE(InfoExtractor):
             url = 'https://vimeo.com/' + video_id
 
         # Retrieve video webpage to extract further information
-        request = compat_urllib_request.Request(url, None, std_headers)
+        request = compat_urllib_request.Request(url, None, headers)
         webpage = self._download_webpage(request, video_id)
 
         # Now we begin extracting as much information as we can from what we
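
The unsmuggle_url call at the top of VimeoIE._real_extract lets another component hand extra HTTP headers to this extractor through the URL itself; the smuggle_url/unsmuggle_url pair is added to utils.py at the end of this changeset. Roughly how a caller might use it, with an invented embedding page as the smuggled Referer and an arbitrary Vimeo URL:

    from youtube_dl.utils import smuggle_url, std_headers, unsmuggle_url

    # Hypothetical caller, e.g. an extractor for a page that embeds a Vimeo player:
    smuggled = smuggle_url('http://vimeo.com/56015672',
                           {'Referer': 'http://example.com/embedding-page'})

    # What VimeoIE._real_extract then does with it:
    url, data = unsmuggle_url(smuggled)
    headers = std_headers
    if data is not None:
        headers = headers.copy()
        headers.update(data)
    print(url)                 # http://vimeo.com/56015672
    print(headers['Referer'])  # http://example.com/embedding-page
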
diff --git a/youtube_dl/extractor/websurg.py b/youtube_dl/extractor/websurg.py
new file mode 100644
index 000000000..43953bfdd
--- /dev/null
+++ b/youtube_dl/extractor/websurg.py
@@ -0,0 +1,59 @@
+# coding: utf-8
+
+import re
+
+from ..utils import (
+    compat_urllib_request,
+    compat_urllib_parse
+)
+
+from .common import InfoExtractor
+
+class WeBSurgIE(InfoExtractor):
+    IE_NAME = u'websurg.com'
+    _VALID_URL = r'http://.*?\.websurg\.com/MEDIA/\?noheader=1&doi=(.*)'
+
+    _TEST = {
+        u'url': u'http://www.websurg.com/MEDIA/?noheader=1&doi=vd01en4012',
+        u'file': u'vd01en4012.mp4',
+        u'params': {
+            u'skip_download': True,
+        },
+        u'skip': u'Requires login information',
+    }
+
+    _LOGIN_URL = 'http://www.websurg.com/inc/login/login_div.ajax.php?login=1'
+
+    def _real_initialize(self):
+
+        login_form = {
+            'username': self._downloader.params['username'],
+            'password': self._downloader.params['password'],
+            'Submit': 1
+        }
+
+        request = compat_urllib_request.Request(
+            self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
+        request.add_header(
+            'Content-Type', 'application/x-www-form-urlencoded;charset=utf-8')
+        compat_urllib_request.urlopen(request).info()
+        webpage = self._download_webpage(self._LOGIN_URL, '', 'Logging in')
+
+        if webpage != 'OK':
+            self._downloader.report_error(
+                u'Unable to log in: bad username/password')
+
+    def _real_extract(self, url):
+        video_id = re.match(self._VALID_URL, url).group(1)
+
+        webpage = self._download_webpage(url, video_id)
+
+        url_info = re.search(r'streamer="(.*?)" src="(.*?)"', webpage)
+
+        return {'id': video_id,
+                'title': self._og_search_title(webpage),
+                'description': self._og_search_description(webpage),
+                'ext' : 'mp4',
+                'url' : url_info.group(1) + '/' + url_info.group(2),
+                'thumbnail': self._og_search_thumbnail(webpage)
+                }
diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py
index 00fa2ccb5..9d88c17f5 100644
--- a/youtube_dl/extractor/youku.py
+++ b/youtube_dl/extractor/youku.py
@@ -13,7 +13,7 @@ from ..utils import (
 
 
 class YoukuIE(InfoExtractor):
-    _VALID_URL =  r'(?:http://)?(v|player)\.youku\.com/(v_show/id_|player\.php/sid/)(?P<ID>[A-Za-z0-9]+)(\.html|/v.swf)'
+    _VALID_URL =  r'(?:(?:http://)?(?:v|player)\.youku\.com/(?:v_show/id_|player\.php/sid/)|youku:)(?P<ID>[A-Za-z0-9]+)(?:\.html|/v\.swf|)'
     _TEST =   {
         u"url": u"http://v.youku.com/v_show/id_XNDgyMDQ2NTQw.html",
         u"file": u"XNDgyMDQ2NTQw_part00.flv",
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index 8222a880f..fb7c42830 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -1150,7 +1150,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
             list_page = self._download_webpage(list_url, video_id)
             caption_list = xml.etree.ElementTree.fromstring(list_page.encode('utf-8'))
             original_lang_node = caption_list.find('track')
-            if original_lang_node.attrib.get('kind') != 'asr' :
+            if not original_lang_node or original_lang_node.attrib.get('kind') != 'asr' :
                 self._downloader.report_warning(u'Video doesn\'t have automatic captions')
                 return {}
             original_lang = original_lang_node.attrib['lang_code']
@@ -1250,6 +1250,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
             url_map[itag] = format_url
         return url_map
 
+    def _extract_annotations(self, video_id):
+        url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
+        return self._download_webpage(url, video_id, note=u'Searching for annotations.', errnote=u'Unable to download video annotations.')
+
     def _real_extract(self, url):
         # Extract original video URL from URL with redirection, like age verification, using next_url parameter
         mobj = re.search(self._NEXT_URL_RE, url)
@@ -1382,6 +1386,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
         else:
             video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
 
+        # annotations
+        video_annotations = None
+        if self._downloader.params.get('writeannotations', False):
+                video_annotations = self._extract_annotations(video_id)
+
         # Decide which formats to download
         try:
@@ -1495,6 +1504,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
                 'subtitles':    video_subtitles,
                 'duration':     video_duration,
                 'age_limit':    18 if age_gate else 0,
+                'annotations':  video_annotations
             })
 
         return results
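
The new _extract_annotations helper only runs when the writeannotations parameter is set, which appears to be the switch behind the annotation-writing option added in this release; the file that actually gets written is handled outside this extractor. A minimal sketch of setting that parameter through the Python API of this vintage, in which extractors are registered explicitly:

    import youtube_dl

    ydl = youtube_dl.YoutubeDL({
        'writeannotations': True,  # the parameter checked in _real_extract above
        'skip_download': True,     # fetch metadata/annotations only
    })
    ydl.add_default_info_extractors()
    ydl.download(['http://www.youtube.com/watch?v=BaW_jenozKc'])
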
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index 82a1daeb9..833f981f2 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -9,6 +9,7 @@ import io
 import json
 import locale
 import os
+import pipes
 import platform
 import re
 import socket
@@ -229,6 +230,19 @@ else:
                 return f
         return None
 
+# On python2.6 the xml.etree.ElementTree.Element methods don't support
+# the namespace parameter
+def xpath_with_ns(path, ns_map):
+    components = [c.split(':') for c in path.split('/')]
+    replaced = []
+    for c in components:
+        if len(c) == 1:
+            replaced.append(c[0])
+        else:
+            ns, tag = c
+            replaced.append('{%s}%s' % (ns_map[ns], tag))
+    return '/'.join(replaced)
+
 def htmlentity_transform(matchobj):
     """Transforms an HTML entity to a character.
 
@@ -927,3 +941,24 @@ class locked_file(object):
 
     def read(self, *args):
         return self.f.read(*args)
+
+
+def shell_quote(args):
+    return ' '.join(map(pipes.quote, args))
+
+
+def smuggle_url(url, data):
+    """ Pass additional data in a URL for internal use. """
+
+    sdata = compat_urllib_parse.urlencode(
+        {u'__youtubedl_smuggle': json.dumps(data)})
+    return url + u'#' + sdata
+
+
+def unsmuggle_url(smug_url):
+    if not '#__youtubedl_smuggle' in smug_url:
+        return smug_url, None
+    url, _, sdata = smug_url.rpartition(u'#')
+    jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
+    data = json.loads(jsond)
+    return url, data
diff --git a/youtube_dl/version.py b/youtube_dl/version.py
index 1004af116..22a51ffe6 100644
--- a/youtube_dl/version.py
+++ b/youtube_dl/version.py
@@ -1,2 +1,2 @@
-__version__ = '2013.10.09'
+__version__ = '2013.10.17'
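
Two of the utils.py additions above are easiest to understand from their output: xpath_with_ns expands prefixed paths into ElementTree's Clark notation because the ElementTree shipped with Python 2.6 cannot take a namespace map, smuggle_url/unsmuggle_url round-trip JSON-serialisable data through a URL fragment, and shell_quote joins pipes.quote over its arguments. A quick look, assuming youtube_dl with this changeset applied is importable; the namespace map mirrors the one used by the InternetVideoArchive extractor earlier in this diff:

    from youtube_dl.utils import shell_quote, smuggle_url, unsmuggle_url, xpath_with_ns

    ns_map = {'media': 'http://search.yahoo.com/mrss/'}
    print(xpath_with_ns('media:group/media:content', ns_map))
    # {http://search.yahoo.com/mrss/}group/{http://search.yahoo.com/mrss/}content

    print(unsmuggle_url(smuggle_url('http://example.com/video', {'note': 'hi'})))
    # ('http://example.com/video', {'note': 'hi'})

    print(shell_quote(['ffmpeg', '-i', 'my video.mp4']))
    # ffmpeg -i 'my video.mp4'
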
