diff options
64 files changed, 2526 insertions, 1797 deletions
| diff --git a/.gitignore b/.gitignore index 7dd0ad09b..37b2fa8d3 100644 --- a/.gitignore +++ b/.gitignore @@ -23,6 +23,8 @@ updates_key.pem  *.vtt  *.flv  *.mp4 +*.m4a +*.m4v  *.part  test/testdata  .tox @@ -34,12 +34,16 @@ which means you can modify it, redistribute it or use it however you like.                                 empty string (--proxy "") for direct connection      --no-check-certificate     Suppress HTTPS certificate validation.      --cache-dir DIR            Location in the filesystem where youtube-dl can -                               store downloaded information permanently. By +                               store some downloaded information permanently. By                                 default $XDG_CACHE_HOME/youtube-dl or ~/.cache -                               /youtube-dl . +                               /youtube-dl . At the moment, only YouTube player +                               files (for videos with obfuscated signatures) are +                               cached, but that may change.      --no-cache-dir             Disable filesystem caching +    --socket-timeout None      Time to wait before giving up, in seconds      --bidi-workaround          Work around terminals that lack bidirectional -                               text support. Requires fribidi executable in PATH +                               text support. Requires bidiv or fribidi +                               executable in PATH  ## Video Selection:      --playlist-start NUMBER    playlist video to start at (default is 1) @@ -336,3 +340,7 @@ In particular, every site support request issue should only pertain to services  ###  Is anyone going to need the feature?  Only post features that you (or an incapicated friend you can personally talk to) require. Do not post features because they seem like a good idea. If they are really useful, they will be requested by someone who requires them. + +###  Is your question about youtube-dl? + +It may sound strange, but some bug reports we receive are completely unrelated to youtube-dl and relate to a different or even the reporter's own application. Please make sure that you are actually using youtube-dl. If you are using a UI for youtube-dl, report the bug to the maintainer of the actual application providing the UI. On the other hand, if your UI for youtube-dl fails in some way you believe is related to youtube-dl, by all means, go ahead and report the bug. diff --git a/devscripts/bash-completion.in b/devscripts/bash-completion.in index 3af87a378..28bd23727 100644 --- a/devscripts/bash-completion.in +++ b/devscripts/bash-completion.in @@ -6,7 +6,7 @@ __youtube_dl()      prev="${COMP_WORDS[COMP_CWORD-1]}"      opts="{{flags}}"      keywords=":ytfavorites :ytrecommended :ytsubscriptions :ytwatchlater :ythistory" -    fileopts="-a|--batch-file|--download-archive|--cookies" +    fileopts="-a|--batch-file|--download-archive|--cookies|--load-info"      diropts="--cache-dir"      if [[ ${prev} =~ ${fileopts} ]]; then diff --git a/devscripts/gh-pages/update-feed.py b/devscripts/gh-pages/update-feed.py index 16571a924..0ba15ae0f 100755 --- a/devscripts/gh-pages/update-feed.py +++ b/devscripts/gh-pages/update-feed.py @@ -1,56 +1,76 @@  #!/usr/bin/env python3  import datetime - +import io +import json  import textwrap -import json -atom_template=textwrap.dedent("""\ -								<?xml version='1.0' encoding='utf-8'?> -								<atom:feed xmlns:atom="http://www.w3.org/2005/Atom"> -									<atom:title>youtube-dl releases</atom:title> -									<atom:id>youtube-dl-updates-feed</atom:id> -									<atom:updated>@TIMESTAMP@</atom:updated> -									@ENTRIES@ -								</atom:feed>""") - -entry_template=textwrap.dedent(""" -								<atom:entry> -									<atom:id>youtube-dl-@VERSION@</atom:id> -									<atom:title>New version @VERSION@</atom:title> -									<atom:link href="http://rg3.github.io/youtube-dl" /> -									<atom:content type="xhtml"> -										<div xmlns="http://www.w3.org/1999/xhtml"> -											Downloads available at <a href="https://yt-dl.org/downloads/@VERSION@/">https://yt-dl.org/downloads/@VERSION@/</a> -										</div> -									</atom:content> -									<atom:author> -										<atom:name>The youtube-dl maintainers</atom:name> -									</atom:author> -									<atom:updated>@TIMESTAMP@</atom:updated> -								</atom:entry> -								""") +atom_template = textwrap.dedent("""\ +    <?xml version="1.0" encoding="utf-8"?> +    <feed xmlns="http://www.w3.org/2005/Atom"> +        <link rel="self" href="http://rg3.github.io/youtube-dl/update/releases.atom" /> +        <title>youtube-dl releases</title> +        <id>https://yt-dl.org/feed/youtube-dl-updates-feed</id> +        <updated>@TIMESTAMP@</updated> +        @ENTRIES@ +    </feed>""") -now = datetime.datetime.now() -now_iso = now.isoformat() +entry_template = textwrap.dedent(""" +    <entry> +        <id>https://yt-dl.org/feed/youtube-dl-updates-feed/youtube-dl-@VERSION@</id> +        <title>New version @VERSION@</title> +        <link href="http://rg3.github.io/youtube-dl" /> +        <content type="xhtml"> +            <div xmlns="http://www.w3.org/1999/xhtml"> +                Downloads available at <a href="https://yt-dl.org/downloads/@VERSION@/">https://yt-dl.org/downloads/@VERSION@/</a> +            </div> +        </content> +        <author> +            <name>The youtube-dl maintainers</name> +        </author> +        <updated>@TIMESTAMP@</updated> +    </entry> +    """) -atom_template = atom_template.replace('@TIMESTAMP@',now_iso) +now = datetime.datetime.now() +now_iso = now.isoformat() + 'Z' -entries=[] +atom_template = atom_template.replace('@TIMESTAMP@', now_iso)  versions_info = json.load(open('update/versions.json'))  versions = list(versions_info['versions'].keys())  versions.sort() +entries = []  for v in versions: -	entry = entry_template.replace('@TIMESTAMP@',v.replace('.','-')) -	entry = entry.replace('@VERSION@',v) -	entries.append(entry) +    fields = v.split('.') +    year, month, day = map(int, fields[:3]) +    faked = 0 +    patchlevel = 0 +    while True: +        try: +            datetime.date(year, month, day) +        except ValueError: +            day -= 1 +            faked += 1 +            assert day > 0 +            continue +        break +    if len(fields) >= 4: +        try: +            patchlevel = int(fields[3]) +        except ValueError: +            patchlevel = 1 +    timestamp = '%04d-%02d-%02dT00:%02d:%02dZ' % (year, month, day, faked, patchlevel) + +    entry = entry_template.replace('@TIMESTAMP@', timestamp) +    entry = entry.replace('@VERSION@', v) +    entries.append(entry)  entries_str = textwrap.indent(''.join(entries), '\t')  atom_template = atom_template.replace('@ENTRIES@', entries_str) -with open('update/releases.atom','w',encoding='utf-8') as atom_file: -	atom_file.write(atom_template) +with io.open('update/releases.atom', 'w', encoding='utf-8') as atom_file: +    atom_file.write(atom_template) diff --git a/devscripts/make_readme.py b/devscripts/make_readme.py index 7f2ea319c..cae1fa4f2 100755 --- a/devscripts/make_readme.py +++ b/devscripts/make_readme.py @@ -1,20 +1,24 @@ +import io  import sys  import re  README_FILE = 'README.md'  helptext = sys.stdin.read() -with open(README_FILE) as f: +if isinstance(helptext, bytes): +    helptext = helptext.decode('utf-8') + +with io.open(README_FILE, encoding='utf-8') as f:      oldreadme = f.read()  header = oldreadme[:oldreadme.index('# OPTIONS')]  footer = oldreadme[oldreadme.index('# CONFIGURATION'):] -options = helptext[helptext.index('  General Options:')+19:] +options = helptext[helptext.index('  General Options:') + 19:]  options = re.sub(r'^  (\w.+)$', r'## \1', options, flags=re.M)  options = '# OPTIONS\n' + options + '\n' -with open(README_FILE, 'w') as f: +with io.open(README_FILE, 'w', encoding='utf-8') as f:      f.write(header)      f.write(options)      f.write(footer) @@ -71,7 +71,7 @@ setup(      author_email='ytdl@yt-dl.org',      maintainer='Philipp Hagemeister',      maintainer_email='phihag@phihag.de', -    packages=['youtube_dl', 'youtube_dl.extractor'], +    packages=['youtube_dl', 'youtube_dl.extractor', 'youtube_dl.downloader'],      # Provokes warning on most systems (why?!)      # test_suite = 'nose.collector', diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 3100c362a..01de10e31 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -8,6 +8,7 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))  from test.helper import FakeYDL  from youtube_dl import YoutubeDL +from youtube_dl.extractor import YoutubeIE  class YDL(FakeYDL): @@ -33,6 +34,8 @@ class TestFormatSelection(unittest.TestCase):              {u'ext': u'mp4',  u'height': 460},          ]          info_dict = {u'formats': formats, u'extractor': u'test'} +        yie = YoutubeIE(ydl) +        yie._sort_formats(info_dict['formats'])          ydl.process_ie_result(info_dict)          downloaded = ydl.downloaded_info_dicts[0]          self.assertEqual(downloaded[u'ext'], u'webm') @@ -45,28 +48,46 @@ class TestFormatSelection(unittest.TestCase):              {u'ext': u'mp4', u'height': 1080},          ]          info_dict[u'formats'] = formats +        yie = YoutubeIE(ydl) +        yie._sort_formats(info_dict['formats'])          ydl.process_ie_result(info_dict)          downloaded = ydl.downloaded_info_dicts[0]          self.assertEqual(downloaded[u'ext'], u'mp4') -        # No prefer_free_formats => keep original formats order +        # No prefer_free_formats => prefer mp4 and flv for greater compatibilty          ydl = YDL()          ydl.params['prefer_free_formats'] = False          formats = [              {u'ext': u'webm', u'height': 720}, +            {u'ext': u'mp4', u'height': 720},              {u'ext': u'flv', u'height': 720},          ]          info_dict[u'formats'] = formats +        yie = YoutubeIE(ydl) +        yie._sort_formats(info_dict['formats']) +        ydl.process_ie_result(info_dict) +        downloaded = ydl.downloaded_info_dicts[0] +        self.assertEqual(downloaded[u'ext'], u'mp4') + +        ydl = YDL() +        ydl.params['prefer_free_formats'] = False +        formats = [ +            {u'ext': u'flv', u'height': 720}, +            {u'ext': u'webm', u'height': 720}, +        ] +        info_dict[u'formats'] = formats +        yie = YoutubeIE(ydl) +        yie._sort_formats(info_dict['formats'])          ydl.process_ie_result(info_dict)          downloaded = ydl.downloaded_info_dicts[0]          self.assertEqual(downloaded[u'ext'], u'flv')      def test_format_limit(self):          formats = [ -            {u'format_id': u'meh', u'url': u'http://example.com/meh'}, -            {u'format_id': u'good', u'url': u'http://example.com/good'}, -            {u'format_id': u'great', u'url': u'http://example.com/great'}, -            {u'format_id': u'excellent', u'url': u'http://example.com/exc'}, +            {u'format_id': u'meh', u'url': u'http://example.com/meh', 'preference': 1}, +            {u'format_id': u'good', u'url': u'http://example.com/good', 'preference': 2}, +            {u'format_id': u'great', u'url': u'http://example.com/great', 'preference': 3}, +            {u'format_id': u'excellent', u'url': u'http://example.com/exc', 'preference': 4},          ]          info_dict = {              u'formats': formats, u'extractor': u'test', 'id': 'testvid'} @@ -78,12 +99,12 @@ class TestFormatSelection(unittest.TestCase):          ydl = YDL({'format_limit': 'good'})          assert ydl.params['format_limit'] == 'good' -        ydl.process_ie_result(info_dict) +        ydl.process_ie_result(info_dict.copy())          downloaded = ydl.downloaded_info_dicts[0]          self.assertEqual(downloaded[u'format_id'], u'good')          ydl = YDL({'format_limit': 'great', 'format': 'all'}) -        ydl.process_ie_result(info_dict) +        ydl.process_ie_result(info_dict.copy())          self.assertEqual(ydl.downloaded_info_dicts[0][u'format_id'], u'meh')          self.assertEqual(ydl.downloaded_info_dicts[1][u'format_id'], u'good')          self.assertEqual(ydl.downloaded_info_dicts[2][u'format_id'], u'great') @@ -91,44 +112,80 @@ class TestFormatSelection(unittest.TestCase):          ydl = YDL()          ydl.params['format_limit'] = 'excellent' -        ydl.process_ie_result(info_dict) +        ydl.process_ie_result(info_dict.copy())          downloaded = ydl.downloaded_info_dicts[0]          self.assertEqual(downloaded[u'format_id'], u'excellent')      def test_format_selection(self):          formats = [ -            {u'format_id': u'35', u'ext': u'mp4'}, -            {u'format_id': u'45', u'ext': u'webm'}, -            {u'format_id': u'47', u'ext': u'webm'}, -            {u'format_id': u'2', u'ext': u'flv'}, +            {u'format_id': u'35', u'ext': u'mp4', 'preference': 1}, +            {u'format_id': u'45', u'ext': u'webm', 'preference': 2}, +            {u'format_id': u'47', u'ext': u'webm', 'preference': 3}, +            {u'format_id': u'2', u'ext': u'flv', 'preference': 4},          ]          info_dict = {u'formats': formats, u'extractor': u'test'}          ydl = YDL({'format': u'20/47'}) -        ydl.process_ie_result(info_dict) +        ydl.process_ie_result(info_dict.copy())          downloaded = ydl.downloaded_info_dicts[0]          self.assertEqual(downloaded['format_id'], u'47')          ydl = YDL({'format': u'20/71/worst'}) -        ydl.process_ie_result(info_dict) +        ydl.process_ie_result(info_dict.copy())          downloaded = ydl.downloaded_info_dicts[0]          self.assertEqual(downloaded['format_id'], u'35')          ydl = YDL() -        ydl.process_ie_result(info_dict) +        ydl.process_ie_result(info_dict.copy())          downloaded = ydl.downloaded_info_dicts[0]          self.assertEqual(downloaded['format_id'], u'2')          ydl = YDL({'format': u'webm/mp4'}) -        ydl.process_ie_result(info_dict) +        ydl.process_ie_result(info_dict.copy())          downloaded = ydl.downloaded_info_dicts[0]          self.assertEqual(downloaded['format_id'], u'47')          ydl = YDL({'format': u'3gp/40/mp4'}) -        ydl.process_ie_result(info_dict) +        ydl.process_ie_result(info_dict.copy())          downloaded = ydl.downloaded_info_dicts[0]          self.assertEqual(downloaded['format_id'], u'35') +    def test_youtube_format_selection(self): +        order = [ +            '38', '37', '46', '22', '45', '35', '44', '18', '34', '43', '6', '5', '36', '17', '13', +            # Apple HTTP Live Streaming +            '96', '95', '94', '93', '92', '132', '151', +            # 3D +            '85', '84', '102', '83', '101', '82', '100', +            # Dash video +            '138', '137', '248', '136', '247', '135', '246', +            '245', '244', '134', '243', '133', '242', '160', +            # Dash audio +            '141', '172', '140', '139', '171', +        ] + +        for f1id, f2id in zip(order, order[1:]): +            f1 = YoutubeIE._formats[f1id].copy() +            f1['format_id'] = f1id +            f2 = YoutubeIE._formats[f2id].copy() +            f2['format_id'] = f2id + +            info_dict = {'formats': [f1, f2], 'extractor': 'youtube'} +            ydl = YDL() +            yie = YoutubeIE(ydl) +            yie._sort_formats(info_dict['formats']) +            ydl.process_ie_result(info_dict) +            downloaded = ydl.downloaded_info_dicts[0] +            self.assertEqual(downloaded['format_id'], f1id) + +            info_dict = {'formats': [f2, f1], 'extractor': 'youtube'} +            ydl = YDL() +            yie = YoutubeIE(ydl) +            yie._sort_formats(info_dict['formats']) +            ydl.process_ie_result(info_dict) +            downloaded = ydl.downloaded_info_dicts[0] +            self.assertEqual(downloaded['format_id'], f1id) +      def test_add_extra_info(self):          test_dict = {              'extractor': 'Foo', diff --git a/test/test_download.py b/test/test_download.py index dd5818dba..d0be8d27c 100644 --- a/test/test_download.py +++ b/test/test_download.py @@ -90,7 +90,7 @@ def generator(test_case):          def _hook(status):              if status['status'] == 'finished':                  finished_hook_called.add(status['filename']) -        ydl.fd.add_progress_hook(_hook) +        ydl.add_progress_hook(_hook)          def get_tc_filename(tc):              return tc.get('file') or ydl.prepare_filename(tc.get('info_dict', {})) diff --git a/test/test_playlists.py b/test/test_playlists.py index 1b7b4e3d8..9d522b357 100644 --- a/test/test_playlists.py +++ b/test/test_playlists.py @@ -28,7 +28,8 @@ from youtube_dl.extractor import (      BandcampAlbumIE,      SmotriCommunityIE,      SmotriUserIE, -    IviCompilationIE +    IviCompilationIE, +    ImdbListIE,  ) @@ -187,6 +188,15 @@ class TestPlaylists(unittest.TestCase):          self.assertEqual(result['id'], u'dezhurnyi_angel/season2')          self.assertEqual(result['title'], u'Дежурный ангел (2010 - 2012) 2 сезон')          self.assertTrue(len(result['entries']) >= 20) +         +    def test_imdb_list(self): +        dl = FakeYDL() +        ie = ImdbListIE(dl) +        result = ie.extract('http://www.imdb.com/list/sMjedvGDd8U') +        self.assertIsPlaylist(result) +        self.assertEqual(result['id'], u'sMjedvGDd8U') +        self.assertEqual(result['title'], u'Animated and Family Films') +        self.assertTrue(len(result['entries']) >= 48)  if __name__ == '__main__': diff --git a/test/test_subtitles.py b/test/test_subtitles.py index 23a653124..263b5ac69 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -36,10 +36,6 @@ class TestYoutubeSubtitles(BaseTestSubtitles):      url = 'QRS8MkLhQmM'      IE = YoutubeIE -    def getSubtitles(self): -        info_dict = self.getInfoDict() -        return info_dict[0]['subtitles'] -      def test_youtube_no_writesubtitles(self):          self.DL.params['writesubtitles'] = False          subtitles = self.getSubtitles() diff --git a/test/test_unicode_literals.py b/test/test_unicode_literals.py new file mode 100644 index 000000000..a4ba7bad0 --- /dev/null +++ b/test/test_unicode_literals.py @@ -0,0 +1,47 @@ +from __future__ import unicode_literals + +import io +import os +import re +import unittest + +rootDir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + +IGNORED_FILES = [ +    'setup.py',  # http://bugs.python.org/issue13943 +] + + +class TestUnicodeLiterals(unittest.TestCase): +    def test_all_files(self): +        print('Skipping this test (not yet fully implemented)') +        return + +        for dirpath, _, filenames in os.walk(rootDir): +            for basename in filenames: +                if not basename.endswith('.py'): +                    continue +                if basename in IGNORED_FILES: +                    continue + +                fn = os.path.join(dirpath, basename) +                with io.open(fn, encoding='utf-8') as inf: +                    code = inf.read() + +                if "'" not in code and '"' not in code: +                    continue +                imps = 'from __future__ import unicode_literals' +                self.assertTrue( +                    imps in code, +                    ' %s  missing in %s' % (imps, fn)) + +                m = re.search(r'(?<=\s)u[\'"](?!\)|,|$)', code) +                if m is not None: +                    self.assertTrue( +                        m is None, +                        'u present in %s, around %s' % ( +                            fn, code[m.start() - 10:m.end() + 10])) + + +if __name__ == '__main__': +    unittest.main() diff --git a/test/test_utils.py b/test/test_utils.py index e5778cd83..bee355ee0 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -18,6 +18,7 @@ from youtube_dl.utils import (      find_xpath_attr,      get_meta_content,      orderedSet, +    parse_duration,      sanitize_filename,      shell_quote,      smuggle_url, @@ -192,5 +193,12 @@ class TestUtil(unittest.TestCase):              url_basename(u'http://media.w3.org/2010/05/sintel/trailer.mp4'),              u'trailer.mp4') +    def test_parse_duration(self): +        self.assertEqual(parse_duration(None), None) +        self.assertEqual(parse_duration('1'), 1) +        self.assertEqual(parse_duration('1337:12'), 80232) +        self.assertEqual(parse_duration('9:12:43'), 33163) +        self.assertEqual(parse_duration('x:y'), None) +  if __name__ == '__main__':      unittest.main() diff --git a/youtube_dl/FileDownloader.py b/youtube_dl/FileDownloader.py index 47124932f..5c8e676a2 100644 --- a/youtube_dl/FileDownloader.py +++ b/youtube_dl/FileDownloader.py @@ -1,724 +1,12 @@ -import os -import re -import subprocess -import sys -import time - -from .utils import ( -    compat_urllib_error, -    compat_urllib_request, -    ContentTooShortError, -    determine_ext, -    encodeFilename, -    format_bytes, -    sanitize_open, -    timeconvert, -) - - -class FileDownloader(object): -    """File Downloader class. - -    File downloader objects are the ones responsible of downloading the -    actual video file and writing it to disk. - -    File downloaders accept a lot of parameters. In order not to saturate -    the object constructor with arguments, it receives a dictionary of -    options instead. - -    Available options: - -    verbose:           Print additional info to stdout. -    quiet:             Do not print messages to stdout. -    ratelimit:         Download speed limit, in bytes/sec. -    retries:           Number of times to retry for HTTP error 5xx -    buffersize:        Size of download buffer in bytes. -    noresizebuffer:    Do not automatically resize the download buffer. -    continuedl:        Try to continue downloads if possible. -    noprogress:        Do not print the progress bar. -    logtostderr:       Log messages to stderr instead of stdout. -    consoletitle:      Display progress in console window's titlebar. -    nopart:            Do not use temporary .part files. -    updatetime:        Use the Last-modified header to set output file timestamps. -    test:              Download only first bytes to test the downloader. -    min_filesize:      Skip files smaller than this size -    max_filesize:      Skip files larger than this size -    """ - -    params = None - -    def __init__(self, ydl, params): -        """Create a FileDownloader object with the given options.""" -        self.ydl = ydl -        self._progress_hooks = [] -        self.params = params - -    @staticmethod -    def format_seconds(seconds): -        (mins, secs) = divmod(seconds, 60) -        (hours, mins) = divmod(mins, 60) -        if hours > 99: -            return '--:--:--' -        if hours == 0: -            return '%02d:%02d' % (mins, secs) -        else: -            return '%02d:%02d:%02d' % (hours, mins, secs) - -    @staticmethod -    def calc_percent(byte_counter, data_len): -        if data_len is None: -            return None -        return float(byte_counter) / float(data_len) * 100.0 - -    @staticmethod -    def format_percent(percent): -        if percent is None: -            return '---.-%' -        return '%6s' % ('%3.1f%%' % percent) - -    @staticmethod -    def calc_eta(start, now, total, current): -        if total is None: -            return None -        dif = now - start -        if current == 0 or dif < 0.001: # One millisecond -            return None -        rate = float(current) / dif -        return int((float(total) - float(current)) / rate) - -    @staticmethod -    def format_eta(eta): -        if eta is None: -            return '--:--' -        return FileDownloader.format_seconds(eta) - -    @staticmethod -    def calc_speed(start, now, bytes): -        dif = now - start -        if bytes == 0 or dif < 0.001: # One millisecond -            return None -        return float(bytes) / dif - -    @staticmethod -    def format_speed(speed): -        if speed is None: -            return '%10s' % '---b/s' -        return '%10s' % ('%s/s' % format_bytes(speed)) - -    @staticmethod -    def best_block_size(elapsed_time, bytes): -        new_min = max(bytes / 2.0, 1.0) -        new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB -        if elapsed_time < 0.001: -            return int(new_max) -        rate = bytes / elapsed_time -        if rate > new_max: -            return int(new_max) -        if rate < new_min: -            return int(new_min) -        return int(rate) - -    @staticmethod -    def parse_bytes(bytestr): -        """Parse a string indicating a byte quantity into an integer.""" -        matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr) -        if matchobj is None: -            return None -        number = float(matchobj.group(1)) -        multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower()) -        return int(round(number * multiplier)) - -    def to_screen(self, *args, **kargs): -        self.ydl.to_screen(*args, **kargs) - -    def to_stderr(self, message): -        self.ydl.to_screen(message) - -    def to_console_title(self, message): -        self.ydl.to_console_title(message) - -    def trouble(self, *args, **kargs): -        self.ydl.trouble(*args, **kargs) - -    def report_warning(self, *args, **kargs): -        self.ydl.report_warning(*args, **kargs) - -    def report_error(self, *args, **kargs): -        self.ydl.report_error(*args, **kargs) - -    def slow_down(self, start_time, byte_counter): -        """Sleep if the download speed is over the rate limit.""" -        rate_limit = self.params.get('ratelimit', None) -        if rate_limit is None or byte_counter == 0: -            return -        now = time.time() -        elapsed = now - start_time -        if elapsed <= 0.0: -            return -        speed = float(byte_counter) / elapsed -        if speed > rate_limit: -            time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit) - -    def temp_name(self, filename): -        """Returns a temporary filename for the given filename.""" -        if self.params.get('nopart', False) or filename == u'-' or \ -                (os.path.exists(encodeFilename(filename)) and not os.path.isfile(encodeFilename(filename))): -            return filename -        return filename + u'.part' - -    def undo_temp_name(self, filename): -        if filename.endswith(u'.part'): -            return filename[:-len(u'.part')] -        return filename - -    def try_rename(self, old_filename, new_filename): -        try: -            if old_filename == new_filename: -                return -            os.rename(encodeFilename(old_filename), encodeFilename(new_filename)) -        except (IOError, OSError): -            self.report_error(u'unable to rename file') - -    def try_utime(self, filename, last_modified_hdr): -        """Try to set the last-modified time of the given file.""" -        if last_modified_hdr is None: -            return -        if not os.path.isfile(encodeFilename(filename)): -            return -        timestr = last_modified_hdr -        if timestr is None: -            return -        filetime = timeconvert(timestr) -        if filetime is None: -            return filetime -        # Ignore obviously invalid dates -        if filetime == 0: -            return -        try: -            os.utime(filename, (time.time(), filetime)) -        except: -            pass -        return filetime - -    def report_destination(self, filename): -        """Report destination filename.""" -        self.to_screen(u'[download] Destination: ' + filename) - -    def _report_progress_status(self, msg, is_last_line=False): -        fullmsg = u'[download] ' + msg -        if self.params.get('progress_with_newline', False): -            self.to_screen(fullmsg) -        else: -            if os.name == 'nt': -                prev_len = getattr(self, '_report_progress_prev_line_length', -                                   0) -                if prev_len > len(fullmsg): -                    fullmsg += u' ' * (prev_len - len(fullmsg)) -                self._report_progress_prev_line_length = len(fullmsg) -                clear_line = u'\r' -            else: -                clear_line = (u'\r\x1b[K' if sys.stderr.isatty() else u'\r') -            self.to_screen(clear_line + fullmsg, skip_eol=not is_last_line) -        self.to_console_title(u'youtube-dl ' + msg) - -    def report_progress(self, percent, data_len_str, speed, eta): -        """Report download progress.""" -        if self.params.get('noprogress', False): -            return -        if eta is not None: -            eta_str = self.format_eta(eta) -        else: -            eta_str = 'Unknown ETA' -        if percent is not None: -            percent_str = self.format_percent(percent) -        else: -            percent_str = 'Unknown %' -        speed_str = self.format_speed(speed) - -        msg = (u'%s of %s at %s ETA %s' % -               (percent_str, data_len_str, speed_str, eta_str)) -        self._report_progress_status(msg) - -    def report_progress_live_stream(self, downloaded_data_len, speed, elapsed): -        if self.params.get('noprogress', False): -            return -        downloaded_str = format_bytes(downloaded_data_len) -        speed_str = self.format_speed(speed) -        elapsed_str = FileDownloader.format_seconds(elapsed) -        msg = u'%s at %s (%s)' % (downloaded_str, speed_str, elapsed_str) -        self._report_progress_status(msg) - -    def report_finish(self, data_len_str, tot_time): -        """Report download finished.""" -        if self.params.get('noprogress', False): -            self.to_screen(u'[download] Download completed') -        else: -            self._report_progress_status( -                (u'100%% of %s in %s' % -                 (data_len_str, self.format_seconds(tot_time))), -                is_last_line=True) - -    def report_resuming_byte(self, resume_len): -        """Report attempt to resume at given byte.""" -        self.to_screen(u'[download] Resuming download at byte %s' % resume_len) - -    def report_retry(self, count, retries): -        """Report retry in case of HTTP error 5xx""" -        self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries)) - -    def report_file_already_downloaded(self, file_name): -        """Report file has already been fully downloaded.""" -        try: -            self.to_screen(u'[download] %s has already been downloaded' % file_name) -        except UnicodeEncodeError: -            self.to_screen(u'[download] The file has already been downloaded') - -    def report_unable_to_resume(self): -        """Report it was impossible to resume download.""" -        self.to_screen(u'[download] Unable to resume') - -    def _download_with_rtmpdump(self, filename, url, player_url, page_url, play_path, tc_url, live, conn): -        def run_rtmpdump(args): -            start = time.time() -            resume_percent = None -            resume_downloaded_data_len = None -            proc = subprocess.Popen(args, stderr=subprocess.PIPE) -            cursor_in_new_line = True -            proc_stderr_closed = False -            while not proc_stderr_closed: -                # read line from stderr -                line = u'' -                while True: -                    char = proc.stderr.read(1) -                    if not char: -                        proc_stderr_closed = True -                        break -                    if char in [b'\r', b'\n']: -                        break -                    line += char.decode('ascii', 'replace') -                if not line: -                    # proc_stderr_closed is True -                    continue -                mobj = re.search(r'([0-9]+\.[0-9]{3}) kB / [0-9]+\.[0-9]{2} sec \(([0-9]{1,2}\.[0-9])%\)', line) -                if mobj: -                    downloaded_data_len = int(float(mobj.group(1))*1024) -                    percent = float(mobj.group(2)) -                    if not resume_percent: -                        resume_percent = percent -                        resume_downloaded_data_len = downloaded_data_len -                    eta = self.calc_eta(start, time.time(), 100-resume_percent, percent-resume_percent) -                    speed = self.calc_speed(start, time.time(), downloaded_data_len-resume_downloaded_data_len) -                    data_len = None -                    if percent > 0: -                        data_len = int(downloaded_data_len * 100 / percent) -                    data_len_str = u'~' + format_bytes(data_len) -                    self.report_progress(percent, data_len_str, speed, eta) -                    cursor_in_new_line = False -                    self._hook_progress({ -                        'downloaded_bytes': downloaded_data_len, -                        'total_bytes': data_len, -                        'tmpfilename': tmpfilename, -                        'filename': filename, -                        'status': 'downloading', -                        'eta': eta, -                        'speed': speed, -                    }) -                else: -                    # no percent for live streams -                    mobj = re.search(r'([0-9]+\.[0-9]{3}) kB / [0-9]+\.[0-9]{2} sec', line) -                    if mobj: -                        downloaded_data_len = int(float(mobj.group(1))*1024) -                        time_now = time.time() -                        speed = self.calc_speed(start, time_now, downloaded_data_len) -                        self.report_progress_live_stream(downloaded_data_len, speed, time_now - start) -                        cursor_in_new_line = False -                        self._hook_progress({ -                            'downloaded_bytes': downloaded_data_len, -                            'tmpfilename': tmpfilename, -                            'filename': filename, -                            'status': 'downloading', -                            'speed': speed, -                        }) -                    elif self.params.get('verbose', False): -                        if not cursor_in_new_line: -                            self.to_screen(u'') -                        cursor_in_new_line = True -                        self.to_screen(u'[rtmpdump] '+line) -            proc.wait() -            if not cursor_in_new_line: -                self.to_screen(u'') -            return proc.returncode - -        self.report_destination(filename) -        tmpfilename = self.temp_name(filename) -        test = self.params.get('test', False) - -        # Check for rtmpdump first -        try: -            subprocess.call(['rtmpdump', '-h'], stdout=(open(os.path.devnull, 'w')), stderr=subprocess.STDOUT) -        except (OSError, IOError): -            self.report_error(u'RTMP download detected but "rtmpdump" could not be run') -            return False - -        # Download using rtmpdump. rtmpdump returns exit code 2 when -        # the connection was interrumpted and resuming appears to be -        # possible. This is part of rtmpdump's normal usage, AFAIK. -        basic_args = ['rtmpdump', '--verbose', '-r', url, '-o', tmpfilename] -        if player_url is not None: -            basic_args += ['--swfVfy', player_url] -        if page_url is not None: -            basic_args += ['--pageUrl', page_url] -        if play_path is not None: -            basic_args += ['--playpath', play_path] -        if tc_url is not None: -            basic_args += ['--tcUrl', url] -        if test: -            basic_args += ['--stop', '1'] -        if live: -            basic_args += ['--live'] -        if conn: -            basic_args += ['--conn', conn] -        args = basic_args + [[], ['--resume', '--skip', '1']][self.params.get('continuedl', False)] - -        if sys.platform == 'win32' and sys.version_info < (3, 0): -            # Windows subprocess module does not actually support Unicode -            # on Python 2.x -            # See http://stackoverflow.com/a/9951851/35070 -            subprocess_encoding = sys.getfilesystemencoding() -            args = [a.encode(subprocess_encoding, 'ignore') for a in args] -        else: -            subprocess_encoding = None - -        if self.params.get('verbose', False): -            if subprocess_encoding: -                str_args = [ -                    a.decode(subprocess_encoding) if isinstance(a, bytes) else a -                    for a in args] -            else: -                str_args = args -            try: -                import pipes -                shell_quote = lambda args: ' '.join(map(pipes.quote, str_args)) -            except ImportError: -                shell_quote = repr -            self.to_screen(u'[debug] rtmpdump command line: ' + shell_quote(str_args)) - -        retval = run_rtmpdump(args) - -        while (retval == 2 or retval == 1) and not test: -            prevsize = os.path.getsize(encodeFilename(tmpfilename)) -            self.to_screen(u'[rtmpdump] %s bytes' % prevsize) -            time.sleep(5.0) # This seems to be needed -            retval = run_rtmpdump(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1]) -            cursize = os.path.getsize(encodeFilename(tmpfilename)) -            if prevsize == cursize and retval == 1: -                break -             # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those -            if prevsize == cursize and retval == 2 and cursize > 1024: -                self.to_screen(u'[rtmpdump] Could not download the whole video. This can happen for some advertisements.') -                retval = 0 -                break -        if retval == 0 or (test and retval == 2): -            fsize = os.path.getsize(encodeFilename(tmpfilename)) -            self.to_screen(u'[rtmpdump] %s bytes' % fsize) -            self.try_rename(tmpfilename, filename) -            self._hook_progress({ -                'downloaded_bytes': fsize, -                'total_bytes': fsize, -                'filename': filename, -                'status': 'finished', -            }) -            return True -        else: -            self.to_stderr(u"\n") -            self.report_error(u'rtmpdump exited with code %d' % retval) -            return False - -    def _download_with_mplayer(self, filename, url): -        self.report_destination(filename) -        tmpfilename = self.temp_name(filename) - -        args = ['mplayer', '-really-quiet', '-vo', 'null', '-vc', 'dummy', '-dumpstream', '-dumpfile', tmpfilename, url] -        # Check for mplayer first -        try: -            subprocess.call(['mplayer', '-h'], stdout=(open(os.path.devnull, 'w')), stderr=subprocess.STDOUT) -        except (OSError, IOError): -            self.report_error(u'MMS or RTSP download detected but "%s" could not be run' % args[0] ) -            return False - -        # Download using mplayer.  -        retval = subprocess.call(args) -        if retval == 0: -            fsize = os.path.getsize(encodeFilename(tmpfilename)) -            self.to_screen(u'\r[%s] %s bytes' % (args[0], fsize)) -            self.try_rename(tmpfilename, filename) -            self._hook_progress({ -                'downloaded_bytes': fsize, -                'total_bytes': fsize, -                'filename': filename, -                'status': 'finished', -            }) -            return True -        else: -            self.to_stderr(u"\n") -            self.report_error(u'mplayer exited with code %d' % retval) -            return False - -    def _download_m3u8_with_ffmpeg(self, filename, url): -        self.report_destination(filename) -        tmpfilename = self.temp_name(filename) - -        args = ['-y', '-i', url, '-f', 'mp4', '-c', 'copy', -            '-bsf:a', 'aac_adtstoasc', tmpfilename] - -        for program in ['avconv', 'ffmpeg']: -            try: -                subprocess.call([program, '-version'], stdout=(open(os.path.devnull, 'w')), stderr=subprocess.STDOUT) -                break -            except (OSError, IOError): -                pass -        else: -            self.report_error(u'm3u8 download detected but ffmpeg or avconv could not be found') -        cmd = [program] + args - -        retval = subprocess.call(cmd) -        if retval == 0: -            fsize = os.path.getsize(encodeFilename(tmpfilename)) -            self.to_screen(u'\r[%s] %s bytes' % (args[0], fsize)) -            self.try_rename(tmpfilename, filename) -            self._hook_progress({ -                'downloaded_bytes': fsize, -                'total_bytes': fsize, -                'filename': filename, -                'status': 'finished', -            }) -            return True -        else: -            self.to_stderr(u"\n") -            self.report_error(u'ffmpeg exited with code %d' % retval) -            return False +# Legacy file for backwards compatibility, use youtube_dl.downloader instead! +from .downloader import FileDownloader as RealFileDownloader +from .downloader import get_suitable_downloader +# This class reproduces the old behaviour of FileDownloader +class FileDownloader(RealFileDownloader):      def _do_download(self, filename, info_dict): -        url = info_dict['url'] - -        # Check file already present -        if self.params.get('continuedl', False) and os.path.isfile(encodeFilename(filename)) and not self.params.get('nopart', False): -            self.report_file_already_downloaded(filename) -            self._hook_progress({ -                'filename': filename, -                'status': 'finished', -                'total_bytes': os.path.getsize(encodeFilename(filename)), -            }) -            return True - -        # Attempt to download using rtmpdump -        if url.startswith('rtmp'): -            return self._download_with_rtmpdump(filename, url, -                                                info_dict.get('player_url', None), -                                                info_dict.get('page_url', None), -                                                info_dict.get('play_path', None), -                                                info_dict.get('tc_url', None), -                                                info_dict.get('rtmp_live', False), -                                                info_dict.get('rtmp_conn', None)) - -        # Attempt to download using mplayer -        if url.startswith('mms') or url.startswith('rtsp'): -            return self._download_with_mplayer(filename, url) - -        # m3u8 manifest are downloaded with ffmpeg -        if determine_ext(url) == u'm3u8': -            return self._download_m3u8_with_ffmpeg(filename, url) - -        tmpfilename = self.temp_name(filename) -        stream = None - -        # Do not include the Accept-Encoding header -        headers = {'Youtubedl-no-compression': 'True'} -        if 'user_agent' in info_dict: -            headers['Youtubedl-user-agent'] = info_dict['user_agent'] -        basic_request = compat_urllib_request.Request(url, None, headers) -        request = compat_urllib_request.Request(url, None, headers) - -        if self.params.get('test', False): -            request.add_header('Range','bytes=0-10240') - -        # Establish possible resume length -        if os.path.isfile(encodeFilename(tmpfilename)): -            resume_len = os.path.getsize(encodeFilename(tmpfilename)) -        else: -            resume_len = 0 - -        open_mode = 'wb' -        if resume_len != 0: -            if self.params.get('continuedl', False): -                self.report_resuming_byte(resume_len) -                request.add_header('Range','bytes=%d-' % resume_len) -                open_mode = 'ab' -            else: -                resume_len = 0 - -        count = 0 -        retries = self.params.get('retries', 0) -        while count <= retries: -            # Establish connection -            try: -                if count == 0 and 'urlhandle' in info_dict: -                    data = info_dict['urlhandle'] -                data = compat_urllib_request.urlopen(request) -                break -            except (compat_urllib_error.HTTPError, ) as err: -                if (err.code < 500 or err.code >= 600) and err.code != 416: -                    # Unexpected HTTP error -                    raise -                elif err.code == 416: -                    # Unable to resume (requested range not satisfiable) -                    try: -                        # Open the connection again without the range header -                        data = compat_urllib_request.urlopen(basic_request) -                        content_length = data.info()['Content-Length'] -                    except (compat_urllib_error.HTTPError, ) as err: -                        if err.code < 500 or err.code >= 600: -                            raise -                    else: -                        # Examine the reported length -                        if (content_length is not None and -                                (resume_len - 100 < int(content_length) < resume_len + 100)): -                            # The file had already been fully downloaded. -                            # Explanation to the above condition: in issue #175 it was revealed that -                            # YouTube sometimes adds or removes a few bytes from the end of the file, -                            # changing the file size slightly and causing problems for some users. So -                            # I decided to implement a suggested change and consider the file -                            # completely downloaded if the file size differs less than 100 bytes from -                            # the one in the hard drive. -                            self.report_file_already_downloaded(filename) -                            self.try_rename(tmpfilename, filename) -                            self._hook_progress({ -                                'filename': filename, -                                'status': 'finished', -                            }) -                            return True -                        else: -                            # The length does not match, we start the download over -                            self.report_unable_to_resume() -                            open_mode = 'wb' -                            break -            # Retry -            count += 1 -            if count <= retries: -                self.report_retry(count, retries) - -        if count > retries: -            self.report_error(u'giving up after %s retries' % retries) -            return False - -        data_len = data.info().get('Content-length', None) -        if data_len is not None: -            data_len = int(data_len) + resume_len -            min_data_len = self.params.get("min_filesize", None) -            max_data_len =  self.params.get("max_filesize", None) -            if min_data_len is not None and data_len < min_data_len: -                self.to_screen(u'\r[download] File is smaller than min-filesize (%s bytes < %s bytes). Aborting.' % (data_len, min_data_len)) -                return False -            if max_data_len is not None and data_len > max_data_len: -                self.to_screen(u'\r[download] File is larger than max-filesize (%s bytes > %s bytes). Aborting.' % (data_len, max_data_len)) -                return False - -        data_len_str = format_bytes(data_len) -        byte_counter = 0 + resume_len -        block_size = self.params.get('buffersize', 1024) -        start = time.time() -        while True: -            # Download and write -            before = time.time() -            data_block = data.read(block_size) -            after = time.time() -            if len(data_block) == 0: -                break -            byte_counter += len(data_block) - -            # Open file just in time -            if stream is None: -                try: -                    (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode) -                    assert stream is not None -                    filename = self.undo_temp_name(tmpfilename) -                    self.report_destination(filename) -                except (OSError, IOError) as err: -                    self.report_error(u'unable to open for writing: %s' % str(err)) -                    return False -            try: -                stream.write(data_block) -            except (IOError, OSError) as err: -                self.to_stderr(u"\n") -                self.report_error(u'unable to write data: %s' % str(err)) -                return False -            if not self.params.get('noresizebuffer', False): -                block_size = self.best_block_size(after - before, len(data_block)) - -            # Progress message -            speed = self.calc_speed(start, time.time(), byte_counter - resume_len) -            if data_len is None: -                eta = percent = None -            else: -                percent = self.calc_percent(byte_counter, data_len) -                eta = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len) -            self.report_progress(percent, data_len_str, speed, eta) - -            self._hook_progress({ -                'downloaded_bytes': byte_counter, -                'total_bytes': data_len, -                'tmpfilename': tmpfilename, -                'filename': filename, -                'status': 'downloading', -                'eta': eta, -                'speed': speed, -            }) - -            # Apply rate limit -            self.slow_down(start, byte_counter - resume_len) - -        if stream is None: -            self.to_stderr(u"\n") -            self.report_error(u'Did not get any data blocks') -            return False -        stream.close() -        self.report_finish(data_len_str, (time.time() - start)) -        if data_len is not None and byte_counter != data_len: -            raise ContentTooShortError(byte_counter, int(data_len)) -        self.try_rename(tmpfilename, filename) - -        # Update file modification time -        if self.params.get('updatetime', True): -            info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None)) - -        self._hook_progress({ -            'downloaded_bytes': byte_counter, -            'total_bytes': byte_counter, -            'filename': filename, -            'status': 'finished', -        }) - -        return True - -    def _hook_progress(self, status): +        real_fd = get_suitable_downloader(info_dict)(self.ydl, self.params)          for ph in self._progress_hooks: -            ph(status) - -    def add_progress_hook(self, ph): -        """ ph gets called on download progress, with a dictionary with the entries -        * filename: The final filename -        * status: One of "downloading" and "finished" - -        It can also have some of the following entries: - -        * downloaded_bytes: Bytes on disks -        * total_bytes: Total bytes, None if unknown -        * tmpfilename: The filename we're currently writing to -        * eta: The estimated time in seconds, None if unknown -        * speed: The download speed in bytes/second, None if unknown - -        Hooks are guaranteed to be called at least once (with status "finished") -        if the download is successful. -        """ -        self._progress_hooks.append(ph) +            real_fd.add_progress_hook(ph) +        return real_fd.download(filename, info_dict) diff --git a/youtube_dl/PostProcessor.py b/youtube_dl/PostProcessor.py index da95f1a87..481c07a94 100644 --- a/youtube_dl/PostProcessor.py +++ b/youtube_dl/PostProcessor.py @@ -10,6 +10,7 @@ from .utils import (      PostProcessingError,      shell_quote,      subtitles_filename, +    prepend_extension,  ) @@ -85,10 +86,10 @@ class FFmpegPostProcessor(PostProcessor):          files_cmd = []          for path in input_paths: -            files_cmd.extend(['-i', encodeFilename(path)]) +            files_cmd.extend(['-i', encodeFilename(path, True)])          cmd = ([self._exes['avconv'] or self._exes['ffmpeg'], '-y'] + files_cmd                 + opts + -               [encodeFilename(self._ffmpeg_filename_argument(out_path))]) +               [encodeFilename(self._ffmpeg_filename_argument(out_path), True)])          if self._downloader.params.get('verbose', False):              self._downloader.to_screen(u'[debug] ffmpeg command line: %s' % shell_quote(cmd)) @@ -122,7 +123,10 @@ class FFmpegExtractAudioPP(FFmpegPostProcessor):          if not self._exes['ffprobe'] and not self._exes['avprobe']:              raise PostProcessingError(u'ffprobe or avprobe not found. Please install one.')          try: -            cmd = [self._exes['avprobe'] or self._exes['ffprobe'], '-show_streams', encodeFilename(self._ffmpeg_filename_argument(path))] +            cmd = [ +                self._exes['avprobe'] or self._exes['ffprobe'], +                '-show_streams', +                encodeFilename(self._ffmpeg_filename_argument(path), True)]              handle = subprocess.Popen(cmd, stderr=compat_subprocess_get_DEVNULL(), stdout=subprocess.PIPE)              output = handle.communicate()[0]              if handle.wait() != 0: @@ -499,13 +503,11 @@ class FFmpegMetadataPP(FFmpegPostProcessor):              return True, info          filename = info['filepath'] -        ext = os.path.splitext(filename)[1][1:] -        temp_filename = filename + u'.temp' +        temp_filename = prepend_extension(filename, 'temp')          options = ['-c', 'copy']          for (name, value) in metadata.items():              options.extend(['-metadata', '%s=%s' % (name, value)]) -        options.extend(['-f', ext])          self._downloader.to_screen(u'[ffmpeg] Adding metadata to \'%s\'' % filename)          self.run_ffmpeg(filename, temp_filename, options) @@ -514,6 +516,13 @@ class FFmpegMetadataPP(FFmpegPostProcessor):          return True, info +class FFmpegMergerPP(FFmpegPostProcessor): +    def run(self, info): +        filename = info['filepath'] +        args = ['-c', 'copy'] +        self.run_ffmpeg_multiple_files(info['__files_to_merge'], filename, args) +        return True, info +  class XAttrMetadataPP(PostProcessor):      # diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 2a078adfb..5748ceaf3 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1,7 +1,7 @@  #!/usr/bin/env python  # -*- coding: utf-8 -*- -from __future__ import absolute_import +from __future__ import absolute_import, unicode_literals  import collections  import errno @@ -51,9 +51,11 @@ from .utils import (      write_json_file,      write_string,      YoutubeDLHandler, +    prepend_extension,  )  from .extractor import get_info_extractor, gen_extractors -from .FileDownloader import FileDownloader +from .downloader import get_suitable_downloader +from .PostProcessor import FFmpegMergerPP  from .version import __version__ @@ -148,6 +150,7 @@ class YoutubeDL(object):      socket_timeout:    Time to wait for unresponsive hosts, in seconds      bidi_workaround:   Work around buggy terminals without bidirectional text                         support, using fridibi +    debug_printtraffic:Print out sent and received HTTP traffic      The following parameters are not used by YoutubeDL itself, they are used by      the FileDownloader: @@ -164,6 +167,8 @@ class YoutubeDL(object):      def __init__(self, params=None):          """Create a FileDownloader object with the given options.""" +        if params is None: +            params = {}          self._ies = []          self._ies_instances = {}          self._pps = [] @@ -172,7 +177,7 @@ class YoutubeDL(object):          self._num_downloads = 0          self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]          self._err_file = sys.stderr -        self.params = {} if params is None else params +        self.params = params          if params.get('bidi_workaround', False):              try: @@ -183,15 +188,21 @@ class YoutubeDL(object):                      width_args = []                  else:                      width_args = ['-w', str(width)] -                self._fribidi = subprocess.Popen( -                    ['fribidi', '-c', 'UTF-8'] + width_args, +                sp_kwargs = dict(                      stdin=subprocess.PIPE,                      stdout=slave,                      stderr=self._err_file) -                self._fribidi_channel = os.fdopen(master, 'rb') +                try: +                    self._output_process = subprocess.Popen( +                        ['bidiv'] + width_args, **sp_kwargs +                    ) +                except OSError: +                    self._output_process = subprocess.Popen( +                        ['fribidi', '-c', 'UTF-8'] + width_args, **sp_kwargs) +                self._output_channel = os.fdopen(master, 'rb')              except OSError as ose:                  if ose.errno == 2: -                    self.report_warning(u'Could not find fribidi executable, ignoring --bidi-workaround . Make sure that  fribidi  is an executable file in one of the directories in your $PATH.') +                    self.report_warning('Could not find fribidi executable, ignoring --bidi-workaround . Make sure that  fribidi  is an executable file in one of the directories in your $PATH.')                  else:                      raise @@ -200,15 +211,13 @@ class YoutubeDL(object):                  and not params['restrictfilenames']):              # On Python 3, the Unicode filesystem API will throw errors (#1474)              self.report_warning( -                u'Assuming --restrict-filenames since file system encoding ' -                u'cannot encode all charactes. ' -                u'Set the LC_ALL environment variable to fix this.') +                'Assuming --restrict-filenames since file system encoding ' +                'cannot encode all charactes. ' +                'Set the LC_ALL environment variable to fix this.')              self.params['restrictfilenames'] = True -        self.fd = FileDownloader(self, self.params) -          if '%(stitle)s' in self.params.get('outtmpl', ''): -            self.report_warning(u'%(stitle)s is deprecated. Use the %(title)s and the --restrict-filenames flag(which also secures %(uploader)s et al) instead.') +            self.report_warning('%(stitle)s is deprecated. Use the %(title)s and the --restrict-filenames flag(which also secures %(uploader)s et al) instead.')          self._setup_opener() @@ -242,17 +251,22 @@ class YoutubeDL(object):          self._pps.append(pp)          pp.set_downloader(self) +    def add_progress_hook(self, ph): +        """Add the progress hook (currently only for the file downloader)""" +        self._progress_hooks.append(ph) +      def _bidi_workaround(self, message): -        if not hasattr(self, '_fribidi_channel'): +        if not hasattr(self, '_output_channel'):              return message -        assert type(message) == type(u'') -        line_count = message.count(u'\n') + 1 -        self._fribidi.stdin.write((message + u'\n').encode('utf-8')) -        self._fribidi.stdin.flush() -        res = u''.join(self._fribidi_channel.readline().decode('utf-8') +        assert hasattr(self, '_output_process') +        assert type(message) == type('') +        line_count = message.count('\n') + 1 +        self._output_process.stdin.write((message + '\n').encode('utf-8')) +        self._output_process.stdin.flush() +        res = ''.join(self._output_channel.readline().decode('utf-8')                         for _ in range(line_count)) -        return res[:-len(u'\n')] +        return res[:-len('\n')]      def to_screen(self, message, skip_eol=False):          """Print message to stdout if not in quiet mode.""" @@ -264,19 +278,19 @@ class YoutubeDL(object):              self.params['logger'].debug(message)          elif not check_quiet or not self.params.get('quiet', False):              message = self._bidi_workaround(message) -            terminator = [u'\n', u''][skip_eol] +            terminator = ['\n', ''][skip_eol]              output = message + terminator              write_string(output, self._screen_file)      def to_stderr(self, message):          """Print message to stderr.""" -        assert type(message) == type(u'') +        assert type(message) == type('')          if self.params.get('logger'):              self.params['logger'].error(message)          else:              message = self._bidi_workaround(message) -            output = message + u'\n' +            output = message + '\n'              write_string(output, self._err_file)      def to_console_title(self, message): @@ -287,21 +301,21 @@ class YoutubeDL(object):              # already of type unicode()              ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))          elif 'TERM' in os.environ: -            write_string(u'\033]0;%s\007' % message, self._screen_file) +            write_string('\033]0;%s\007' % message, self._screen_file)      def save_console_title(self):          if not self.params.get('consoletitle', False):              return          if 'TERM' in os.environ:              # Save the title on stack -            write_string(u'\033[22;0t', self._screen_file) +            write_string('\033[22;0t', self._screen_file)      def restore_console_title(self):          if not self.params.get('consoletitle', False):              return          if 'TERM' in os.environ:              # Restore the title from stack -            write_string(u'\033[23;0t', self._screen_file) +            write_string('\033[23;0t', self._screen_file)      def __enter__(self):          self.save_console_title() @@ -327,13 +341,13 @@ class YoutubeDL(object):          if self.params.get('verbose'):              if tb is None:                  if sys.exc_info()[0]:  # if .trouble has been called from an except block -                    tb = u'' +                    tb = ''                      if hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]: -                        tb += u''.join(traceback.format_exception(*sys.exc_info()[1].exc_info)) +                        tb += ''.join(traceback.format_exception(*sys.exc_info()[1].exc_info))                      tb += compat_str(traceback.format_exc())                  else:                      tb_data = traceback.format_list(traceback.extract_stack()) -                    tb = u''.join(tb_data) +                    tb = ''.join(tb_data)              self.to_stderr(tb)          if not self.params.get('ignoreerrors', False):              if sys.exc_info()[0] and hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]: @@ -349,10 +363,10 @@ class YoutubeDL(object):          If stderr is a tty file the 'WARNING:' will be colored          '''          if self._err_file.isatty() and os.name != 'nt': -            _msg_header = u'\033[0;33mWARNING:\033[0m' +            _msg_header = '\033[0;33mWARNING:\033[0m'          else: -            _msg_header = u'WARNING:' -        warning_message = u'%s %s' % (_msg_header, message) +            _msg_header = 'WARNING:' +        warning_message = '%s %s' % (_msg_header, message)          self.to_stderr(warning_message)      def report_error(self, message, tb=None): @@ -361,18 +375,18 @@ class YoutubeDL(object):          in red if stderr is a tty file.          '''          if self._err_file.isatty() and os.name != 'nt': -            _msg_header = u'\033[0;31mERROR:\033[0m' +            _msg_header = '\033[0;31mERROR:\033[0m'          else: -            _msg_header = u'ERROR:' -        error_message = u'%s %s' % (_msg_header, message) +            _msg_header = 'ERROR:' +        error_message = '%s %s' % (_msg_header, message)          self.trouble(error_message, tb)      def report_file_already_downloaded(self, file_name):          """Report file has already been fully downloaded."""          try: -            self.to_screen(u'[download] %s has already been downloaded' % file_name) +            self.to_screen('[download] %s has already been downloaded' % file_name)          except UnicodeEncodeError: -            self.to_screen(u'[download] The file has already been downloaded') +            self.to_screen('[download] The file has already been downloaded')      def increment_downloads(self):          """Increment the ordinal that assigns a number to each file.""" @@ -387,61 +401,61 @@ class YoutubeDL(object):              autonumber_size = self.params.get('autonumber_size')              if autonumber_size is None:                  autonumber_size = 5 -            autonumber_templ = u'%0' + str(autonumber_size) + u'd' +            autonumber_templ = '%0' + str(autonumber_size) + 'd'              template_dict['autonumber'] = autonumber_templ % self._num_downloads              if template_dict.get('playlist_index') is not None: -                template_dict['playlist_index'] = u'%05d' % template_dict['playlist_index'] +                template_dict['playlist_index'] = '%05d' % template_dict['playlist_index']              sanitize = lambda k, v: sanitize_filename(                  compat_str(v),                  restricted=self.params.get('restrictfilenames'), -                is_id=(k == u'id')) +                is_id=(k == 'id'))              template_dict = dict((k, sanitize(k, v))                                   for k, v in template_dict.items()                                   if v is not None) -            template_dict = collections.defaultdict(lambda: u'NA', template_dict) +            template_dict = collections.defaultdict(lambda: 'NA', template_dict)              tmpl = os.path.expanduser(self.params['outtmpl'])              filename = tmpl % template_dict              return filename          except ValueError as err: -            self.report_error(u'Error in output template: ' + str(err) + u' (encoding: ' + repr(preferredencoding()) + ')') +            self.report_error('Error in output template: ' + str(err) + ' (encoding: ' + repr(preferredencoding()) + ')')              return None      def _match_entry(self, info_dict):          """ Returns None iff the file should be downloaded """ -        video_title = info_dict.get('title', info_dict.get('id', u'video')) +        video_title = info_dict.get('title', info_dict.get('id', 'video'))          if 'title' in info_dict:              # This can happen when we're just evaluating the playlist              title = info_dict['title']              matchtitle = self.params.get('matchtitle', False)              if matchtitle:                  if not re.search(matchtitle, title, re.IGNORECASE): -                    return u'"' + title + '" title did not match pattern "' + matchtitle + '"' +                    return '"' + title + '" title did not match pattern "' + matchtitle + '"'              rejecttitle = self.params.get('rejecttitle', False)              if rejecttitle:                  if re.search(rejecttitle, title, re.IGNORECASE): -                    return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"' +                    return '"' + title + '" title matched reject pattern "' + rejecttitle + '"'          date = info_dict.get('upload_date', None)          if date is not None:              dateRange = self.params.get('daterange', DateRange())              if date not in dateRange: -                return u'%s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange) +                return '%s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange)          view_count = info_dict.get('view_count', None)          if view_count is not None:              min_views = self.params.get('min_views')              if min_views is not None and view_count < min_views: -                return u'Skipping %s, because it has not reached minimum view count (%d/%d)' % (video_title, view_count, min_views) +                return 'Skipping %s, because it has not reached minimum view count (%d/%d)' % (video_title, view_count, min_views)              max_views = self.params.get('max_views')              if max_views is not None and view_count > max_views: -                return u'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views) +                return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views)          age_limit = self.params.get('age_limit')          if age_limit is not None:              if age_limit < info_dict.get('age_limit', 0): -                return u'Skipping "' + title + '" because it is age restricted' +                return 'Skipping "' + title + '" because it is age restricted'          if self.in_download_archive(info_dict): -            return u'%s has already been recorded in archive' % video_title +            return '%s has already been recorded in archive' % video_title          return None      @staticmethod @@ -468,8 +482,8 @@ class YoutubeDL(object):                  continue              if not ie.working(): -                self.report_warning(u'The program functionality for this site has been marked as broken, ' -                                    u'and will probably not work.') +                self.report_warning('The program functionality for this site has been marked as broken, ' +                                    'and will probably not work.')              try:                  ie_result = ie.extract(url) @@ -502,7 +516,7 @@ class YoutubeDL(object):                  else:                      raise          else: -            self.report_error(u'no suitable InfoExtractor: %s' % url) +            self.report_error('no suitable InfoExtractor: %s' % url)      def process_ie_result(self, ie_result, download=True, extra_info={}):          """ @@ -533,7 +547,7 @@ class YoutubeDL(object):              def make_result(embedded_info):                  new_result = ie_result.copy()                  for f in ('_type', 'url', 'ext', 'player_url', 'formats', -                          'entries', 'urlhandle', 'ie_key', 'duration', +                          'entries', 'ie_key', 'duration',                            'subtitles', 'annotations', 'format',                            'thumbnail', 'thumbnails'):                      if f in new_result: @@ -553,7 +567,7 @@ class YoutubeDL(object):          elif result_type == 'playlist':              # We process each entry in the playlist              playlist = ie_result.get('title', None) or ie_result.get('id', None) -            self.to_screen(u'[download] Downloading playlist: %s' % playlist) +            self.to_screen('[download] Downloading playlist: %s' % playlist)              playlist_results = [] @@ -568,11 +582,11 @@ class YoutubeDL(object):              n_entries = len(entries)              self.to_screen( -                u"[%s] playlist '%s': Collected %d video ids (downloading %d of them)" % +                "[%s] playlist '%s': Collected %d video ids (downloading %d of them)" %                  (ie_result['extractor'], playlist, n_all_entries, n_entries))              for i, entry in enumerate(entries, 1): -                self.to_screen(u'[download] Downloading video #%s of %s' % (i, n_entries)) +                self.to_screen('[download] Downloading video #%s of %s' % (i, n_entries))                  extra = {                      'playlist': playlist,                      'playlist_index': i + playliststart, @@ -584,7 +598,7 @@ class YoutubeDL(object):                  reason = self._match_entry(entry)                  if reason is not None: -                    self.to_screen(u'[download] ' + reason) +                    self.to_screen('[download] ' + reason)                      continue                  entry_result = self.process_ie_result(entry, @@ -617,7 +631,7 @@ class YoutubeDL(object):          elif format_spec == 'worst':              return available_formats[0]          else: -            extensions = [u'mp4', u'flv', u'webm', u'3gp'] +            extensions = ['mp4', 'flv', 'webm', '3gp']              if format_spec in extensions:                  filter_f = lambda f: f['ext'] == format_spec              else: @@ -636,7 +650,7 @@ class YoutubeDL(object):              info_dict['playlist_index'] = None          # This extractors handle format selection themselves -        if info_dict['extractor'] in [u'youtube', u'Youku']: +        if info_dict['extractor'] in ['Youku']:              if download:                  self.process_info(info_dict)              return info_dict @@ -653,33 +667,32 @@ class YoutubeDL(object):              if format.get('format_id') is None:                  format['format_id'] = compat_str(i)              if format.get('format') is None: -                format['format'] = u'{id} - {res}{note}'.format( +                format['format'] = '{id} - {res}{note}'.format(                      id=format['format_id'],                      res=self.format_resolution(format), -                    note=u' ({0})'.format(format['format_note']) if format.get('format_note') is not None else '', +                    note=' ({0})'.format(format['format_note']) if format.get('format_note') is not None else '',                  )              # Automatically determine file extension if missing              if 'ext' not in format:                  format['ext'] = determine_ext(format['url']) -        if self.params.get('listformats', None): -            self.list_formats(info_dict) -            return -          format_limit = self.params.get('format_limit', None)          if format_limit:              formats = list(takewhile_inclusive(                  lambda f: f['format_id'] != format_limit, formats              )) -        if self.params.get('prefer_free_formats'): -            def _free_formats_key(f): -                try: -                    ext_ord = [u'flv', u'mp4', u'webm'].index(f['ext']) -                except ValueError: -                    ext_ord = -1 -                # We only compare the extension if they have the same height and width -                return (f.get('height'), f.get('width'), ext_ord) -            formats = sorted(formats, key=_free_formats_key) + +        # TODO Central sorting goes here + +        if formats[0] is not info_dict:  +            # only set the 'formats' fields if the original info_dict list them +            # otherwise we end up with a circular reference, the first (and unique) +            # element in the 'formats' field in info_dict is info_dict itself,  +            # wich can't be exported to json +            info_dict['formats'] = formats +        if self.params.get('listformats', None): +            self.list_formats(info_dict) +            return          req_format = self.params.get('format', 'best')          if req_format is None: @@ -689,21 +702,35 @@ class YoutubeDL(object):          if req_format in ('-1', 'all'):              formats_to_download = formats          else: -            # We can accept formats requestd in the format: 34/5/best, we pick +            # We can accept formats requested in the format: 34/5/best, we pick              # the first that is available, starting from left              req_formats = req_format.split('/')              for rf in req_formats: -                selected_format = self.select_format(rf, formats) +                if re.match(r'.+?\+.+?', rf) is not None: +                    # Two formats have been requested like '137+139' +                    format_1, format_2 = rf.split('+') +                    formats_info = (self.select_format(format_1, formats), +                        self.select_format(format_2, formats)) +                    if all(formats_info): +                        selected_format = { +                            'requested_formats': formats_info, +                            'format': rf, +                            'ext': formats_info[0]['ext'], +                        } +                    else: +                        selected_format = None +                else: +                    selected_format = self.select_format(rf, formats)                  if selected_format is not None:                      formats_to_download = [selected_format]                      break          if not formats_to_download: -            raise ExtractorError(u'requested format not available', +            raise ExtractorError('requested format not available',                                   expected=True)          if download:              if len(formats_to_download) > 1: -                self.to_screen(u'[info] %s: downloading video in %s formats' % (info_dict['id'], len(formats_to_download))) +                self.to_screen('[info] %s: downloading video in %s formats' % (info_dict['id'], len(formats_to_download)))              for format in formats_to_download:                  new_info = dict(info_dict)                  new_info.update(format) @@ -721,7 +748,7 @@ class YoutubeDL(object):          info_dict['fulltitle'] = info_dict['title']          if len(info_dict['title']) > 200: -            info_dict['title'] = info_dict['title'][:197] + u'...' +            info_dict['title'] = info_dict['title'][:197] + '...'          # Keep for backwards compatibility          info_dict['stitle'] = info_dict['title'] @@ -731,7 +758,7 @@ class YoutubeDL(object):          reason = self._match_entry(info_dict)          if reason is not None: -            self.to_screen(u'[download] ' + reason) +            self.to_screen('[download] ' + reason)              return          max_downloads = self.params.get('max_downloads') @@ -748,7 +775,7 @@ class YoutubeDL(object):              self.to_stdout(info_dict['id'])          if self.params.get('forceurl', False):              # For RTMP URLs, also include the playpath -            self.to_stdout(info_dict['url'] + info_dict.get('play_path', u'')) +            self.to_stdout(info_dict['url'] + info_dict.get('play_path', ''))          if self.params.get('forcethumbnail', False) and info_dict.get('thumbnail') is not None:              self.to_stdout(info_dict['thumbnail'])          if self.params.get('forcedescription', False) and info_dict.get('description') is not None: @@ -775,37 +802,37 @@ class YoutubeDL(object):              if dn != '' and not os.path.exists(dn):                  os.makedirs(dn)          except (OSError, IOError) as err: -            self.report_error(u'unable to create directory ' + compat_str(err)) +            self.report_error('unable to create directory ' + compat_str(err))              return          if self.params.get('writedescription', False): -            descfn = filename + u'.description' +            descfn = filename + '.description'              if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(descfn)): -                self.to_screen(u'[info] Video description is already present') +                self.to_screen('[info] Video description is already present')              else:                  try: -                    self.to_screen(u'[info] Writing video description to: ' + descfn) +                    self.to_screen('[info] Writing video description to: ' + descfn)                      with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:                          descfile.write(info_dict['description'])                  except (KeyError, TypeError): -                    self.report_warning(u'There\'s no description to write.') +                    self.report_warning('There\'s no description to write.')                  except (OSError, IOError): -                    self.report_error(u'Cannot write description file ' + descfn) +                    self.report_error('Cannot write description file ' + descfn)                      return          if self.params.get('writeannotations', False): -            annofn = filename + u'.annotations.xml' +            annofn = filename + '.annotations.xml'              if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(annofn)): -                self.to_screen(u'[info] Video annotations are already present') +                self.to_screen('[info] Video annotations are already present')              else:                  try: -                    self.to_screen(u'[info] Writing video annotations to: ' + annofn) +                    self.to_screen('[info] Writing video annotations to: ' + annofn)                      with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile:                          annofile.write(info_dict['annotations'])                  except (KeyError, TypeError): -                    self.report_warning(u'There are no annotations to write.') +                    self.report_warning('There are no annotations to write.')                  except (OSError, IOError): -                    self.report_error(u'Cannot write annotations file: ' + annofn) +                    self.report_error('Cannot write annotations file: ' + annofn)                      return          subtitles_are_requested = any([self.params.get('writesubtitles', False), @@ -823,46 +850,45 @@ class YoutubeDL(object):                  try:                      sub_filename = subtitles_filename(filename, sub_lang, sub_format)                      if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)): -                        self.to_screen(u'[info] Video subtitle %s.%s is already_present' % (sub_lang, sub_format)) +                        self.to_screen('[info] Video subtitle %s.%s is already_present' % (sub_lang, sub_format))                      else: -                        self.to_screen(u'[info] Writing video subtitles to: ' + sub_filename) +                        self.to_screen('[info] Writing video subtitles to: ' + sub_filename)                          with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile:                                  subfile.write(sub)                  except (OSError, IOError): -                    self.report_error(u'Cannot write subtitles file ' + descfn) +                    self.report_error('Cannot write subtitles file ' + descfn)                      return          if self.params.get('writeinfojson', False): -            infofn = os.path.splitext(filename)[0] + u'.info.json' +            infofn = os.path.splitext(filename)[0] + '.info.json'              if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(infofn)): -                self.to_screen(u'[info] Video description metadata is already present') +                self.to_screen('[info] Video description metadata is already present')              else: -                self.to_screen(u'[info] Writing video description metadata as JSON to: ' + infofn) +                self.to_screen('[info] Writing video description metadata as JSON to: ' + infofn)                  try: -                    json_info_dict = dict((k, v) for k, v in info_dict.items() if not k in ['urlhandle']) -                    write_json_file(json_info_dict, encodeFilename(infofn)) +                    write_json_file(info_dict, encodeFilename(infofn))                  except (OSError, IOError): -                    self.report_error(u'Cannot write metadata to JSON file ' + infofn) +                    self.report_error('Cannot write metadata to JSON file ' + infofn)                      return          if self.params.get('writethumbnail', False):              if info_dict.get('thumbnail') is not None: -                thumb_format = determine_ext(info_dict['thumbnail'], u'jpg') -                thumb_filename = os.path.splitext(filename)[0] + u'.' + thumb_format +                thumb_format = determine_ext(info_dict['thumbnail'], 'jpg') +                thumb_filename = os.path.splitext(filename)[0] + '.' + thumb_format                  if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(thumb_filename)): -                    self.to_screen(u'[%s] %s: Thumbnail is already present' % +                    self.to_screen('[%s] %s: Thumbnail is already present' %                                     (info_dict['extractor'], info_dict['id']))                  else: -                    self.to_screen(u'[%s] %s: Downloading thumbnail ...' % +                    self.to_screen('[%s] %s: Downloading thumbnail ...' %                                     (info_dict['extractor'], info_dict['id']))                      try:                          uf = compat_urllib_request.urlopen(info_dict['thumbnail'])                          with open(thumb_filename, 'wb') as thumbf:                              shutil.copyfileobj(uf, thumbf) -                        self.to_screen(u'[%s] %s: Writing thumbnail to: %s' % +                        self.to_screen('[%s] %s: Writing thumbnail to: %s' %                              (info_dict['extractor'], info_dict['id'], thumb_filename))                      except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: -                        self.report_warning(u'Unable to download thumbnail "%s": %s' % +                        self.report_warning('Unable to download thumbnail "%s": %s' %                              (info_dict['thumbnail'], compat_str(err)))          if not self.params.get('skip_download', False): @@ -870,21 +896,41 @@ class YoutubeDL(object):                  success = True              else:                  try: -                    success = self.fd._do_download(filename, info_dict) +                    def dl(name, info): +                        fd = get_suitable_downloader(info)(self, self.params) +                        for ph in self._progress_hooks: +                            fd.add_progress_hook(ph) +                        return fd.download(name, info) +                    if info_dict.get('requested_formats') is not None: +                        downloaded = [] +                        success = True +                        for f in info_dict['requested_formats']: +                            new_info = dict(info_dict) +                            new_info.update(f) +                            fname = self.prepare_filename(new_info) +                            fname = prepend_extension(fname, 'f%s' % f['format_id']) +                            downloaded.append(fname) +                            partial_success = dl(fname, new_info) +                            success = success and partial_success +                        info_dict['__postprocessors'] = [FFmpegMergerPP(self)] +                        info_dict['__files_to_merge'] = downloaded +                    else: +                        # Just a single file +                        success = dl(filename, info_dict)                  except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: -                    self.report_error(u'unable to download video data: %s' % str(err)) +                    self.report_error('unable to download video data: %s' % str(err))                      return                  except (OSError, IOError) as err:                      raise UnavailableVideoError(err)                  except (ContentTooShortError, ) as err: -                    self.report_error(u'content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded)) +                    self.report_error('content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))                      return              if success:                  try:                      self.post_process(filename, info_dict)                  except (PostProcessingError) as err: -                    self.report_error(u'postprocessing: %s' % str(err)) +                    self.report_error('postprocessing: %s' % str(err))                      return          self.record_download_archive(info_dict) @@ -901,9 +947,9 @@ class YoutubeDL(object):                  #It also downloads the videos                  self.extract_info(url)              except UnavailableVideoError: -                self.report_error(u'unable to download video') +                self.report_error('unable to download video')              except MaxDownloadsReached: -                self.to_screen(u'[info] Maximum number of downloaded files reached.') +                self.to_screen('[info] Maximum number of downloaded files reached.')                  raise          return self._download_retcode @@ -916,7 +962,7 @@ class YoutubeDL(object):          except DownloadError:              webpage_url = info.get('webpage_url')              if webpage_url is not None: -                self.report_warning(u'The info failed to download, trying with "%s"' % webpage_url) +                self.report_warning('The info failed to download, trying with "%s"' % webpage_url)                  return self.download([webpage_url])              else:                  raise @@ -927,7 +973,11 @@ class YoutubeDL(object):          info = dict(ie_info)          info['filepath'] = filename          keep_video = None -        for pp in self._pps: +        pps_chain = [] +        if ie_info.get('__postprocessors') is not None: +            pps_chain.extend(ie_info['__postprocessors']) +        pps_chain.extend(self._pps) +        for pp in pps_chain:              try:                  keep_video_wish, new_info = pp.run(info)                  if keep_video_wish is not None: @@ -940,10 +990,10 @@ class YoutubeDL(object):                  self.report_error(e.msg)          if keep_video is False and not self.params.get('keepvideo', False):              try: -                self.to_screen(u'Deleting original file %s (pass -k to keep)' % filename) +                self.to_screen('Deleting original file %s (pass -k to keep)' % filename)                  os.remove(encodeFilename(filename))              except (IOError, OSError): -                self.report_warning(u'Unable to remove downloaded video file') +                self.report_warning('Unable to remove downloaded video file')      def _make_archive_id(self, info_dict):          # Future-proof against any change in case @@ -954,7 +1004,7 @@ class YoutubeDL(object):                  extractor = info_dict.get('ie_key')  # key in a playlist          if extractor is None:              return None  # Incomplete video information -        return extractor.lower() + u' ' + info_dict['id'] +        return extractor.lower() + ' ' + info_dict['id']      def in_download_archive(self, info_dict):          fn = self.params.get('download_archive') @@ -982,53 +1032,59 @@ class YoutubeDL(object):          vid_id = self._make_archive_id(info_dict)          assert vid_id          with locked_file(fn, 'a', encoding='utf-8') as archive_file: -            archive_file.write(vid_id + u'\n') +            archive_file.write(vid_id + '\n')      @staticmethod      def format_resolution(format, default='unknown'):          if format.get('vcodec') == 'none':              return 'audio only' -        if format.get('_resolution') is not None: -            return format['_resolution'] +        if format.get('resolution') is not None: +            return format['resolution']          if format.get('height') is not None:              if format.get('width') is not None: -                res = u'%sx%s' % (format['width'], format['height']) +                res = '%sx%s' % (format['width'], format['height'])              else: -                res = u'%sp' % format['height'] +                res = '%sp' % format['height'] +        elif format.get('width') is not None: +            res = '?x%d' % format['width']          else:              res = default          return res      def list_formats(self, info_dict):          def format_note(fdict): -            res = u'' +            res = '' +            if fdict.get('ext') in ['f4f', 'f4m']: +                res += '(unsupported) '              if fdict.get('format_note') is not None: -                res += fdict['format_note'] + u' ' +                res += fdict['format_note'] + ' ' +            if fdict.get('tbr') is not None: +                res += '%4dk ' % fdict['tbr']              if (fdict.get('vcodec') is not None and                      fdict.get('vcodec') != 'none'): -                res += u'%-5s' % fdict['vcodec'] -            elif fdict.get('vbr') is not None: -                res += u'video' +                res += '%-5s@' % fdict['vcodec'] +            elif fdict.get('vbr') is not None and fdict.get('abr') is not None: +                res += 'video@'              if fdict.get('vbr') is not None: -                res += u'@%4dk' % fdict['vbr'] +                res += '%4dk' % fdict['vbr']              if fdict.get('acodec') is not None:                  if res: -                    res += u', ' -                res += u'%-5s' % fdict['acodec'] +                    res += ', ' +                res += '%-5s' % fdict['acodec']              elif fdict.get('abr') is not None:                  if res: -                    res += u', ' +                    res += ', '                  res += 'audio'              if fdict.get('abr') is not None: -                res += u'@%3dk' % fdict['abr'] +                res += '@%3dk' % fdict['abr']              if fdict.get('filesize') is not None:                  if res: -                    res += u', ' +                    res += ', '                  res += format_bytes(fdict['filesize'])              return res          def line(format, idlen=20): -            return ((u'%-' + compat_str(idlen + 1) + u's%-10s%-12s%s') % ( +            return (('%-' + compat_str(idlen + 1) + 's%-10s%-12s%s') % (                  format['format_id'],                  format['ext'],                  self.format_resolution(format), @@ -1036,7 +1092,7 @@ class YoutubeDL(object):              ))          formats = info_dict.get('formats', [info_dict]) -        idlen = max(len(u'format code'), +        idlen = max(len('format code'),                      max(len(f['format_id']) for f in formats))          formats_s = [line(f, idlen) for f in formats]          if len(formats) > 1: @@ -1044,10 +1100,10 @@ class YoutubeDL(object):              formats_s[-1] += (' ' if format_note(formats[-1]) else '') + '(best)'          header_line = line({ -            'format_id': u'format code', 'ext': u'extension', -            '_resolution': u'resolution', 'format_note': u'note'}, idlen=idlen) -        self.to_screen(u'[info] Available formats for %s:\n%s\n%s' % -                       (info_dict['id'], header_line, u"\n".join(formats_s))) +            'format_id': 'format code', 'ext': 'extension', +            'resolution': 'resolution', 'format_note': 'note'}, idlen=idlen) +        self.to_screen('[info] Available formats for %s:\n%s\n%s' % +                       (info_dict['id'], header_line, '\n'.join(formats_s)))      def urlopen(self, req):          """ Start an HTTP download """ @@ -1056,7 +1112,7 @@ class YoutubeDL(object):      def print_debug_header(self):          if not self.params.get('verbose'):              return -        write_string(u'[debug] youtube-dl version ' + __version__ + u'\n') +        write_string('[debug] youtube-dl version ' + __version__ + '\n')          try:              sp = subprocess.Popen(                  ['git', 'rev-parse', '--short', 'HEAD'], @@ -1065,20 +1121,20 @@ class YoutubeDL(object):              out, err = sp.communicate()              out = out.decode().strip()              if re.match('[0-9a-f]+', out): -                write_string(u'[debug] Git HEAD: ' + out + u'\n') +                write_string('[debug] Git HEAD: ' + out + '\n')          except:              try:                  sys.exc_clear()              except:                  pass -        write_string(u'[debug] Python version %s - %s' % -                     (platform.python_version(), platform_name()) + u'\n') +        write_string('[debug] Python version %s - %s' % +                     (platform.python_version(), platform_name()) + '\n')          proxy_map = {}          for handler in self._opener.handlers:              if hasattr(handler, 'proxies'):                  proxy_map.update(handler.proxies) -        write_string(u'[debug] Proxy map: ' + compat_str(proxy_map) + u'\n') +        write_string('[debug] Proxy map: ' + compat_str(proxy_map) + '\n')      def _setup_opener(self):          timeout_val = self.params.get('socket_timeout') @@ -1108,10 +1164,13 @@ class YoutubeDL(object):              if 'http' in proxies and 'https' not in proxies:                  proxies['https'] = proxies['http']          proxy_handler = compat_urllib_request.ProxyHandler(proxies) + +        debuglevel = 1 if self.params.get('debug_printtraffic') else 0          https_handler = make_HTTPS_handler( -            self.params.get('nocheckcertificate', False)) +            self.params.get('nocheckcertificate', False), debuglevel=debuglevel) +        ydlh = YoutubeDLHandler(debuglevel=debuglevel)          opener = compat_urllib_request.build_opener( -            https_handler, proxy_handler, cookie_processor, YoutubeDLHandler()) +            https_handler, proxy_handler, cookie_processor, ydlh)          # Delete the default user-agent header, which would otherwise apply in          # cases where our custom HTTP handler doesn't come into play          # (See https://github.com/rg3/youtube-dl/issues/1309 for details) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 03f98f504..ba243d4d2 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -45,6 +45,7 @@ __license__ = 'Public Domain'  import codecs  import getpass +import locale  import optparse  import os  import random @@ -187,16 +188,16 @@ def parseOpts(overrideArguments=None):      general.add_option('--no-check-certificate', action='store_true', dest='no_check_certificate', default=False, help='Suppress HTTPS certificate validation.')      general.add_option(          '--cache-dir', dest='cachedir', default=get_cachedir(), metavar='DIR', -        help='Location in the filesystem where youtube-dl can store downloaded information permanently. By default $XDG_CACHE_HOME/youtube-dl or ~/.cache/youtube-dl .') +        help='Location in the filesystem where youtube-dl can store some downloaded information permanently. By default $XDG_CACHE_HOME/youtube-dl or ~/.cache/youtube-dl . At the moment, only YouTube player files (for videos with obfuscated signatures) are cached, but that may change.')      general.add_option(          '--no-cache-dir', action='store_const', const=None, dest='cachedir',          help='Disable filesystem caching')      general.add_option(          '--socket-timeout', dest='socket_timeout', -        type=float, default=None, help=optparse.SUPPRESS_HELP) +        type=float, default=None, help=u'Time to wait before giving up, in seconds')      general.add_option(          '--bidi-workaround', dest='bidi_workaround', action='store_true', -        help=u'Work around terminals that lack bidirectional text support. Requires fribidi executable in PATH') +        help=u'Work around terminals that lack bidirectional text support. Requires bidiv or fribidi executable in PATH')      selection.add_option( @@ -335,7 +336,9 @@ def parseOpts(overrideArguments=None):      verbosity.add_option('--youtube-print-sig-code',              action='store_true', dest='youtube_print_sig_code', default=False,              help=optparse.SUPPRESS_HELP) - +    verbosity.add_option('--print-traffic', +            dest='debug_printtraffic', action='store_true', default=False, +            help=optparse.SUPPRESS_HELP)      filesystem.add_option('-t', '--title',              action='store_true', dest='usetitle', help='use title in file name (default)', default=False) @@ -477,6 +480,8 @@ def parseOpts(overrideArguments=None):              write_string(u'[debug] System config: ' + repr(_hide_login_info(systemConf)) + '\n')              write_string(u'[debug] User config: ' + repr(_hide_login_info(userConf)) + '\n')              write_string(u'[debug] Command-line args: ' + repr(_hide_login_info(commandLineConf)) + '\n') +            write_string(u'[debug] Encodings: locale %r, fs %r, out %r, pref: %r\n' % +                         (locale.getpreferredencoding(), sys.getfilesystemencoding(), sys.stdout.encoding, preferredencoding()))      return parser, opts, args @@ -521,6 +526,8 @@ def _real_main(argv=None):              sys.exit(u'ERROR: batch file could not be read')      all_urls = batchurls + args      all_urls = [url.strip() for url in all_urls] +    _enc = preferredencoding() +    all_urls = [url.decode(_enc, 'ignore') if isinstance(url, bytes) else url for url in all_urls]      extractors = gen_extractors() @@ -697,6 +704,7 @@ def _real_main(argv=None):          'proxy': opts.proxy,          'socket_timeout': opts.socket_timeout,          'bidi_workaround': opts.bidi_workaround, +        'debug_printtraffic': opts.debug_printtraffic,      }      with YoutubeDL(ydl_opts) as ydl: diff --git a/youtube_dl/downloader/__init__.py b/youtube_dl/downloader/__init__.py new file mode 100644 index 000000000..f19b490f1 --- /dev/null +++ b/youtube_dl/downloader/__init__.py @@ -0,0 +1,23 @@ +from .common import FileDownloader +from .hls import HlsFD +from .http import HttpFD +from .mplayer import MplayerFD +from .rtmp import RtmpFD + +from ..utils import ( +    determine_ext, +) + +def get_suitable_downloader(info_dict): +    """Get the downloader class that can handle the info dict.""" +    url = info_dict['url'] + +    if url.startswith('rtmp'): +        return RtmpFD +    if determine_ext(url) == u'm3u8': +        return HlsFD +    if url.startswith('mms') or url.startswith('rtsp'): +        return MplayerFD +    else: +        return HttpFD + diff --git a/youtube_dl/downloader/common.py b/youtube_dl/downloader/common.py new file mode 100644 index 000000000..10143d56a --- /dev/null +++ b/youtube_dl/downloader/common.py @@ -0,0 +1,317 @@ +import os +import re +import sys +import time + +from ..utils import ( +    encodeFilename, +    timeconvert, +    format_bytes, +) + + +class FileDownloader(object): +    """File Downloader class. + +    File downloader objects are the ones responsible of downloading the +    actual video file and writing it to disk. + +    File downloaders accept a lot of parameters. In order not to saturate +    the object constructor with arguments, it receives a dictionary of +    options instead. + +    Available options: + +    verbose:           Print additional info to stdout. +    quiet:             Do not print messages to stdout. +    ratelimit:         Download speed limit, in bytes/sec. +    retries:           Number of times to retry for HTTP error 5xx +    buffersize:        Size of download buffer in bytes. +    noresizebuffer:    Do not automatically resize the download buffer. +    continuedl:        Try to continue downloads if possible. +    noprogress:        Do not print the progress bar. +    logtostderr:       Log messages to stderr instead of stdout. +    consoletitle:      Display progress in console window's titlebar. +    nopart:            Do not use temporary .part files. +    updatetime:        Use the Last-modified header to set output file timestamps. +    test:              Download only first bytes to test the downloader. +    min_filesize:      Skip files smaller than this size +    max_filesize:      Skip files larger than this size + +    Subclasses of this one must re-define the real_download method. +    """ + +    params = None + +    def __init__(self, ydl, params): +        """Create a FileDownloader object with the given options.""" +        self.ydl = ydl +        self._progress_hooks = [] +        self.params = params + +    @staticmethod +    def format_seconds(seconds): +        (mins, secs) = divmod(seconds, 60) +        (hours, mins) = divmod(mins, 60) +        if hours > 99: +            return '--:--:--' +        if hours == 0: +            return '%02d:%02d' % (mins, secs) +        else: +            return '%02d:%02d:%02d' % (hours, mins, secs) + +    @staticmethod +    def calc_percent(byte_counter, data_len): +        if data_len is None: +            return None +        return float(byte_counter) / float(data_len) * 100.0 + +    @staticmethod +    def format_percent(percent): +        if percent is None: +            return '---.-%' +        return '%6s' % ('%3.1f%%' % percent) + +    @staticmethod +    def calc_eta(start, now, total, current): +        if total is None: +            return None +        dif = now - start +        if current == 0 or dif < 0.001: # One millisecond +            return None +        rate = float(current) / dif +        return int((float(total) - float(current)) / rate) + +    @staticmethod +    def format_eta(eta): +        if eta is None: +            return '--:--' +        return FileDownloader.format_seconds(eta) + +    @staticmethod +    def calc_speed(start, now, bytes): +        dif = now - start +        if bytes == 0 or dif < 0.001: # One millisecond +            return None +        return float(bytes) / dif + +    @staticmethod +    def format_speed(speed): +        if speed is None: +            return '%10s' % '---b/s' +        return '%10s' % ('%s/s' % format_bytes(speed)) + +    @staticmethod +    def best_block_size(elapsed_time, bytes): +        new_min = max(bytes / 2.0, 1.0) +        new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB +        if elapsed_time < 0.001: +            return int(new_max) +        rate = bytes / elapsed_time +        if rate > new_max: +            return int(new_max) +        if rate < new_min: +            return int(new_min) +        return int(rate) + +    @staticmethod +    def parse_bytes(bytestr): +        """Parse a string indicating a byte quantity into an integer.""" +        matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr) +        if matchobj is None: +            return None +        number = float(matchobj.group(1)) +        multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower()) +        return int(round(number * multiplier)) + +    def to_screen(self, *args, **kargs): +        self.ydl.to_screen(*args, **kargs) + +    def to_stderr(self, message): +        self.ydl.to_screen(message) + +    def to_console_title(self, message): +        self.ydl.to_console_title(message) + +    def trouble(self, *args, **kargs): +        self.ydl.trouble(*args, **kargs) + +    def report_warning(self, *args, **kargs): +        self.ydl.report_warning(*args, **kargs) + +    def report_error(self, *args, **kargs): +        self.ydl.report_error(*args, **kargs) + +    def slow_down(self, start_time, byte_counter): +        """Sleep if the download speed is over the rate limit.""" +        rate_limit = self.params.get('ratelimit', None) +        if rate_limit is None or byte_counter == 0: +            return +        now = time.time() +        elapsed = now - start_time +        if elapsed <= 0.0: +            return +        speed = float(byte_counter) / elapsed +        if speed > rate_limit: +            time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit) + +    def temp_name(self, filename): +        """Returns a temporary filename for the given filename.""" +        if self.params.get('nopart', False) or filename == u'-' or \ +                (os.path.exists(encodeFilename(filename)) and not os.path.isfile(encodeFilename(filename))): +            return filename +        return filename + u'.part' + +    def undo_temp_name(self, filename): +        if filename.endswith(u'.part'): +            return filename[:-len(u'.part')] +        return filename + +    def try_rename(self, old_filename, new_filename): +        try: +            if old_filename == new_filename: +                return +            os.rename(encodeFilename(old_filename), encodeFilename(new_filename)) +        except (IOError, OSError) as err: +            self.report_error(u'unable to rename file: %s' % str(err)) + +    def try_utime(self, filename, last_modified_hdr): +        """Try to set the last-modified time of the given file.""" +        if last_modified_hdr is None: +            return +        if not os.path.isfile(encodeFilename(filename)): +            return +        timestr = last_modified_hdr +        if timestr is None: +            return +        filetime = timeconvert(timestr) +        if filetime is None: +            return filetime +        # Ignore obviously invalid dates +        if filetime == 0: +            return +        try: +            os.utime(filename, (time.time(), filetime)) +        except: +            pass +        return filetime + +    def report_destination(self, filename): +        """Report destination filename.""" +        self.to_screen(u'[download] Destination: ' + filename) + +    def _report_progress_status(self, msg, is_last_line=False): +        fullmsg = u'[download] ' + msg +        if self.params.get('progress_with_newline', False): +            self.to_screen(fullmsg) +        else: +            if os.name == 'nt': +                prev_len = getattr(self, '_report_progress_prev_line_length', +                                   0) +                if prev_len > len(fullmsg): +                    fullmsg += u' ' * (prev_len - len(fullmsg)) +                self._report_progress_prev_line_length = len(fullmsg) +                clear_line = u'\r' +            else: +                clear_line = (u'\r\x1b[K' if sys.stderr.isatty() else u'\r') +            self.to_screen(clear_line + fullmsg, skip_eol=not is_last_line) +        self.to_console_title(u'youtube-dl ' + msg) + +    def report_progress(self, percent, data_len_str, speed, eta): +        """Report download progress.""" +        if self.params.get('noprogress', False): +            return +        if eta is not None: +            eta_str = self.format_eta(eta) +        else: +            eta_str = 'Unknown ETA' +        if percent is not None: +            percent_str = self.format_percent(percent) +        else: +            percent_str = 'Unknown %' +        speed_str = self.format_speed(speed) + +        msg = (u'%s of %s at %s ETA %s' % +               (percent_str, data_len_str, speed_str, eta_str)) +        self._report_progress_status(msg) + +    def report_progress_live_stream(self, downloaded_data_len, speed, elapsed): +        if self.params.get('noprogress', False): +            return +        downloaded_str = format_bytes(downloaded_data_len) +        speed_str = self.format_speed(speed) +        elapsed_str = FileDownloader.format_seconds(elapsed) +        msg = u'%s at %s (%s)' % (downloaded_str, speed_str, elapsed_str) +        self._report_progress_status(msg) + +    def report_finish(self, data_len_str, tot_time): +        """Report download finished.""" +        if self.params.get('noprogress', False): +            self.to_screen(u'[download] Download completed') +        else: +            self._report_progress_status( +                (u'100%% of %s in %s' % +                 (data_len_str, self.format_seconds(tot_time))), +                is_last_line=True) + +    def report_resuming_byte(self, resume_len): +        """Report attempt to resume at given byte.""" +        self.to_screen(u'[download] Resuming download at byte %s' % resume_len) + +    def report_retry(self, count, retries): +        """Report retry in case of HTTP error 5xx""" +        self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries)) + +    def report_file_already_downloaded(self, file_name): +        """Report file has already been fully downloaded.""" +        try: +            self.to_screen(u'[download] %s has already been downloaded' % file_name) +        except UnicodeEncodeError: +            self.to_screen(u'[download] The file has already been downloaded') + +    def report_unable_to_resume(self): +        """Report it was impossible to resume download.""" +        self.to_screen(u'[download] Unable to resume') + +    def download(self, filename, info_dict): +        """Download to a filename using the info from info_dict +        Return True on success and False otherwise +        """ +        # Check file already present +        if self.params.get('continuedl', False) and os.path.isfile(encodeFilename(filename)) and not self.params.get('nopart', False): +            self.report_file_already_downloaded(filename) +            self._hook_progress({ +                'filename': filename, +                'status': 'finished', +                'total_bytes': os.path.getsize(encodeFilename(filename)), +            }) +            return True + +        return self.real_download(filename, info_dict) + +    def real_download(self, filename, info_dict): +        """Real download process. Redefine in subclasses.""" +        raise NotImplementedError(u'This method must be implemented by sublcasses') + +    def _hook_progress(self, status): +        for ph in self._progress_hooks: +            ph(status) + +    def add_progress_hook(self, ph): +        """ ph gets called on download progress, with a dictionary with the entries +        * filename: The final filename +        * status: One of "downloading" and "finished" + +        It can also have some of the following entries: + +        * downloaded_bytes: Bytes on disks +        * total_bytes: Total bytes, None if unknown +        * tmpfilename: The filename we're currently writing to +        * eta: The estimated time in seconds, None if unknown +        * speed: The download speed in bytes/second, None if unknown + +        Hooks are guaranteed to be called at least once (with status "finished") +        if the download is successful. +        """ +        self._progress_hooks.append(ph) + diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py new file mode 100644 index 000000000..51e8c4778 --- /dev/null +++ b/youtube_dl/downloader/hls.py @@ -0,0 +1,44 @@ +import os +import subprocess + +from .common import FileDownloader +from ..utils import ( +    encodeFilename, +) + + +class HlsFD(FileDownloader): +    def real_download(self, filename, info_dict): +        url = info_dict['url'] +        self.report_destination(filename) +        tmpfilename = self.temp_name(filename) + +        args = ['-y', '-i', url, '-f', 'mp4', '-c', 'copy', +            '-bsf:a', 'aac_adtstoasc', tmpfilename] + +        for program in ['avconv', 'ffmpeg']: +            try: +                subprocess.call([program, '-version'], stdout=(open(os.path.devnull, 'w')), stderr=subprocess.STDOUT) +                break +            except (OSError, IOError): +                pass +        else: +            self.report_error(u'm3u8 download detected but ffmpeg or avconv could not be found') +        cmd = [program] + args + +        retval = subprocess.call(cmd) +        if retval == 0: +            fsize = os.path.getsize(encodeFilename(tmpfilename)) +            self.to_screen(u'\r[%s] %s bytes' % (args[0], fsize)) +            self.try_rename(tmpfilename, filename) +            self._hook_progress({ +                'downloaded_bytes': fsize, +                'total_bytes': fsize, +                'filename': filename, +                'status': 'finished', +            }) +            return True +        else: +            self.to_stderr(u"\n") +            self.report_error(u'ffmpeg exited with code %d' % retval) +            return False diff --git a/youtube_dl/downloader/http.py b/youtube_dl/downloader/http.py new file mode 100644 index 000000000..8407727ba --- /dev/null +++ b/youtube_dl/downloader/http.py @@ -0,0 +1,186 @@ +import os +import time + +from .common import FileDownloader +from ..utils import ( +    compat_urllib_request, +    compat_urllib_error, +    ContentTooShortError, + +    encodeFilename, +    sanitize_open, +    format_bytes, +) + + +class HttpFD(FileDownloader): +    def real_download(self, filename, info_dict): +        url = info_dict['url'] +        tmpfilename = self.temp_name(filename) +        stream = None + +        # Do not include the Accept-Encoding header +        headers = {'Youtubedl-no-compression': 'True'} +        if 'user_agent' in info_dict: +            headers['Youtubedl-user-agent'] = info_dict['user_agent'] +        basic_request = compat_urllib_request.Request(url, None, headers) +        request = compat_urllib_request.Request(url, None, headers) + +        if self.params.get('test', False): +            request.add_header('Range','bytes=0-10240') + +        # Establish possible resume length +        if os.path.isfile(encodeFilename(tmpfilename)): +            resume_len = os.path.getsize(encodeFilename(tmpfilename)) +        else: +            resume_len = 0 + +        open_mode = 'wb' +        if resume_len != 0: +            if self.params.get('continuedl', False): +                self.report_resuming_byte(resume_len) +                request.add_header('Range','bytes=%d-' % resume_len) +                open_mode = 'ab' +            else: +                resume_len = 0 + +        count = 0 +        retries = self.params.get('retries', 0) +        while count <= retries: +            # Establish connection +            try: +                data = compat_urllib_request.urlopen(request) +                break +            except (compat_urllib_error.HTTPError, ) as err: +                if (err.code < 500 or err.code >= 600) and err.code != 416: +                    # Unexpected HTTP error +                    raise +                elif err.code == 416: +                    # Unable to resume (requested range not satisfiable) +                    try: +                        # Open the connection again without the range header +                        data = compat_urllib_request.urlopen(basic_request) +                        content_length = data.info()['Content-Length'] +                    except (compat_urllib_error.HTTPError, ) as err: +                        if err.code < 500 or err.code >= 600: +                            raise +                    else: +                        # Examine the reported length +                        if (content_length is not None and +                                (resume_len - 100 < int(content_length) < resume_len + 100)): +                            # The file had already been fully downloaded. +                            # Explanation to the above condition: in issue #175 it was revealed that +                            # YouTube sometimes adds or removes a few bytes from the end of the file, +                            # changing the file size slightly and causing problems for some users. So +                            # I decided to implement a suggested change and consider the file +                            # completely downloaded if the file size differs less than 100 bytes from +                            # the one in the hard drive. +                            self.report_file_already_downloaded(filename) +                            self.try_rename(tmpfilename, filename) +                            self._hook_progress({ +                                'filename': filename, +                                'status': 'finished', +                            }) +                            return True +                        else: +                            # The length does not match, we start the download over +                            self.report_unable_to_resume() +                            open_mode = 'wb' +                            break +            # Retry +            count += 1 +            if count <= retries: +                self.report_retry(count, retries) + +        if count > retries: +            self.report_error(u'giving up after %s retries' % retries) +            return False + +        data_len = data.info().get('Content-length', None) +        if data_len is not None: +            data_len = int(data_len) + resume_len +            min_data_len = self.params.get("min_filesize", None) +            max_data_len =  self.params.get("max_filesize", None) +            if min_data_len is not None and data_len < min_data_len: +                self.to_screen(u'\r[download] File is smaller than min-filesize (%s bytes < %s bytes). Aborting.' % (data_len, min_data_len)) +                return False +            if max_data_len is not None and data_len > max_data_len: +                self.to_screen(u'\r[download] File is larger than max-filesize (%s bytes > %s bytes). Aborting.' % (data_len, max_data_len)) +                return False + +        data_len_str = format_bytes(data_len) +        byte_counter = 0 + resume_len +        block_size = self.params.get('buffersize', 1024) +        start = time.time() +        while True: +            # Download and write +            before = time.time() +            data_block = data.read(block_size) +            after = time.time() +            if len(data_block) == 0: +                break +            byte_counter += len(data_block) + +            # Open file just in time +            if stream is None: +                try: +                    (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode) +                    assert stream is not None +                    filename = self.undo_temp_name(tmpfilename) +                    self.report_destination(filename) +                except (OSError, IOError) as err: +                    self.report_error(u'unable to open for writing: %s' % str(err)) +                    return False +            try: +                stream.write(data_block) +            except (IOError, OSError) as err: +                self.to_stderr(u"\n") +                self.report_error(u'unable to write data: %s' % str(err)) +                return False +            if not self.params.get('noresizebuffer', False): +                block_size = self.best_block_size(after - before, len(data_block)) + +            # Progress message +            speed = self.calc_speed(start, time.time(), byte_counter - resume_len) +            if data_len is None: +                eta = percent = None +            else: +                percent = self.calc_percent(byte_counter, data_len) +                eta = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len) +            self.report_progress(percent, data_len_str, speed, eta) + +            self._hook_progress({ +                'downloaded_bytes': byte_counter, +                'total_bytes': data_len, +                'tmpfilename': tmpfilename, +                'filename': filename, +                'status': 'downloading', +                'eta': eta, +                'speed': speed, +            }) + +            # Apply rate limit +            self.slow_down(start, byte_counter - resume_len) + +        if stream is None: +            self.to_stderr(u"\n") +            self.report_error(u'Did not get any data blocks') +            return False +        stream.close() +        self.report_finish(data_len_str, (time.time() - start)) +        if data_len is not None and byte_counter != data_len: +            raise ContentTooShortError(byte_counter, int(data_len)) +        self.try_rename(tmpfilename, filename) + +        # Update file modification time +        if self.params.get('updatetime', True): +            info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None)) + +        self._hook_progress({ +            'downloaded_bytes': byte_counter, +            'total_bytes': byte_counter, +            'filename': filename, +            'status': 'finished', +        }) + +        return True diff --git a/youtube_dl/downloader/mplayer.py b/youtube_dl/downloader/mplayer.py new file mode 100644 index 000000000..67e0e4189 --- /dev/null +++ b/youtube_dl/downloader/mplayer.py @@ -0,0 +1,40 @@ +import os +import subprocess + +from .common import FileDownloader +from ..utils import ( +    encodeFilename, +) + + +class MplayerFD(FileDownloader): +    def real_download(self, filename, info_dict): +        url = info_dict['url'] +        self.report_destination(filename) +        tmpfilename = self.temp_name(filename) + +        args = ['mplayer', '-really-quiet', '-vo', 'null', '-vc', 'dummy', '-dumpstream', '-dumpfile', tmpfilename, url] +        # Check for mplayer first +        try: +            subprocess.call(['mplayer', '-h'], stdout=(open(os.path.devnull, 'w')), stderr=subprocess.STDOUT) +        except (OSError, IOError): +            self.report_error(u'MMS or RTSP download detected but "%s" could not be run' % args[0] ) +            return False + +        # Download using mplayer.  +        retval = subprocess.call(args) +        if retval == 0: +            fsize = os.path.getsize(encodeFilename(tmpfilename)) +            self.to_screen(u'\r[%s] %s bytes' % (args[0], fsize)) +            self.try_rename(tmpfilename, filename) +            self._hook_progress({ +                'downloaded_bytes': fsize, +                'total_bytes': fsize, +                'filename': filename, +                'status': 'finished', +            }) +            return True +        else: +            self.to_stderr(u"\n") +            self.report_error(u'mplayer exited with code %d' % retval) +            return False diff --git a/youtube_dl/downloader/rtmp.py b/youtube_dl/downloader/rtmp.py new file mode 100644 index 000000000..b165e396f --- /dev/null +++ b/youtube_dl/downloader/rtmp.py @@ -0,0 +1,178 @@ +import os +import re +import subprocess +import sys +import time + +from .common import FileDownloader +from ..utils import ( +    encodeFilename, +    format_bytes, +) + + +class RtmpFD(FileDownloader): +    def real_download(self, filename, info_dict): +        def run_rtmpdump(args): +            start = time.time() +            resume_percent = None +            resume_downloaded_data_len = None +            proc = subprocess.Popen(args, stderr=subprocess.PIPE) +            cursor_in_new_line = True +            proc_stderr_closed = False +            while not proc_stderr_closed: +                # read line from stderr +                line = u'' +                while True: +                    char = proc.stderr.read(1) +                    if not char: +                        proc_stderr_closed = True +                        break +                    if char in [b'\r', b'\n']: +                        break +                    line += char.decode('ascii', 'replace') +                if not line: +                    # proc_stderr_closed is True +                    continue +                mobj = re.search(r'([0-9]+\.[0-9]{3}) kB / [0-9]+\.[0-9]{2} sec \(([0-9]{1,2}\.[0-9])%\)', line) +                if mobj: +                    downloaded_data_len = int(float(mobj.group(1))*1024) +                    percent = float(mobj.group(2)) +                    if not resume_percent: +                        resume_percent = percent +                        resume_downloaded_data_len = downloaded_data_len +                    eta = self.calc_eta(start, time.time(), 100-resume_percent, percent-resume_percent) +                    speed = self.calc_speed(start, time.time(), downloaded_data_len-resume_downloaded_data_len) +                    data_len = None +                    if percent > 0: +                        data_len = int(downloaded_data_len * 100 / percent) +                    data_len_str = u'~' + format_bytes(data_len) +                    self.report_progress(percent, data_len_str, speed, eta) +                    cursor_in_new_line = False +                    self._hook_progress({ +                        'downloaded_bytes': downloaded_data_len, +                        'total_bytes': data_len, +                        'tmpfilename': tmpfilename, +                        'filename': filename, +                        'status': 'downloading', +                        'eta': eta, +                        'speed': speed, +                    }) +                else: +                    # no percent for live streams +                    mobj = re.search(r'([0-9]+\.[0-9]{3}) kB / [0-9]+\.[0-9]{2} sec', line) +                    if mobj: +                        downloaded_data_len = int(float(mobj.group(1))*1024) +                        time_now = time.time() +                        speed = self.calc_speed(start, time_now, downloaded_data_len) +                        self.report_progress_live_stream(downloaded_data_len, speed, time_now - start) +                        cursor_in_new_line = False +                        self._hook_progress({ +                            'downloaded_bytes': downloaded_data_len, +                            'tmpfilename': tmpfilename, +                            'filename': filename, +                            'status': 'downloading', +                            'speed': speed, +                        }) +                    elif self.params.get('verbose', False): +                        if not cursor_in_new_line: +                            self.to_screen(u'') +                        cursor_in_new_line = True +                        self.to_screen(u'[rtmpdump] '+line) +            proc.wait() +            if not cursor_in_new_line: +                self.to_screen(u'') +            return proc.returncode + +        url = info_dict['url'] +        player_url = info_dict.get('player_url', None) +        page_url = info_dict.get('page_url', None) +        play_path = info_dict.get('play_path', None) +        tc_url = info_dict.get('tc_url', None) +        live = info_dict.get('rtmp_live', False) +        conn = info_dict.get('rtmp_conn', None) + +        self.report_destination(filename) +        tmpfilename = self.temp_name(filename) +        test = self.params.get('test', False) + +        # Check for rtmpdump first +        try: +            subprocess.call(['rtmpdump', '-h'], stdout=(open(os.path.devnull, 'w')), stderr=subprocess.STDOUT) +        except (OSError, IOError): +            self.report_error(u'RTMP download detected but "rtmpdump" could not be run') +            return False + +        # Download using rtmpdump. rtmpdump returns exit code 2 when +        # the connection was interrumpted and resuming appears to be +        # possible. This is part of rtmpdump's normal usage, AFAIK. +        basic_args = ['rtmpdump', '--verbose', '-r', url, '-o', tmpfilename] +        if player_url is not None: +            basic_args += ['--swfVfy', player_url] +        if page_url is not None: +            basic_args += ['--pageUrl', page_url] +        if play_path is not None: +            basic_args += ['--playpath', play_path] +        if tc_url is not None: +            basic_args += ['--tcUrl', url] +        if test: +            basic_args += ['--stop', '1'] +        if live: +            basic_args += ['--live'] +        if conn: +            basic_args += ['--conn', conn] +        args = basic_args + [[], ['--resume', '--skip', '1']][self.params.get('continuedl', False)] + +        if sys.platform == 'win32' and sys.version_info < (3, 0): +            # Windows subprocess module does not actually support Unicode +            # on Python 2.x +            # See http://stackoverflow.com/a/9951851/35070 +            subprocess_encoding = sys.getfilesystemencoding() +            args = [a.encode(subprocess_encoding, 'ignore') for a in args] +        else: +            subprocess_encoding = None + +        if self.params.get('verbose', False): +            if subprocess_encoding: +                str_args = [ +                    a.decode(subprocess_encoding) if isinstance(a, bytes) else a +                    for a in args] +            else: +                str_args = args +            try: +                import pipes +                shell_quote = lambda args: ' '.join(map(pipes.quote, str_args)) +            except ImportError: +                shell_quote = repr +            self.to_screen(u'[debug] rtmpdump command line: ' + shell_quote(str_args)) + +        retval = run_rtmpdump(args) + +        while (retval == 2 or retval == 1) and not test: +            prevsize = os.path.getsize(encodeFilename(tmpfilename)) +            self.to_screen(u'[rtmpdump] %s bytes' % prevsize) +            time.sleep(5.0) # This seems to be needed +            retval = run_rtmpdump(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1]) +            cursize = os.path.getsize(encodeFilename(tmpfilename)) +            if prevsize == cursize and retval == 1: +                break +             # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those +            if prevsize == cursize and retval == 2 and cursize > 1024: +                self.to_screen(u'[rtmpdump] Could not download the whole video. This can happen for some advertisements.') +                retval = 0 +                break +        if retval == 0 or (test and retval == 2): +            fsize = os.path.getsize(encodeFilename(tmpfilename)) +            self.to_screen(u'[rtmpdump] %s bytes' % fsize) +            self.try_rename(tmpfilename, filename) +            self._hook_progress({ +                'downloaded_bytes': fsize, +                'total_bytes': fsize, +                'filename': filename, +                'status': 'finished', +            }) +            return True +        else: +            self.to_stderr(u"\n") +            self.report_error(u'rtmpdump exited with code %d' % retval) +            return False diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index a39a1e2f4..f1167989e 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -28,6 +28,7 @@ from .channel9 import Channel9IE  from .cinemassacre import CinemassacreIE  from .clipfish import ClipfishIE  from .clipsyndicate import ClipsyndicateIE +from .cmt import CMTIE  from .cnn import CNNIE  from .collegehumor import CollegeHumorIE  from .comedycentral import ComedyCentralIE, ComedyCentralShowsIE @@ -79,7 +80,10 @@ from .hotnewhiphop import HotNewHipHopIE  from .howcast import HowcastIE  from .hypem import HypemIE  from .ign import IGNIE, OneUPIE -from .imdb import ImdbIE +from .imdb import ( +    ImdbIE, +    ImdbListIE +)  from .ina import InaIE  from .infoq import InfoQIE  from .instagram import InstagramIE @@ -91,12 +95,18 @@ from .ivi import (  from .jeuxvideo import JeuxVideoIE  from .jukebox import JukeboxIE  from .justintv import JustinTVIE +from .jpopsukitv import JpopsukiIE  from .kankan import KankanIE  from .keezmovies import KeezMoviesIE  from .kickstarter import KickStarterIE  from .keek import KeekIE  from .liveleak import LiveLeakIE  from .livestream import LivestreamIE, LivestreamOriginalIE +from .lynda import ( +    LyndaIE, +    LyndaCourseIE +) +from .macgamestore import MacGameStoreIE  from .mdr import MDRIE  from .metacafe import MetacafeIE  from .metacritic import MetacriticIE @@ -189,6 +199,7 @@ from .vimeo import (      VimeoUserIE,      VimeoAlbumIE,      VimeoGroupsIE, +    VimeoReviewIE,  )  from .vine import VineIE  from .viki import VikiIE diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py index ef5644aa5..e7361ae06 100644 --- a/youtube_dl/extractor/appletrailers.py +++ b/youtube_dl/extractor/appletrailers.py @@ -110,7 +110,8 @@ class AppleTrailersIE(InfoExtractor):                      'width': format['width'],                      'height': int(format['height']),                  }) -            formats = sorted(formats, key=lambda f: (f['height'], f['width'])) + +            self._sort_formats(formats)              playlist.append({                  '_type': 'video', diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index 3a32c14c5..15aee2786 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -10,14 +10,14 @@ from ..utils import (  class BandcampIE(InfoExtractor): -    IE_NAME = u'Bandcamp'      _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'      _TESTS = [{          u'url': u'http://youtube-dl.bandcamp.com/track/youtube-dl-test-song',          u'file': u'1812978515.mp3', -        u'md5': u'cdeb30cdae1921719a3cbcab696ef53c', +        u'md5': u'c557841d5e50261777a6585648adf439',          u'info_dict': { -            u"title": u"youtube-dl test song \"'/\\\u00e4\u21ad" +            u"title": u"youtube-dl  \"'/\\\u00e4\u21ad - youtube-dl test song \"'/\\\u00e4\u21ad", +            u"duration": 10,          },          u'skip': u'There is a limit of 200 free downloads / month for the test song'      }] @@ -30,29 +30,42 @@ class BandcampIE(InfoExtractor):          m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)          if m_download is None:              m_trackinfo = re.search(r'trackinfo: (.+),\s*?\n', webpage) -        if m_trackinfo: -            json_code = m_trackinfo.group(1) -            data = json.loads(json_code) +            if m_trackinfo: +                json_code = m_trackinfo.group(1) +                data = json.loads(json_code) +                d = data[0] + +                duration = int(round(d['duration'])) +                formats = [] +                for format_id, format_url in d['file'].items(): +                    ext, _, abr_str = format_id.partition('-') + +                    formats.append({ +                        'format_id': format_id, +                        'url': format_url, +                        'ext': format_id.partition('-')[0], +                        'vcodec': 'none', +                        'acodec': format_id.partition('-')[0], +                        'abr': int(format_id.partition('-')[2]), +                    }) + +                self._sort_formats(formats) -            for d in data: -                formats = [{ -                    'format_id': 'format_id', -                    'url': format_url, -                    'ext': format_id.partition('-')[0] -                } for format_id, format_url in sorted(d['file'].items())]                  return {                      'id': compat_str(d['id']),                      'title': d['title'],                      'formats': formats, +                    'duration': duration,                  } -        else: -            raise ExtractorError(u'No free songs found') +            else: +                raise ExtractorError(u'No free songs found')          download_link = m_download.group(1) -        id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',  -                       webpage, re.MULTILINE|re.DOTALL).group('id') +        video_id = re.search( +            r'var TralbumData = {(.*?)id: (?P<id>\d*?)$', +            webpage, re.MULTILINE | re.DOTALL).group('id') -        download_webpage = self._download_webpage(download_link, id, +        download_webpage = self._download_webpage(download_link, video_id,                                                    'Downloading free downloads page')          # We get the dictionary of the track from some javascrip code          info = re.search(r'items: (.*?),$', @@ -66,21 +79,21 @@ class BandcampIE(InfoExtractor):          m_url = re.match(re_url, initial_url)          #We build the url we will use to get the final track url          # This url is build in Bandcamp in the script download_bunde_*.js -        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts')) +        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), video_id, m_url.group('ts'))          final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')          # If we could correctly generate the .rand field the url would be          #in the "download_url" key          final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1) -        track_info = {'id':id, -                      'title' : info[u'title'], -                      'ext' :   'mp3', -                      'url' :   final_url, -                      'thumbnail' : info[u'thumb_url'], -                      'uploader' :  info[u'artist'] -                      } - -        return [track_info] +        return { +            'id': video_id, +            'title': info[u'title'], +            'ext': 'mp3', +            'vcodec': 'none', +            'url': final_url, +            'thumbnail': info[u'thumb_url'], +            'uploader': info[u'artist'], +        }  class BandcampAlbumIE(InfoExtractor): @@ -117,7 +130,7 @@ class BandcampAlbumIE(InfoExtractor):          webpage = self._download_webpage(url, title)          tracks_paths = re.findall(r'<a href="(.*?)" itemprop="url">', webpage)          if not tracks_paths: -            raise ExtractorError(u'The page doesn\'t contain any track') +            raise ExtractorError(u'The page doesn\'t contain any tracks')          entries = [              self.url_result(compat_urlparse.urljoin(url, t_path), ie=BandcampIE.ie_key())              for t_path in tracks_paths] diff --git a/youtube_dl/extractor/blinkx.py b/youtube_dl/extractor/blinkx.py index 144ce64cc..0229840a3 100644 --- a/youtube_dl/extractor/blinkx.py +++ b/youtube_dl/extractor/blinkx.py @@ -61,9 +61,10 @@ class BlinkxIE(InfoExtractor):              elif m['type'] in ('flv', 'mp4'):                  vcodec = remove_start(m['vcodec'], 'ff')                  acodec = remove_start(m['acodec'], 'ff') +                tbr = (int(m['vbr']) + int(m['abr'])) // 1000                  format_id = (u'%s-%sk-%s' %                               (vcodec, -                              (int(m['vbr']) + int(m['abr'])) // 1000, +                              tbr,                                m['w']))                  formats.append({                      'format_id': format_id, @@ -72,10 +73,12 @@ class BlinkxIE(InfoExtractor):                      'acodec': acodec,                      'abr': int(m['abr']) // 1000,                      'vbr': int(m['vbr']) // 1000, +                    'tbr': tbr,                      'width': int(m['w']),                      'height': int(m['h']),                  }) -        formats.sort(key=lambda f: (f['width'], f['vbr'], f['abr'])) + +        self._sort_formats(formats)          return {              'id': display_id, diff --git a/youtube_dl/extractor/bliptv.py b/youtube_dl/extractor/bliptv.py index 5e33a69df..3ce9b5324 100644 --- a/youtube_dl/extractor/bliptv.py +++ b/youtube_dl/extractor/bliptv.py @@ -1,16 +1,15 @@ +from __future__ import unicode_literals +  import datetime  import json -import os  import re  import socket  from .common import InfoExtractor  from ..utils import (      compat_http_client, -    compat_parse_qs,      compat_str,      compat_urllib_error, -    compat_urllib_parse_urlparse,      compat_urllib_request,      ExtractorError, @@ -22,42 +21,35 @@ class BlipTVIE(InfoExtractor):      """Information extractor for blip.tv"""      _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$' -    _URL_EXT = r'^.*\.([a-z0-9]+)$' -    IE_NAME = u'blip.tv' +      _TEST = { -        u'url': u'http://blip.tv/cbr/cbr-exclusive-gotham-city-imposters-bats-vs-jokerz-short-3-5796352', -        u'file': u'5779306.m4v', -        u'md5': u'80baf1ec5c3d2019037c1c707d676b9f', -        u'info_dict': { -            u"upload_date": u"20111205",  -            u"description": u"md5:9bc31f227219cde65e47eeec8d2dc596",  -            u"uploader": u"Comic Book Resources - CBR TV",  -            u"title": u"CBR EXCLUSIVE: \"Gotham City Imposters\" Bats VS Jokerz Short 3" +        'url': 'http://blip.tv/cbr/cbr-exclusive-gotham-city-imposters-bats-vs-jokerz-short-3-5796352', +        'file': '5779306.mov', +        'md5': 'c6934ad0b6acf2bd920720ec888eb812', +        'info_dict': { +            'upload_date': '20111205', +            'description': 'md5:9bc31f227219cde65e47eeec8d2dc596', +            'uploader': 'Comic Book Resources - CBR TV', +            'title': 'CBR EXCLUSIVE: "Gotham City Imposters" Bats VS Jokerz Short 3',          }      }      def report_direct_download(self, title):          """Report information extraction.""" -        self.to_screen(u'%s: Direct download detected' % title) +        self.to_screen('%s: Direct download detected' % title)      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url)          if mobj is None: -            raise ExtractorError(u'Invalid URL: %s' % url) +            raise ExtractorError('Invalid URL: %s' % url)          # See https://github.com/rg3/youtube-dl/issues/857 -        api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url) -        if api_mobj is not None: -            url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id') -        urlp = compat_urllib_parse_urlparse(url) -        if urlp.path.startswith('/play/'): -            response = self._request_webpage(url, None, False) -            redirecturl = response.geturl() -            rurlp = compat_urllib_parse_urlparse(redirecturl) -            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2] -            url = 'http://blip.tv/a/a-' + file_id -            return self._real_extract(url) - +        embed_mobj = re.search(r'^(?:https?://)?(?:\w+\.)?blip\.tv/(?:play/|api\.swf#)([a-zA-Z0-9]+)', url) +        if embed_mobj: +            info_url = 'http://blip.tv/play/%s.x?p=1' % embed_mobj.group(1) +            info_page = self._download_webpage(info_url, embed_mobj.group(1)) +            video_id = self._search_regex(r'data-episode-id="(\d+)', info_page,  'video_id') +            return self.url_result('http://blip.tv/a/a-' + video_id, 'BlipTV')          if '?' in url:              cchar = '&' @@ -67,67 +59,55 @@ class BlipTVIE(InfoExtractor):          request = compat_urllib_request.Request(json_url)          request.add_header('User-Agent', 'iTunes/10.6.1')          self.report_extraction(mobj.group(1)) -        info = None          urlh = self._request_webpage(request, None, False, -            u'unable to download video info webpage') -        if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download -            basename = url.split('/')[-1] -            title,ext = os.path.splitext(basename) -            title = title.decode('UTF-8') -            ext = ext.replace('.', '') -            self.report_direct_download(title) -            info = { -                'id': title, -                'url': url, -                'uploader': None, -                'upload_date': None, -                'title': title, -                'ext': ext, -                'urlhandle': urlh +            'unable to download video info webpage') + +        try: +            json_code_bytes = urlh.read() +            json_code = json_code_bytes.decode('utf-8') +        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: +            raise ExtractorError('Unable to read video info webpage: %s' % compat_str(err)) + +        try: +            json_data = json.loads(json_code) +            if 'Post' in json_data: +                data = json_data['Post'] +            else: +                data = json_data + +            upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d') +            formats = [] +            if 'additionalMedia' in data: +                for f in sorted(data['additionalMedia'], key=lambda f: int(f['media_height'])): +                    if not int(f['media_width']): # filter m3u8 +                        continue +                    formats.append({ +                        'url': f['url'], +                        'format_id': f['role'], +                        'width': int(f['media_width']), +                        'height': int(f['media_height']), +                    }) +            else: +                formats.append({ +                    'url': data['media']['url'], +                    'width': int(data['media']['width']), +                    'height': int(data['media']['height']), +                }) + +            self._sort_formats(formats) + +            return { +                'id': compat_str(data['item_id']), +                'uploader': data['display_name'], +                'upload_date': upload_date, +                'title': data['title'], +                'thumbnail': data['thumbnailUrl'], +                'description': data['description'], +                'user_agent': 'iTunes/10.6.1', +                'formats': formats,              } -        if info is None: # Regular URL -            try: -                json_code_bytes = urlh.read() -                json_code = json_code_bytes.decode('utf-8') -            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: -                raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err)) - -            try: -                json_data = json.loads(json_code) -                if 'Post' in json_data: -                    data = json_data['Post'] -                else: -                    data = json_data - -                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d') -                if 'additionalMedia' in data: -                    formats = sorted(data['additionalMedia'], key=lambda f: int(f['media_height'])) -                    best_format = formats[-1] -                    video_url = best_format['url'] -                else: -                    video_url = data['media']['url'] -                umobj = re.match(self._URL_EXT, video_url) -                if umobj is None: -                    raise ValueError('Can not determine filename extension') -                ext = umobj.group(1) - -                info = { -                    'id': compat_str(data['item_id']), -                    'url': video_url, -                    'uploader': data['display_name'], -                    'upload_date': upload_date, -                    'title': data['title'], -                    'ext': ext, -                    'format': data['media']['mimeType'], -                    'thumbnail': data['thumbnailUrl'], -                    'description': data['description'], -                    'player_url': data['embedUrl'], -                    'user_agent': 'iTunes/10.6.1', -                } -            except (ValueError,KeyError) as err: -                raise ExtractorError(u'Unable to parse video information: %s' % repr(err)) - -        return [info] +        except (ValueError, KeyError) as err: +            raise ExtractorError('Unable to parse video information: %s' % repr(err))  class BlipTVUserIE(InfoExtractor): @@ -135,19 +115,19 @@ class BlipTVUserIE(InfoExtractor):      _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'      _PAGE_SIZE = 12 -    IE_NAME = u'blip.tv:user' +    IE_NAME = 'blip.tv:user'      def _real_extract(self, url):          # Extract username          mobj = re.match(self._VALID_URL, url)          if mobj is None: -            raise ExtractorError(u'Invalid URL: %s' % url) +            raise ExtractorError('Invalid URL: %s' % url)          username = mobj.group(1)          page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1' -        page = self._download_webpage(url, username, u'Downloading user page') +        page = self._download_webpage(url, username, 'Downloading user page')          mobj = re.search(r'data-users-id="([^"]+)"', page)          page_base = page_base % mobj.group(1) @@ -163,7 +143,7 @@ class BlipTVUserIE(InfoExtractor):          while True:              url = page_base + "&page=" + str(pagenum)              page = self._download_webpage(url, username, -                                          u'Downloading video ids from page %d' % pagenum) +                                          'Downloading video ids from page %d' % pagenum)              # Extract video identifiers              ids_in_page = [] @@ -185,6 +165,6 @@ class BlipTVUserIE(InfoExtractor):              pagenum += 1 -        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids] +        urls = ['http://blip.tv/%s' % video_id for video_id in video_ids]          url_entries = [self.url_result(vurl, 'BlipTV') for vurl in urls]          return [self.playlist_result(url_entries, playlist_title = username)] diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index f7f0041c0..4ba3f7c42 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -1,4 +1,5 @@  # encoding: utf-8 +from __future__ import unicode_literals  import re  import json @@ -13,6 +14,7 @@ from ..utils import (      compat_urllib_request,      ExtractorError, +    unsmuggle_url,  ) @@ -24,47 +26,47 @@ class BrightcoveIE(InfoExtractor):      _TESTS = [          {              # From http://www.8tv.cat/8aldia/videos/xavier-sala-i-martin-aquesta-tarda-a-8-al-dia/ -            u'url': u'http://c.brightcove.com/services/viewer/htmlFederated?playerID=1654948606001&flashID=myExperience&%40videoPlayer=2371591881001', -            u'file': u'2371591881001.mp4', -            u'md5': u'5423e113865d26e40624dce2e4b45d95', -            u'note': u'Test Brightcove downloads and detection in GenericIE', -            u'info_dict': { -                u'title': u'Xavier Sala i Martín: “Un banc que no presta és un banc zombi que no serveix per a res”', -                u'uploader': u'8TV', -                u'description': u'md5:a950cc4285c43e44d763d036710cd9cd', +            'url': 'http://c.brightcove.com/services/viewer/htmlFederated?playerID=1654948606001&flashID=myExperience&%40videoPlayer=2371591881001', +            'file': '2371591881001.mp4', +            'md5': '5423e113865d26e40624dce2e4b45d95', +            'note': 'Test Brightcove downloads and detection in GenericIE', +            'info_dict': { +                'title': 'Xavier Sala i Martín: “Un banc que no presta és un banc zombi que no serveix per a res”', +                'uploader': '8TV', +                'description': 'md5:a950cc4285c43e44d763d036710cd9cd',              }          },          {              # From http://medianetwork.oracle.com/video/player/1785452137001 -            u'url': u'http://c.brightcove.com/services/viewer/htmlFederated?playerID=1217746023001&flashID=myPlayer&%40videoPlayer=1785452137001', -            u'file': u'1785452137001.flv', -            u'info_dict': { -                u'title': u'JVMLS 2012: Arrays 2.0 - Opportunities and Challenges', -                u'description': u'John Rose speaks at the JVM Language Summit, August 1, 2012.', -                u'uploader': u'Oracle', +            'url': 'http://c.brightcove.com/services/viewer/htmlFederated?playerID=1217746023001&flashID=myPlayer&%40videoPlayer=1785452137001', +            'file': '1785452137001.flv', +            'info_dict': { +                'title': 'JVMLS 2012: Arrays 2.0 - Opportunities and Challenges', +                'description': 'John Rose speaks at the JVM Language Summit, August 1, 2012.', +                'uploader': 'Oracle',              },          },          {              # From http://mashable.com/2013/10/26/thermoelectric-bracelet-lets-you-control-your-body-temperature/ -            u'url': u'http://c.brightcove.com/services/viewer/federated_f9?&playerID=1265504713001&publisherID=AQ%7E%7E%2CAAABBzUwv1E%7E%2CxP-xFHVUstiMFlNYfvF4G9yFnNaqCw_9&videoID=2750934548001', -            u'info_dict': { -                u'id': u'2750934548001', -                u'ext': u'mp4', -                u'title': u'This Bracelet Acts as a Personal Thermostat', -                u'description': u'md5:547b78c64f4112766ccf4e151c20b6a0', -                u'uploader': u'Mashable', +            'url': 'http://c.brightcove.com/services/viewer/federated_f9?&playerID=1265504713001&publisherID=AQ%7E%7E%2CAAABBzUwv1E%7E%2CxP-xFHVUstiMFlNYfvF4G9yFnNaqCw_9&videoID=2750934548001', +            'info_dict': { +                'id': '2750934548001', +                'ext': 'mp4', +                'title': 'This Bracelet Acts as a Personal Thermostat', +                'description': 'md5:547b78c64f4112766ccf4e151c20b6a0', +                'uploader': 'Mashable',              },          },          {              # test that the default referer works              # from http://national.ballet.ca/interact/video/Lost_in_Motion_II/ -            u'url': u'http://link.brightcove.com/services/player/bcpid756015033001?bckey=AQ~~,AAAApYJi_Ck~,GxhXCegT1Dp39ilhXuxMJxasUhVNZiil&bctid=2878862109001', -            u'info_dict': { -                u'id': u'2878862109001', -                u'ext': u'mp4', -                u'title': u'Lost in Motion II', -                u'description': u'md5:363109c02998fee92ec02211bd8000df', -                u'uploader': u'National Ballet of Canada', +            'url': 'http://link.brightcove.com/services/player/bcpid756015033001?bckey=AQ~~,AAAApYJi_Ck~,GxhXCegT1Dp39ilhXuxMJxasUhVNZiil&bctid=2878862109001', +            'info_dict': { +                'id': '2878862109001', +                'ext': 'mp4', +                'title': 'Lost in Motion II', +                'description': 'md5:363109c02998fee92ec02211bd8000df', +                'uploader': 'National Ballet of Canada',              },          },      ] @@ -80,10 +82,10 @@ class BrightcoveIE(InfoExtractor):          object_str = re.sub(r'(<param name="[^"]+" value="[^"]+")>',                              lambda m: m.group(1) + '/>', object_str)          # Fix up some stupid XML, see https://github.com/rg3/youtube-dl/issues/1608 -        object_str = object_str.replace(u'<--', u'<!--') +        object_str = object_str.replace('<--', '<!--')          object_doc = xml.etree.ElementTree.fromstring(object_str) -        assert u'BrightcoveExperience' in object_doc.attrib['class'] +        assert 'BrightcoveExperience' in object_doc.attrib['class']          params = {'flashID': object_doc.attrib['id'],                    'playerID': find_xpath_attr(object_doc, './param', 'name', 'playerID').attrib['value'],                    } @@ -120,6 +122,8 @@ class BrightcoveIE(InfoExtractor):              return None      def _real_extract(self, url): +        url, smuggled_data = unsmuggle_url(url, {}) +          # Change the 'videoId' and others field to '@videoPlayer'          url = re.sub(r'(?<=[?&])(videoI(d|D)|bctid)', '%40videoPlayer', url)          # Change bckey (used by bcove.me urls) to playerKey @@ -130,9 +134,10 @@ class BrightcoveIE(InfoExtractor):          videoPlayer = query.get('@videoPlayer')          if videoPlayer: -            return self._get_video_info(videoPlayer[0], query_str, query, -                # We set the original url as the default 'Referer' header -                referer=url) +            # We set the original url as the default 'Referer' header +            referer = smuggled_data.get('Referer', url) +            return self._get_video_info( +                videoPlayer[0], query_str, query, referer=referer)          else:              player_key = query['playerKey']              return self._get_playlist_info(player_key[0]) @@ -156,11 +161,11 @@ class BrightcoveIE(InfoExtractor):      def _get_playlist_info(self, player_key):          playlist_info = self._download_webpage(self._PLAYLIST_URL_TEMPLATE % player_key, -                                               player_key, u'Downloading playlist information') +                                               player_key, 'Downloading playlist information')          json_data = json.loads(playlist_info)          if 'videoList' not in json_data: -            raise ExtractorError(u'Empty playlist') +            raise ExtractorError('Empty playlist')          playlist_info = json_data['videoList']          videos = [self._extract_video_info(video_info) for video_info in playlist_info['mediaCollectionDTO']['videoDTOs']] @@ -189,5 +194,5 @@ class BrightcoveIE(InfoExtractor):                  'url': video_info['FLVFullLengthURL'],              })          else: -            raise ExtractorError(u'Unable to extract video url for %s' % info['id']) +            raise ExtractorError('Unable to extract video url for %s' % info['id'])          return info diff --git a/youtube_dl/extractor/channel9.py b/youtube_dl/extractor/channel9.py index ae70ea229..574881b70 100644 --- a/youtube_dl/extractor/channel9.py +++ b/youtube_dl/extractor/channel9.py @@ -76,14 +76,18 @@ class Channel9IE(InfoExtractor):              </div>)?                                                # File size part may be missing          '''          # Extract known formats -        formats = [{'url': x.group('url'), -                 'format_id': x.group('quality'), -                 'format_note': x.group('note'), -                 'format': '%s (%s)' % (x.group('quality'), x.group('note')),  -                 'filesize': self._restore_bytes(x.group('filesize')), # File size is approximate -                 } for x in list(re.finditer(FORMAT_REGEX, html)) if x.group('quality') in self._known_formats] -        # Sort according to known formats list -        formats.sort(key=lambda fmt: self._known_formats.index(fmt['format_id'])) +        formats = [{ +            'url': x.group('url'), +            'format_id': x.group('quality'), +            'format_note': x.group('note'), +            'format': u'%s (%s)' % (x.group('quality'), x.group('note')), +            'filesize': self._restore_bytes(x.group('filesize')), # File size is approximate +            'preference': self._known_formats.index(x.group('quality')), +            'vcodec': 'none' if x.group('note') == 'Audio only' else None, +        } for x in list(re.finditer(FORMAT_REGEX, html)) if x.group('quality') in self._known_formats] + +        self._sort_formats(formats) +          return formats      def _extract_title(self, html): diff --git a/youtube_dl/extractor/cmt.py b/youtube_dl/extractor/cmt.py new file mode 100644 index 000000000..88e0e9aba --- /dev/null +++ b/youtube_dl/extractor/cmt.py @@ -0,0 +1,19 @@ +from .mtv import MTVIE + +class CMTIE(MTVIE): +    IE_NAME = u'cmt.com' +    _VALID_URL = r'https?://www\.cmt\.com/videos/.+?/(?P<videoid>[^/]+)\.jhtml' +    _FEED_URL = 'http://www.cmt.com/sitewide/apps/player/embed/rss/' + +    _TESTS = [ +        { +            u'url': u'http://www.cmt.com/videos/garth-brooks/989124/the-call-featuring-trisha-yearwood.jhtml#artist=30061', +            u'md5': u'e6b7ef3c4c45bbfae88061799bbba6c2', +            u'info_dict': { +                u'id': u'989124', +                u'ext': u'mp4', +                u'title': u'Garth Brooks - "The Call (featuring Trisha Yearwood)"', +                u'description': u'Blame It All On My Roots', +            }, +        }, +    ] diff --git a/youtube_dl/extractor/cnn.py b/youtube_dl/extractor/cnn.py index a034bb2fb..ecac5e0e9 100644 --- a/youtube_dl/extractor/cnn.py +++ b/youtube_dl/extractor/cnn.py @@ -1,7 +1,10 @@  import re  from .common import InfoExtractor -from ..utils import determine_ext +from ..utils import ( +    int_or_none, +    parse_duration, +)  class CNNIE(InfoExtractor): @@ -15,6 +18,8 @@ class CNNIE(InfoExtractor):          u'info_dict': {              u'title': u'Nadal wins 8th French Open title',              u'description': u'World Sport\'s Amanda Davies chats with 2013 French Open champion Rafael Nadal.', +            u'duration': 135, +            u'upload_date': u'20130609',          },      },      { @@ -35,22 +40,58 @@ class CNNIE(InfoExtractor):          info = self._download_xml(info_url, page_title)          formats = [] +        rex = re.compile(r'''(?x) +            (?P<width>[0-9]+)x(?P<height>[0-9]+) +            (?:_(?P<bitrate>[0-9]+)k)? +        ''')          for f in info.findall('files/file'): -            mf = re.match(r'(\d+)x(\d+)(?:_(.*)k)?',f.attrib['bitrate']) -            if mf is not None: -                formats.append((int(mf.group(1)), int(mf.group(2)), int(mf.group(3) or 0), f.text)) -        formats = sorted(formats) -        (_,_,_, video_path) = formats[-1] -        video_url = 'http://ht.cdn.turner.com/cnn/big%s' % video_path +            video_url = 'http://ht.cdn.turner.com/cnn/big%s' % (f.text.strip()) +            fdct = { +                'format_id': f.attrib['bitrate'], +                'url': video_url, +            } + +            mf = rex.match(f.attrib['bitrate']) +            if mf: +                fdct['width'] = int(mf.group('width')) +                fdct['height'] = int(mf.group('height')) +                fdct['tbr'] = int_or_none(mf.group('bitrate')) +            else: +                mf = rex.search(f.text) +                if mf: +                    fdct['width'] = int(mf.group('width')) +                    fdct['height'] = int(mf.group('height')) +                    fdct['tbr'] = int_or_none(mf.group('bitrate')) +                else: +                    mi = re.match(r'ios_(audio|[0-9]+)$', f.attrib['bitrate']) +                    if mi: +                        if mi.group(1) == 'audio': +                            fdct['vcodec'] = 'none' +                            fdct['ext'] = 'm4a' +                        else: +                            fdct['tbr'] = int(mi.group(1)) + +            formats.append(fdct) + +        self._sort_formats(formats)          thumbnails = sorted([((int(t.attrib['height']),int(t.attrib['width'])), t.text) for t in info.findall('images/image')])          thumbs_dict = [{'resolution': res, 'url': t_url} for (res, t_url) in thumbnails] -        return {'id': info.attrib['id'], -                'title': info.find('headline').text, -                'url': video_url, -                'ext': determine_ext(video_url), -                'thumbnail': thumbnails[-1][1], -                'thumbnails': thumbs_dict, -                'description': info.find('description').text, -                } +        metas_el = info.find('metas') +        upload_date = ( +            metas_el.attrib.get('version') if metas_el is not None else None) + +        duration_el = info.find('length') +        duration = parse_duration(duration_el.text) + +        return { +            'id': info.attrib['id'], +            'title': info.find('headline').text, +            'formats': formats, +            'thumbnail': thumbnails[-1][1], +            'thumbnails': thumbs_dict, +            'description': info.find('description').text, +            'duration': duration, +            'upload_date': upload_date, +        } diff --git a/youtube_dl/extractor/collegehumor.py b/youtube_dl/extractor/collegehumor.py index b27c1dfc5..d10b7bd0c 100644 --- a/youtube_dl/extractor/collegehumor.py +++ b/youtube_dl/extractor/collegehumor.py @@ -1,82 +1,68 @@ +from __future__ import unicode_literals + +import json  import re  from .common import InfoExtractor -from ..utils import ( -    compat_urllib_parse_urlparse, -    determine_ext, - -    ExtractorError, -)  class CollegeHumorIE(InfoExtractor):      _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/(video|embed|e)/(?P<videoid>[0-9]+)/?(?P<shorttitle>.*)$'      _TESTS = [{ -        u'url': u'http://www.collegehumor.com/video/6902724/comic-con-cosplay-catastrophe', -        u'file': u'6902724.mp4', -        u'md5': u'1264c12ad95dca142a9f0bf7968105a0', -        u'info_dict': { -            u'title': u'Comic-Con Cosplay Catastrophe', -            u'description': u'Fans get creative this year at San Diego.  Too creative.  And yes, that\'s really Joss Whedon.', +        'url': 'http://www.collegehumor.com/video/6902724/comic-con-cosplay-catastrophe', +        'file': '6902724.mp4', +        'md5': 'dcc0f5c1c8be98dc33889a191f4c26bd', +        'info_dict': { +            'title': 'Comic-Con Cosplay Catastrophe', +            'description': 'Fans get creative this year at San Diego.  Too', +            'age_limit': 13,          },      },      { -        u'url': u'http://www.collegehumor.com/video/3505939/font-conference', -        u'file': u'3505939.mp4', -        u'md5': u'c51ca16b82bb456a4397987791a835f5', -        u'info_dict': { -            u'title': u'Font Conference', -            u'description': u'This video wasn\'t long enough, so we made it double-spaced.', +        'url': 'http://www.collegehumor.com/video/3505939/font-conference', +        'file': '3505939.mp4', +        'md5': '72fa701d8ef38664a4dbb9e2ab721816', +        'info_dict': { +            'title': 'Font Conference', +            'description': 'This video wasn\'t long enough, so we made it double-spaced.', +            'age_limit': 10,          },      }]      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url) -        if mobj is None: -            raise ExtractorError(u'Invalid URL: %s' % url)          video_id = mobj.group('videoid') -        info = { -            'id': video_id, -            'uploader': None, -            'upload_date': None, -        } - -        self.report_extraction(video_id) -        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id -        mdoc = self._download_xml(xmlUrl, video_id, -                                         u'Downloading info XML', -                                         u'Unable to download video info XML') +        jsonUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id + '.json' +        data = json.loads(self._download_webpage( +            jsonUrl, video_id, 'Downloading info JSON')) +        vdata = data['video'] -        try: -            videoNode = mdoc.findall('./video')[0] -            youtubeIdNode = videoNode.find('./youtubeID') -            if youtubeIdNode is not None: -                return self.url_result(youtubeIdNode.text, 'Youtube') -            info['description'] = videoNode.findall('./description')[0].text -            info['title'] = videoNode.findall('./caption')[0].text -            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text -            next_url = videoNode.findall('./file')[0].text -        except IndexError: -            raise ExtractorError(u'Invalid metadata XML file') - -        if next_url.endswith(u'manifest.f4m'): -            manifest_url = next_url + '?hdcore=2.10.3' -            adoc = self._download_xml(manifest_url, video_id, -                                         u'Downloading XML manifest', -                                         u'Unable to download video info XML') - -            try: -                video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text -            except IndexError: -                raise ExtractorError(u'Invalid manifest file') -            url_pr = compat_urllib_parse_urlparse(info['thumbnail']) -            info['url'] = url_pr.scheme + '://' + url_pr.netloc + video_id[:-2].replace('.csmil','').replace(',','') -            info['ext'] = 'mp4' +        AGE_LIMITS = {'nc17': 18, 'r': 18, 'pg13': 13, 'pg': 10, 'g': 0} +        rating = vdata.get('rating') +        if rating: +            age_limit = AGE_LIMITS.get(rating.lower())          else: -            # Old-style direct links -            info['url'] = next_url -            info['ext'] = determine_ext(info['url']) +            age_limit = None  # None = No idea + +        PREFS = {'high_quality': 2, 'low_quality': 0} +        formats = [] +        for format_key in ('mp4', 'webm'): +            for qname, qurl in vdata[format_key].items(): +                formats.append({ +                    'format_id': format_key + '_' + qname, +                    'url': qurl, +                    'format': format_key, +                    'preference': PREFS.get(qname), +                }) +        self._sort_formats(formats) -        return info +        return { +            'id': video_id, +            'title': vdata['title'], +            'description': vdata.get('description'), +            'thumbnail': vdata.get('thumbnail'), +            'formats': formats, +            'age_limit': age_limit, +        } diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py index a54ce3ee7..27bd8256e 100644 --- a/youtube_dl/extractor/comedycentral.py +++ b/youtube_dl/extractor/comedycentral.py @@ -12,7 +12,9 @@ from ..utils import (  class ComedyCentralIE(MTVServicesInfoExtractor): -    _VALID_URL = r'https?://(?:www.)?comedycentral.com/(video-clips|episodes|cc-studios)/(?P<title>.*)' +    _VALID_URL = r'''(?x)https?://(?:www.)?comedycentral.com/ +        (video-clips|episodes|cc-studios|video-collections) +        /(?P<title>.*)'''      _FEED_URL = u'http://comedycentral.com/feeds/mrss/'      _TEST = { diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index ba46a7bc7..2a5e8076c 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -9,6 +9,7 @@ import xml.etree.ElementTree  from ..utils import (      compat_http_client,      compat_urllib_error, +    compat_urllib_parse_urlparse,      compat_str,      clean_html, @@ -37,10 +38,12 @@ class InfoExtractor(object):      id:             Video identifier.      title:          Video title, unescaped. -    Additionally, it must contain either a formats entry or url and ext: +    Additionally, it must contain either a formats entry or a url one: -    formats:        A list of dictionaries for each format available, it must -                    be ordered from worst to best quality. Potential fields: +    formats:        A list of dictionaries for each format available, ordered +                    from worst to best quality. + +                    Potential fields:                      * url        Mandatory. The URL of the video file                      * ext        Will be calculated from url if missing                      * format     A human-readable description of the format @@ -48,23 +51,36 @@ class InfoExtractor(object):                                   Calculated from the format_id, width, height.                                   and format_note fields if missing.                      * format_id  A short description of the format -                                 ("mp4_h264_opus" or "19") +                                 ("mp4_h264_opus" or "19"). +                                Technically optional, but strongly recommended.                      * format_note Additional info about the format                                   ("3D" or "DASH video")                      * width      Width of the video, if known                      * height     Height of the video, if known +                    * resolution Textual description of width and height +                    * tbr        Average bitrate of audio and video in KBit/s                      * abr        Average audio bitrate in KBit/s                      * acodec     Name of the audio codec in use                      * vbr        Average video bitrate in KBit/s                      * vcodec     Name of the video codec in use                      * filesize   The number of bytes, if known in advance                      * player_url SWF Player URL (used for rtmpdump). +                    * protocol   The protocol that will be used for the actual +                                 download, lower-case. +                                 "http", "https", "rtsp", "rtmp" or so. +                    * preference Order number of this format. If this field is +                                 present and not None, the formats get sorted +                                 by this field. +                                 -1 for default (order by other properties), +                                 -2 or smaller for less than default. +                    * quality    Order number of the video quality of this +                                 format, irrespective of the file format. +                                 -1 for default (order by other properties), +                                 -2 or smaller for less than default.      url:            Final video URL.      ext:            Video filename extension.      format:         The video format, defaults to ext (used for --get-format)      player_url:     SWF Player URL (used for rtmpdump). -    urlhandle:      [internal] The urlHandle to be used to download the file, -                    like returned by urllib.request.urlopen      The following fields are optional: @@ -244,6 +260,11 @@ class InfoExtractor(object):              xml_string = transform_source(xml_string)          return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8')) +    def report_warning(self, msg, video_id=None): +        idstr = u'' if video_id is None else u'%s: ' % video_id +        self._downloader.report_warning( +            u'[%s] %s%s' % (self.IE_NAME, idstr, msg)) +      def to_screen(self, msg):          """Print msg to screen, prefixing it with '[ie_name]'"""          self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg)) @@ -361,7 +382,7 @@ class InfoExtractor(object):      @staticmethod      def _og_regexes(prop):          content_re = r'content=(?:"([^>]+?)"|\'(.+?)\')' -        property_re = r'property=[\'"]og:%s[\'"]' % re.escape(prop) +        property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)          template = r'<meta[^>]+?%s[^>]+?%s'          return [              template % (property_re, content_re), @@ -426,6 +447,57 @@ class InfoExtractor(object):          }          return RATING_TABLE.get(rating.lower(), None) +    def _sort_formats(self, formats): +        def _formats_key(f): +            # TODO remove the following workaround +            from ..utils import determine_ext +            if not f.get('ext') and 'url' in f: +                f['ext'] = determine_ext(f['url']) + +            preference = f.get('preference') +            if preference is None: +                proto = f.get('protocol') +                if proto is None: +                    proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme + +                preference = 0 if proto in ['http', 'https'] else -0.1 +                if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported +                    preference -= 0.5 + +            if f.get('vcodec') == 'none':  # audio only +                if self._downloader.params.get('prefer_free_formats'): +                    ORDER = [u'aac', u'mp3', u'm4a', u'webm', u'ogg', u'opus'] +                else: +                    ORDER = [u'webm', u'opus', u'ogg', u'mp3', u'aac', u'm4a'] +                ext_preference = 0 +                try: +                    audio_ext_preference = ORDER.index(f['ext']) +                except ValueError: +                    audio_ext_preference = -1 +            else: +                if self._downloader.params.get('prefer_free_formats'): +                    ORDER = [u'flv', u'mp4', u'webm'] +                else: +                    ORDER = [u'webm', u'flv', u'mp4'] +                try: +                    ext_preference = ORDER.index(f['ext']) +                except ValueError: +                    ext_preference = -1 +                audio_ext_preference = 0 + +            return ( +                preference, +                f.get('quality') if f.get('quality') is not None else -1, +                f.get('height') if f.get('height') is not None else -1, +                f.get('width') if f.get('width') is not None else -1, +                ext_preference, +                f.get('vbr') if f.get('vbr') is not None else -1, +                f.get('abr') if f.get('abr') is not None else -1, +                audio_ext_preference, +                f.get('filesize') if f.get('filesize') is not None else -1, +                f.get('format_id'), +            ) +        formats.sort(key=_formats_key)  class SearchInfoExtractor(InfoExtractor): diff --git a/youtube_dl/extractor/cspan.py b/youtube_dl/extractor/cspan.py index d5730684d..a2cbd4d8d 100644 --- a/youtube_dl/extractor/cspan.py +++ b/youtube_dl/extractor/cspan.py @@ -1,20 +1,25 @@ +from __future__ import unicode_literals + +import json  import re  from .common import InfoExtractor  from ..utils import ( -    compat_urllib_parse, +    unescapeHTML,  ) +  class CSpanIE(InfoExtractor):      _VALID_URL = r'http://www\.c-spanvideo\.org/program/(.*)' +    IE_DESC = 'C-SPAN'      _TEST = { -        u'url': u'http://www.c-spanvideo.org/program/HolderonV', -        u'file': u'315139.flv', -        u'md5': u'74a623266956f69e4df0068ab6c80fe4', -        u'info_dict': { -            u"title": u"Attorney General Eric Holder on Voting Rights Act Decision" +        'url': 'http://www.c-spanvideo.org/program/HolderonV', +        'file': '315139.mp4', +        'md5': '8e44ce11f0f725527daccc453f553eb0', +        'info_dict': { +            'title': 'Attorney General Eric Holder on Voting Rights Act Decision', +            'description': 'Attorney General Eric Holder spoke to reporters following the Supreme Court decision in [Shelby County v. Holder] in which the court ruled that the preclearance provisions of the Voting Rights Act could not be enforced until Congress established new guidelines for review.',          }, -        u'skip': u'Requires rtmpdump'      }      def _real_extract(self, url): @@ -22,30 +27,22 @@ class CSpanIE(InfoExtractor):          prog_name = mobj.group(1)          webpage = self._download_webpage(url, prog_name)          video_id = self._search_regex(r'programid=(.*?)&', webpage, 'video id') -        data = compat_urllib_parse.urlencode({'programid': video_id, -                                              'dynamic':'1'}) -        info_url = 'http://www.c-spanvideo.org/common/services/flashXml.php?' + data -        video_info = self._download_webpage(info_url, video_id, u'Downloading video info') - -        self.report_extraction(video_id) - -        title = self._html_search_regex(r'<string name="title">(.*?)</string>', -                                        video_info, 'title') -        description = self._html_search_regex(r'<meta (?:property="og:|name=")description" content="(.*?)"', -                                              webpage, 'description', -                                              flags=re.MULTILINE|re.DOTALL) - -        url = self._search_regex(r'<string name="URL">(.*?)</string>', -                                 video_info, 'video url') -        url = url.replace('$(protocol)', 'rtmp').replace('$(port)', '443') -        path = self._search_regex(r'<string name="path">(.*?)</string>', -                            video_info, 'rtmp play path') - -        return {'id': video_id, -                'title': title, -                'ext': 'flv', -                'url': url, -                'play_path': path, -                'description': description, -                'thumbnail': self._og_search_thumbnail(webpage), -                } + +        title = self._html_search_regex( +            r'<!-- title -->\n\s*<h1[^>]*>(.*?)</h1>', webpage, 'title') +        description = self._og_search_description(webpage) + +        info_url = 'http://c-spanvideo.org/videoLibrary/assets/player/ajax-player.php?os=android&html5=program&id=' + video_id +        data_json = self._download_webpage( +            info_url, video_id, 'Downloading video info') +        data = json.loads(data_json) + +        url = unescapeHTML(data['video']['files'][0]['path']['#text']) + +        return { +            'id': video_id, +            'title': title, +            'url': url, +            'description': description, +            'thumbnail': self._og_search_thumbnail(webpage), +        } diff --git a/youtube_dl/extractor/dreisat.py b/youtube_dl/extractor/dreisat.py index cb7226f82..0b11d1f10 100644 --- a/youtube_dl/extractor/dreisat.py +++ b/youtube_dl/extractor/dreisat.py @@ -4,18 +4,17 @@ import re  from .common import InfoExtractor  from ..utils import ( -    determine_ext,      unified_strdate,  )  class DreiSatIE(InfoExtractor):      IE_NAME = '3sat' -    _VALID_URL = r'(?:http://)?(?:www\.)?3sat\.de/mediathek/index\.php\?(?:(?:mode|display)=[^&]+&)*obj=(?P<id>[0-9]+)$' +    _VALID_URL = r'(?:http://)?(?:www\.)?3sat\.de/mediathek/(?:index\.php)?\?(?:(?:mode|display)=[^&]+&)*obj=(?P<id>[0-9]+)$'      _TEST = {          u"url": u"http://www.3sat.de/mediathek/index.php?obj=36983", -        u'file': u'36983.webm', -        u'md5': u'57c97d0469d71cf874f6815aa2b7c944', +        u'file': u'36983.mp4', +        u'md5': u'9dcfe344732808dbfcc901537973c922',          u'info_dict': {              u"title": u"Kaffeeland Schweiz",              u"description": u"Über 80 Kaffeeröstereien liefern in der Schweiz das Getränk, in das das Land so vernarrt ist: Mehr als 1000 Tassen trinkt ein Schweizer pro Jahr. SCHWEIZWEIT nimmt die Kaffeekultur unter die...",  @@ -52,18 +51,12 @@ class DreiSatIE(InfoExtractor):              'width': int(fe.find('./width').text),              'height': int(fe.find('./height').text),              'url': fe.find('./url').text, -            'ext': determine_ext(fe.find('./url').text),              'filesize': int(fe.find('./filesize').text),              'video_bitrate': int(fe.find('./videoBitrate').text), -            '3sat_qualityname': fe.find('./quality').text,          } for fe in format_els              if not fe.find('./url').text.startswith('http://www.metafilegenerator.de/')] -        def _sortkey(format): -            qidx = ['low', 'med', 'high', 'veryhigh'].index(format['3sat_qualityname']) -            prefer_http = 1 if 'rtmp' in format['url'] else 0 -            return (qidx, prefer_http, format['video_bitrate']) -        formats.sort(key=_sortkey) +        self._sort_formats(formats)          return {              '_type': 'video', diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 7a14c98f9..7d0e117de 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1,9 +1,12 @@  # encoding: utf-8 +from __future__ import unicode_literals +  import os  import re  from .common import InfoExtractor +from .youtube import YoutubeIE  from ..utils import (      compat_urllib_error,      compat_urllib_parse, @@ -22,78 +25,78 @@ from .ooyala import OoyalaIE  class GenericIE(InfoExtractor): -    IE_DESC = u'Generic downloader that works on some sites' +    IE_DESC = 'Generic downloader that works on some sites'      _VALID_URL = r'.*' -    IE_NAME = u'generic' +    IE_NAME = 'generic'      _TESTS = [          { -            u'url': u'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html', -            u'file': u'13601338388002.mp4', -            u'md5': u'6e15c93721d7ec9e9ca3fdbf07982cfd', -            u'info_dict': { -                u"uploader": u"www.hodiho.fr", -                u"title": u"R\u00e9gis plante sa Jeep" +            'url': 'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html', +            'file': '13601338388002.mp4', +            'md5': '6e15c93721d7ec9e9ca3fdbf07982cfd', +            'info_dict': { +                'uploader': 'www.hodiho.fr', +                'title': 'R\u00e9gis plante sa Jeep',              }          },          # embedded vimeo video          { -            u'add_ie': ['Vimeo'], -            u'url': u'http://skillsmatter.com/podcast/home/move-semanticsperfect-forwarding-and-rvalue-references', -            u'file': u'22444065.mp4', -            u'md5': u'2903896e23df39722c33f015af0666e2', -            u'info_dict': { -                u'title': u'ACCU 2011: Move Semantics,Perfect Forwarding, and Rvalue references- Scott Meyers- 13/04/2011', -                u"uploader_id": u"skillsmatter", -                u"uploader": u"Skills Matter", +            'add_ie': ['Vimeo'], +            'url': 'http://skillsmatter.com/podcast/home/move-semanticsperfect-forwarding-and-rvalue-references', +            'file': '22444065.mp4', +            'md5': '2903896e23df39722c33f015af0666e2', +            'info_dict': { +                'title': 'ACCU 2011: Move Semantics,Perfect Forwarding, and Rvalue references- Scott Meyers- 13/04/2011', +                'uploader_id': 'skillsmatter', +                'uploader': 'Skills Matter',              }          },          # bandcamp page with custom domain          { -            u'add_ie': ['Bandcamp'], -            u'url': u'http://bronyrock.com/track/the-pony-mash', -            u'file': u'3235767654.mp3', -            u'info_dict': { -                u'title': u'The Pony Mash', -                u'uploader': u'M_Pallante', +            'add_ie': ['Bandcamp'], +            'url': 'http://bronyrock.com/track/the-pony-mash', +            'file': '3235767654.mp3', +            'info_dict': { +                'title': 'The Pony Mash', +                'uploader': 'M_Pallante',              }, -            u'skip': u'There is a limit of 200 free downloads / month for the test song', +            'skip': 'There is a limit of 200 free downloads / month for the test song',          },          # embedded brightcove video          # it also tests brightcove videos that need to set the 'Referer' in the          # http requests          { -            u'add_ie': ['Brightcove'], -            u'url': u'http://www.bfmtv.com/video/bfmbusiness/cours-bourse/cours-bourse-l-analyse-technique-154522/', -            u'info_dict': { -                u'id': u'2765128793001', -                u'ext': u'mp4', -                u'title': u'Le cours de bourse : l’analyse technique', -                u'description': u'md5:7e9ad046e968cb2d1114004aba466fd9', -                u'uploader': u'BFM BUSINESS', +            'add_ie': ['Brightcove'], +            'url': 'http://www.bfmtv.com/video/bfmbusiness/cours-bourse/cours-bourse-l-analyse-technique-154522/', +            'info_dict': { +                'id': '2765128793001', +                'ext': 'mp4', +                'title': 'Le cours de bourse : l’analyse technique', +                'description': 'md5:7e9ad046e968cb2d1114004aba466fd9', +                'uploader': 'BFM BUSINESS',              }, -            u'params': { -                u'skip_download': True, +            'params': { +                'skip_download': True,              },          },          # Direct link to a video          { -            u'url': u'http://media.w3.org/2010/05/sintel/trailer.mp4', -            u'file': u'trailer.mp4', -            u'md5': u'67d406c2bcb6af27fa886f31aa934bbe', -            u'info_dict': { -                u'id': u'trailer', -                u'title': u'trailer', -                u'upload_date': u'20100513', +            'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4', +            'file': 'trailer.mp4', +            'md5': '67d406c2bcb6af27fa886f31aa934bbe', +            'info_dict': { +                'id': 'trailer', +                'title': 'trailer', +                'upload_date': '20100513',              }          },          # ooyala video          { -            u'url': u'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219', -            u'md5': u'5644c6ca5d5782c1d0d350dad9bd840c', -            u'info_dict': { -                u'id': u'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ', -                u'ext': u'mp4', -                u'title': u'2cc213299525360.mov', #that's what we get +            'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219', +            'md5': '5644c6ca5d5782c1d0d350dad9bd840c', +            'info_dict': { +                'id': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ', +                'ext': 'mp4', +                'title': '2cc213299525360.mov', #that's what we get              },          },      ] @@ -101,12 +104,12 @@ class GenericIE(InfoExtractor):      def report_download_webpage(self, video_id):          """Report webpage download."""          if not self._downloader.params.get('test', False): -            self._downloader.report_warning(u'Falling back on generic information extractor.') +            self._downloader.report_warning('Falling back on generic information extractor.')          super(GenericIE, self).report_download_webpage(video_id)      def report_following_redirect(self, new_url):          """Report information extraction.""" -        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url) +        self._downloader.to_screen('[redirect] Following redirect to %s' % new_url)      def _send_head(self, url):          """Check if it is a redirect, like url shorteners, in case return the new url.""" @@ -152,7 +155,7 @@ class GenericIE(InfoExtractor):          response = opener.open(HEADRequest(url))          if response is None: -            raise ExtractorError(u'Invalid URL protocol') +            raise ExtractorError('Invalid URL protocol')          return response      def _real_extract(self, url): @@ -162,6 +165,8 @@ class GenericIE(InfoExtractor):              return self.url_result('http://' + url)          video_id = os.path.splitext(url.split('/')[-1])[0] +        self.to_screen('%s: Requesting header' % video_id) +          try:              response = self._send_head(url) @@ -184,7 +189,7 @@ class GenericIE(InfoExtractor):                      'formats': [{                          'format_id': m.group('format_id'),                          'url': url, -                        'vcodec': u'none' if m.group('type') == 'audio' else None +                        'vcodec': 'none' if m.group('type') == 'audio' else None                      }],                      'upload_date': upload_date,                  } @@ -198,7 +203,7 @@ class GenericIE(InfoExtractor):          except ValueError:              # since this is the last-resort InfoExtractor, if              # this error is thrown, it'll be thrown here -            raise ExtractorError(u'Failed to download URL: %s' % url) +            raise ExtractorError('Failed to download URL: %s' % url)          self.report_extraction(video_id) @@ -209,18 +214,19 @@ class GenericIE(InfoExtractor):          #   Video Title - Tagline | Site Name          # and so on and so forth; it's just not practical          video_title = self._html_search_regex( -            r'(?s)<title>(.*?)</title>', webpage, u'video title', -            default=u'video') +            r'(?s)<title>(.*?)</title>', webpage, 'video title', +            default='video')          # video uploader is domain name          video_uploader = self._search_regex( -            r'^(?:https?://)?([^/]*)/.*', url, u'video uploader') +            r'^(?:https?://)?([^/]*)/.*', url, 'video uploader')          # Look for BrightCove:          bc_url = BrightcoveIE._extract_brightcove_url(webpage)          if bc_url is not None: -            self.to_screen(u'Brightcove video detected.') -            return self.url_result(bc_url, 'Brightcove') +            self.to_screen('Brightcove video detected.') +            surl = smuggle_url(bc_url, {'Referer': url}) +            return self.url_result(surl, 'Brightcove')          # Look for embedded (iframe) Vimeo player          mobj = re.search( @@ -271,16 +277,12 @@ class GenericIE(InfoExtractor):              }          # Look for embedded blip.tv player -        mobj = re.search(r'<meta\s[^>]*https?://api.blip.tv/\w+/redirect/\w+/(\d+)', webpage) +        mobj = re.search(r'<meta\s[^>]*https?://api\.blip\.tv/\w+/redirect/\w+/(\d+)', webpage)          if mobj: -            return self.url_result('http://blip.tv/seo/-'+mobj.group(1), 'BlipTV') -        mobj = re.search(r'<(?:iframe|embed|object)\s[^>]*https?://(?:\w+\.)?blip.tv/(?:play/|api\.swf#)([a-zA-Z0-9]+)', webpage) +            return self.url_result('http://blip.tv/a/a-'+mobj.group(1), 'BlipTV') +        mobj = re.search(r'<(?:iframe|embed|object)\s[^>]*(https?://(?:\w+\.)?blip\.tv/(?:play/|api\.swf#)[a-zA-Z0-9]+)', webpage)          if mobj: -            player_url = 'http://blip.tv/play/%s.x?p=1' % mobj.group(1) -            player_page = self._download_webpage(player_url, mobj.group(1)) -            blip_video_id = self._search_regex(r'data-episode-id="(\d+)', player_page, u'blip_video_id', fatal=False) -            if blip_video_id: -                return self.url_result('http://blip.tv/seo/-'+blip_video_id, 'BlipTV') +            return self.url_result(mobj.group(1), 'BlipTV')          # Look for Bandcamp pages with custom domain          mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage) @@ -308,6 +310,9 @@ class GenericIE(InfoExtractor):          # Start with something easy: JW Player in SWFObject          mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)          if mobj is None: +            # Look for gorilla-vid style embedding +            mobj = re.search(r'(?s)jw_plugins.*?file:\s*["\'](.*?)["\']', webpage) +        if mobj is None:              # Broaden the search a little bit              mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)          if mobj is None: @@ -327,23 +332,27 @@ class GenericIE(InfoExtractor):              # HTML5 video              mobj = re.search(r'<video[^<]*(?:>.*?<source.*?)? src="([^"]+)"', webpage, flags=re.DOTALL)          if mobj is None: -            raise ExtractorError(u'Unsupported URL: %s' % url) +            raise ExtractorError('Unsupported URL: %s' % url)          # It's possible that one of the regexes          # matched, but returned an empty group:          if mobj.group(1) is None: -            raise ExtractorError(u'Did not find a valid video URL at %s' % url) +            raise ExtractorError('Did not find a valid video URL at %s' % url)          video_url = mobj.group(1)          video_url = compat_urlparse.urljoin(url, video_url)          video_id = compat_urllib_parse.unquote(os.path.basename(video_url)) +        # Sometimes, jwplayer extraction will result in a YouTube URL +        if YoutubeIE.suitable(video_url): +            return self.url_result(video_url, 'Youtube') +          # here's a fun little line of code for you:          video_id = os.path.splitext(video_id)[0]          return { -            'id':       video_id, -            'url':      video_url, +            'id': video_id, +            'url': video_url,              'uploader': video_uploader, -            'title':    video_title, +            'title': video_title,          } diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py index e5332cce8..16926b4d3 100644 --- a/youtube_dl/extractor/imdb.py +++ b/youtube_dl/extractor/imdb.py @@ -55,3 +55,32 @@ class ImdbIE(InfoExtractor):              'description': descr,              'thumbnail': format_info['slate'],          } + +class ImdbListIE(InfoExtractor): +    IE_NAME = u'imdb:list' +    IE_DESC = u'Internet Movie Database lists' +    _VALID_URL = r'http://www\.imdb\.com/list/(?P<id>[\da-zA-Z_-]{11})' +     +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        list_id = mobj.group('id') +         +        # RSS XML is sometimes malformed +        rss = self._download_webpage('http://rss.imdb.com/list/%s' % list_id, list_id, u'Downloading list RSS') +        list_title = self._html_search_regex(r'<title>(.*?)</title>', rss, u'list title') +         +        # Export is independent of actual author_id, but returns 404 if no author_id is provided. +        # However, passing dummy author_id seems to be enough. +        csv = self._download_webpage('http://www.imdb.com/list/export?list_id=%s&author_id=ur00000000' % list_id, +                                     list_id, u'Downloading list CSV') +         +        entries = [] +        for item in csv.split('\n')[1:]: +            cols = item.split(',') +            if len(cols) < 2: +                continue +            item_id = cols[1][1:-1] +            if item_id.startswith('vi'): +                entries.append(self.url_result('http://www.imdb.com/video/imdb/%s' % item_id, 'Imdb')) +         +        return self.playlist_result(entries, list_id, list_title)
\ No newline at end of file diff --git a/youtube_dl/extractor/internetvideoarchive.py b/youtube_dl/extractor/internetvideoarchive.py index 16a6f73c8..4ddda2f1b 100644 --- a/youtube_dl/extractor/internetvideoarchive.py +++ b/youtube_dl/extractor/internetvideoarchive.py @@ -5,7 +5,6 @@ from ..utils import (      compat_urlparse,      compat_urllib_parse,      xpath_with_ns, -    determine_ext,  ) @@ -63,13 +62,17 @@ class InternetVideoArchiveIE(InfoExtractor):          for content in item.findall(_bp('media:group/media:content')):              attr = content.attrib              f_url = attr['url'] +            width = int(attr['width']) +            bitrate = int(attr['bitrate']) +            format_id = '%d-%dk' % (width, bitrate)              formats.append({ +                'format_id': format_id,                  'url': f_url, -                'ext': determine_ext(f_url), -                'width': int(attr['width']), -                'bitrate': int(attr['bitrate']), +                'width': width, +                'tbr': bitrate,              }) -        formats = sorted(formats, key=lambda f: f['bitrate']) + +        self._sort_formats(formats)          return {              'id': video_id, diff --git a/youtube_dl/extractor/ivi.py b/youtube_dl/extractor/ivi.py index 4bdf55f93..98d1d272a 100644 --- a/youtube_dl/extractor/ivi.py +++ b/youtube_dl/extractor/ivi.py @@ -84,14 +84,16 @@ class IviIE(InfoExtractor):          result = video_json[u'result'] -        formats = [{'url': x[u'url'], -                    'format_id': x[u'content_format'] -                    } for x in result[u'files'] if x[u'content_format'] in self._known_formats] -        formats.sort(key=lambda fmt: self._known_formats.index(fmt['format_id'])) - -        if len(formats) == 0: -            self._downloader.report_warning(u'No media links available for %s' % video_id) -            return +        formats = [{ +            'url': x[u'url'], +            'format_id': x[u'content_format'], +            'preference': self._known_formats.index(x[u'content_format']), +        } for x in result[u'files'] if x[u'content_format'] in self._known_formats] + +        self._sort_formats(formats) + +        if not formats: +            raise ExtractorError(u'No media links available for %s' % video_id)          duration = result[u'duration']          compilation = result[u'compilation'] diff --git a/youtube_dl/extractor/jpopsukitv.py b/youtube_dl/extractor/jpopsukitv.py new file mode 100644 index 000000000..aad782578 --- /dev/null +++ b/youtube_dl/extractor/jpopsukitv.py @@ -0,0 +1,73 @@ +# coding=utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( +    int_or_none, +    unified_strdate, +) + + +class JpopsukiIE(InfoExtractor): +    IE_NAME = 'jpopsuki.tv' +    _VALID_URL = r'https?://(?:www\.)?jpopsuki\.tv/video/(.*?)/(?P<id>\S+)' + +    _TEST = { +        'url': 'http://www.jpopsuki.tv/video/ayumi-hamasaki---evolution/00be659d23b0b40508169cdee4545771', +        'md5': '88018c0c1a9b1387940e90ec9e7e198e', +        'file': '00be659d23b0b40508169cdee4545771.mp4', +        'info_dict': { +            'id': '00be659d23b0b40508169cdee4545771', +            'title': 'ayumi hamasaki - evolution', +            'description': 'Release date: 2001.01.31\r\n浜崎あゆみ - evolution', +            'thumbnail': 'http://www.jpopsuki.tv/cache/89722c74d2a2ebe58bcac65321c115b2.jpg', +            'uploader': 'plama_chan', +            'uploader_id': '404', +            'upload_date': '20121101' +        } +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('id') + +        webpage = self._download_webpage(url, video_id) + +        video_url = 'http://www.jpopsuki.tv' + self._html_search_regex( +            r'<source src="(.*?)" type', webpage, 'video url') + +        video_title = self._og_search_title(webpage) +        description = self._og_search_description(webpage) +        thumbnail = self._og_search_thumbnail(webpage) +        uploader = self._html_search_regex( +            r'<li>from: <a href="/user/view/user/(.*?)/uid/', +            webpage, 'video uploader', fatal=False) +        uploader_id = self._html_search_regex( +            r'<li>from: <a href="/user/view/user/\S*?/uid/(\d*)', +            webpage, 'video uploader_id', fatal=False) +        upload_date = self._html_search_regex( +            r'<li>uploaded: (.*?)</li>', webpage, 'video upload_date', +            fatal=False) +        if upload_date is not None: +            upload_date = unified_strdate(upload_date) +        view_count_str = self._html_search_regex( +            r'<li>Hits: ([0-9]+?)</li>', webpage, 'video view_count', +            fatal=False) +        comment_count_str = self._html_search_regex( +            r'<h2>([0-9]+?) comments</h2>', webpage, 'video comment_count', +            fatal=False) + +        return { +            'id': video_id, +            'url': video_url, +            'title': video_title, +            'description': description, +            'thumbnail': thumbnail, +            'uploader': uploader, +            'uploader_id': uploader_id, +            'upload_date': upload_date, +            'view_count': int_or_none(view_count_str), +            'comment_count': int_or_none(comment_count_str), +        } diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py new file mode 100644 index 000000000..844ba4dcb --- /dev/null +++ b/youtube_dl/extractor/lynda.py @@ -0,0 +1,142 @@ +from __future__ import unicode_literals + +import re +import json + +from .subtitles import SubtitlesInfoExtractor +from .common import InfoExtractor +from ..utils import ExtractorError + + +class LyndaIE(SubtitlesInfoExtractor): +    IE_NAME = 'lynda' +    IE_DESC = 'lynda.com videos' +    _VALID_URL = r'https?://www\.lynda\.com/[^/]+/[^/]+/\d+/(\d+)-\d\.html' + +    _TEST = { +        'url': 'http://www.lynda.com/Bootstrap-tutorials/Using-exercise-files/110885/114408-4.html', +        'file': '114408.mp4', +        'md5': 'ecfc6862da89489161fb9cd5f5a6fac1', +        u"info_dict": { +            'title': 'Using the exercise files', +            'duration': 68 +        } +    } +     +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group(1) + +        page = self._download_webpage('http://www.lynda.com/ajax/player?videoId=%s&type=video' % video_id, +                                      video_id, 'Downloading video JSON') +        video_json = json.loads(page) + +        if 'Status' in video_json and video_json['Status'] == 'NotFound': +            raise ExtractorError('Video %s does not exist' % video_id, expected=True) + +        if video_json['HasAccess'] is False: +            raise ExtractorError('Video %s is only available for members' % video_id, expected=True) + +        video_id = video_json['ID'] +        duration = video_json['DurationInSeconds'] +        title = video_json['Title'] + +        formats = [{'url': fmt['Url'], +                    'ext': fmt['Extension'], +                    'width': fmt['Width'], +                    'height': fmt['Height'], +                    'filesize': fmt['FileSize'], +                    'format_id': str(fmt['Resolution']) +                    } for fmt in video_json['Formats']] + +        self._sort_formats(formats) +         +        if self._downloader.params.get('listsubtitles', False): +            self._list_available_subtitles(video_id, page) +            return +         +        subtitles = self._fix_subtitles(self.extract_subtitles(video_id, page)) +         +        return { +            'id': video_id, +            'title': title, +            'duration': duration, +            'subtitles': subtitles, +            'formats': formats +        } +         +    _TIMECODE_REGEX = r'\[(?P<timecode>\d+:\d+:\d+[\.,]\d+)\]'     +     +    def _fix_subtitles(self, subtitles): +        fixed_subtitles = {} +        for k, v in subtitles.items(): +            subs = json.loads(v) +            if len(subs) == 0: +                continue +            srt = '' +            for pos in range(0, len(subs) - 1): +                seq_current = subs[pos]                 +                m_current = re.match(self._TIMECODE_REGEX, seq_current['Timecode']) +                if m_current is None: +                    continue                 +                seq_next = subs[pos+1] +                m_next = re.match(self._TIMECODE_REGEX, seq_next['Timecode']) +                if m_next is None: +                    continue                 +                appear_time = m_current.group('timecode') +                disappear_time = m_next.group('timecode') +                text = seq_current['Caption'] +                srt += '%s\r\n%s --> %s\r\n%s' % (str(pos), appear_time, disappear_time, text) +            if srt: +                fixed_subtitles[k] = srt +        return fixed_subtitles +         +    def _get_available_subtitles(self, video_id, webpage): +        url = 'http://www.lynda.com/ajax/player?videoId=%s&type=transcript' % video_id +        sub = self._download_webpage(url, None, note=False) +        sub_json = json.loads(sub) +        return {'en': url} if len(sub_json) > 0 else {} + + +class LyndaCourseIE(InfoExtractor): +    IE_NAME = 'lynda:course' +    IE_DESC = 'lynda.com online courses' + +    # Course link equals to welcome/introduction video link of same course +    # We will recognize it as course link +    _VALID_URL = r'https?://(?:www|m)\.lynda\.com/(?P<coursepath>[^/]+/[^/]+/(?P<courseid>\d+))-\d\.html' + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        course_path = mobj.group('coursepath') +        course_id = mobj.group('courseid') + +        page = self._download_webpage('http://www.lynda.com/ajax/player?courseId=%s&type=course' % course_id, +                                      course_id, 'Downloading course JSON') +        course_json = json.loads(page) + +        if 'Status' in course_json and course_json['Status'] == 'NotFound': +            raise ExtractorError('Course %s does not exist' % course_id, expected=True) + +        unaccessible_videos = 0 +        videos = [] + +        for chapter in course_json['Chapters']: +            for video in chapter['Videos']: +                if video['HasAccess'] is not True: +                    unaccessible_videos += 1 +                    continue +                videos.append(video['ID']) + +        if unaccessible_videos > 0: +            self._downloader.report_warning('%s videos are only available for members and will not be downloaded' % unaccessible_videos) + +        entries = [ +            self.url_result('http://www.lynda.com/%s/%s-4.html' % +                            (course_path, video_id), +                            'Lynda') +            for video_id in videos] + +        course_title = course_json['Title'] + +        return self.playlist_result(entries, course_id, course_title) diff --git a/youtube_dl/extractor/macgamestore.py b/youtube_dl/extractor/macgamestore.py new file mode 100644 index 000000000..b818cf50c --- /dev/null +++ b/youtube_dl/extractor/macgamestore.py @@ -0,0 +1,43 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ExtractorError + + +class MacGameStoreIE(InfoExtractor): +    IE_NAME = 'macgamestore' +    IE_DESC = 'MacGameStore trailers' +    _VALID_URL = r'https?://www\.macgamestore\.com/mediaviewer\.php\?trailer=(?P<id>\d+)' + +    _TEST = { +        'url': 'http://www.macgamestore.com/mediaviewer.php?trailer=2450', +        'file': '2450.m4v', +        'md5': '8649b8ea684b6666b4c5be736ecddc61', +        'info_dict': { +            'title': 'Crow', +        } +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('id') + +        webpage = self._download_webpage(url, video_id, 'Downloading trailer page') + +        if re.search(r'>Missing Media<', webpage) is not None: +            raise ExtractorError('Trailer %s does not exist' % video_id, expected=True) + +        video_title = self._html_search_regex( +            r'<title>MacGameStore: (.*?) Trailer</title>', webpage, 'title') + +        video_url = self._html_search_regex( +            r'(?s)<div\s+id="video-player".*?href="([^"]+)"\s*>', +            webpage, 'video URL') + +        return { +            'id': video_id, +            'url': video_url, +            'title': video_title +        } diff --git a/youtube_dl/extractor/mdr.py b/youtube_dl/extractor/mdr.py index 08ce0647f..7aa0080d7 100644 --- a/youtube_dl/extractor/mdr.py +++ b/youtube_dl/extractor/mdr.py @@ -52,10 +52,11 @@ class MDRIE(InfoExtractor):                      'format_id': u'%s-%d' % (media_type, vbr),                  })              formats.append(format) -        formats.sort(key=lambda f: (f.get('vbr'), f['abr']))          if not formats:              raise ExtractorError(u'Could not find any valid formats') +        self._sort_formats(formats) +          return {              'id': video_id,              'title': title, diff --git a/youtube_dl/extractor/mit.py b/youtube_dl/extractor/mit.py index 52be9232f..76b717fe5 100644 --- a/youtube_dl/extractor/mit.py +++ b/youtube_dl/extractor/mit.py @@ -33,8 +33,18 @@ class TechTVMITIE(InfoExtractor):              raw_page, u'base url')          formats_json = self._search_regex(r'bitrates: (\[.+?\])', raw_page,              u'video formats') -        formats = json.loads(formats_json) -        formats = sorted(formats, key=lambda f: f['bitrate']) +        formats_mit = json.loads(formats_json) +        formats = [ +            { +                'format_id': f['label'], +                'url': base_url + f['url'].partition(':')[2], +                'ext': f['url'].partition(':')[0], +                'format': f['label'], +                'width': f['width'], +                'vbr': f['bitrate'], +            } +            for f in formats_mit +        ]          title = get_element_by_id('edit-title', clean_page)          description = clean_html(get_element_by_id('edit-description', clean_page)) @@ -43,8 +53,7 @@ class TechTVMITIE(InfoExtractor):          return {'id': video_id,                  'title': title, -                'url': base_url + formats[-1]['url'].replace('mp4:', ''), -                'ext': 'mp4', +                'formats': formats,                  'description': description,                  'thumbnail': thumbnail,                  } diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index 125d81551..7c54ea0f4 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -53,7 +53,7 @@ class MixcloudIE(InfoExtractor):          info = json.loads(json_data)          preview_url = self._search_regex(r'data-preview-url="(.+?)"', webpage, u'preview url') -        song_url = preview_url.replace('/previews/', '/cloudcasts/originals/') +        song_url = preview_url.replace('/previews/', '/c/originals/')          template_url = re.sub(r'(stream\d*)', 'stream%d', song_url)          final_song_url = self._get_url(template_url)          if final_song_url is None: diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index ed11f521a..f1cf41e2d 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -129,7 +129,7 @@ class MTVIE(MTVServicesInfoExtractor):      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url)          video_id = mobj.group('videoid') -        uri = mobj.group('mgid') +        uri = mobj.groupdict().get('mgid')          if uri is None:              webpage = self._download_webpage(url, video_id) diff --git a/youtube_dl/extractor/myvideo.py b/youtube_dl/extractor/myvideo.py index 0404e6e43..6d35c7861 100644 --- a/youtube_dl/extractor/myvideo.py +++ b/youtube_dl/extractor/myvideo.py @@ -143,8 +143,10 @@ class MyVideoIE(InfoExtractor):          if mobj:              video_url = compat_urllib_parse.unquote(mobj.group(1))              if 'myvideo2flash' in video_url: -                self._downloader.report_warning(u'forcing RTMPT ...') -                video_url = video_url.replace('rtmpe://', 'rtmpt://') +                self.report_warning( +                    u'Rewriting URL to use unencrypted rtmp:// ...', +                    video_id) +                video_url = video_url.replace('rtmpe://', 'rtmp://')          if not video_url:              # extract non rtmp videos diff --git a/youtube_dl/extractor/orf.py b/youtube_dl/extractor/orf.py index b42eae89a..88f03608b 100644 --- a/youtube_dl/extractor/orf.py +++ b/youtube_dl/extractor/orf.py @@ -1,54 +1,98 @@  # coding: utf-8 +from __future__ import unicode_literals -import re -import xml.etree.ElementTree  import json +import re  from .common import InfoExtractor  from ..utils import ( -    compat_urlparse, -    ExtractorError, -    find_xpath_attr, +    HEADRequest, +    unified_strdate,  ) +  class ORFIE(InfoExtractor): -    _VALID_URL = r'https?://tvthek\.orf\.at/(programs/.+?/episodes|topics/.+?)/(?P<id>\d+)' +    _VALID_URL = r'https?://tvthek\.orf\.at/(?:programs/.+?/episodes|topics/.+?|program/[^/]+)/(?P<id>\d+)' + +    _TEST = { +        'url': 'http://tvthek.orf.at/program/matinee-Was-Sie-schon-immer-ueber-Klassik-wissen-wollten/7317210/Was-Sie-schon-immer-ueber-Klassik-wissen-wollten/7319746/Was-Sie-schon-immer-ueber-Klassik-wissen-wollten/7319747', +        'file': '7319747.mp4', +        'md5': 'bd803c5d8c32d3c64a0ea4b4eeddf375', +        'info_dict': { +            'title': 'Was Sie schon immer über Klassik wissen wollten', +            'description': 'md5:0ddf0d5f0060bd53f744edaa5c2e04a4', +            'duration': 3508, +            'upload_date': '20140105', +        }, +        'skip': 'Blocked outside of Austria', +    }      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url)          playlist_id = mobj.group('id')          webpage = self._download_webpage(url, playlist_id) -        flash_xml = self._search_regex('ORF.flashXML = \'(.+?)\'', webpage, u'flash xml') -        flash_xml = compat_urlparse.parse_qs('xml='+flash_xml)['xml'][0] -        flash_config = xml.etree.ElementTree.fromstring(flash_xml.encode('utf-8')) -        playlist_json = self._search_regex(r'playlist\': \'(\[.*?\])\'', webpage, u'playlist').replace(r'\"','"') -        playlist = json.loads(playlist_json) - -        videos = [] -        ns = '{http://tempuri.org/XMLSchema.xsd}' -        xpath = '%(ns)sPlaylist/%(ns)sItems/%(ns)sItem' % {'ns': ns} -        webpage_description = self._og_search_description(webpage) -        for (i, (item, info)) in enumerate(zip(flash_config.findall(xpath), playlist), 1): -            # Get best quality url -            rtmp_url = None -            for q in ['Q6A', 'Q4A', 'Q1A']: -                video_url = find_xpath_attr(item, '%sVideoUrl' % ns, 'quality', q) -                if video_url is not None: -                    rtmp_url = video_url.text -                    break -            if rtmp_url is None: -                raise ExtractorError(u'Couldn\'t get video url: %s' % info['id']) -            description = self._html_search_regex( -                r'id="playlist_entry_%s".*?<p>(.*?)</p>' % i, webpage, -                u'description', default=webpage_description, flags=re.DOTALL) -            videos.append({ +        data_json = self._search_regex( +            r'initializeAdworx\((.+?)\);\n', webpage, 'video info') +        all_data = json.loads(data_json) +        sdata = all_data[0]['values']['segments'] + +        def quality_to_int(s): +            m = re.search('([0-9]+)', s) +            if m is None: +                return -1 +            return int(m.group(1)) + +        entries = [] +        for sd in sdata: +            video_id = sd['id'] +            formats = [{ +                'preference': -10 if fd['delivery'] == 'hls' else None, +                'format_id': '%s-%s-%s' % ( +                    fd['delivery'], fd['quality'], fd['quality_string']), +                'url': fd['src'], +                'protocol': fd['protocol'], +                'quality': quality_to_int(fd['quality']), +            } for fd in sd['playlist_item_array']['sources']] + +            # Check for geoblocking. +            # There is a property is_geoprotection, but that's always false +            geo_str = sd.get('geoprotection_string') +            if geo_str: +                try: +                    http_url = next( +                        f['url'] +                        for f in formats +                        if re.match(r'^https?://.*\.mp4$', f['url'])) +                except StopIteration: +                    pass +                else: +                    req = HEADRequest(http_url) +                    response = self._request_webpage( +                        req, video_id, +                        note='Testing for geoblocking', +                        errnote=(( +                            'This video seems to be blocked outside of %s. ' +                            'You may want to try the streaming-* formats.') +                            % geo_str), +                        fatal=False) + +            self._sort_formats(formats) + +            upload_date = unified_strdate(sd['created_date']) +            entries.append({                  '_type': 'video', -                'id': info['id'], -                'title': info['title'], -                'url': rtmp_url, -                'ext': 'flv', -                'description': description, -                }) - -        return videos +                'id': video_id, +                'title': sd['header'], +                'formats': formats, +                'description': sd.get('description'), +                'duration': int(sd['duration_in_seconds']), +                'upload_date': upload_date, +                'thumbnail': sd.get('image_full_url'), +            }) + +        return { +            '_type': 'playlist', +            'entries': entries, +            'id': playlist_id, +        } diff --git a/youtube_dl/extractor/pornhd.py b/youtube_dl/extractor/pornhd.py index 71abd5013..e9ff8d1af 100644 --- a/youtube_dl/extractor/pornhd.py +++ b/youtube_dl/extractor/pornhd.py @@ -5,7 +5,7 @@ from ..utils import compat_urllib_parse  class PornHdIE(InfoExtractor): -    _VALID_URL = r'(?:http://)?(?:www\.)?pornhd\.com/videos/(?P<video_id>[0-9]+)/(?P<video_title>.+)' +    _VALID_URL = r'(?:http://)?(?:www\.)?pornhd\.com/(?:[a-z]{2,4}/)?videos/(?P<video_id>[0-9]+)/(?P<video_title>.+)'      _TEST = {          u'url': u'http://www.pornhd.com/videos/1962/sierra-day-gets-his-cum-all-over-herself-hd-porn-video',          u'file': u'1962.flv', diff --git a/youtube_dl/extractor/smotri.py b/youtube_dl/extractor/smotri.py index a589a893b..99f5b19d2 100644 --- a/youtube_dl/extractor/smotri.py +++ b/youtube_dl/extractor/smotri.py @@ -1,5 +1,6 @@  # encoding: utf-8 +import os.path  import re  import json  import hashlib @@ -10,6 +11,7 @@ from ..utils import (      compat_urllib_parse,      compat_urllib_request,      ExtractorError, +    url_basename,  ) @@ -132,7 +134,16 @@ class SmotriIE(InfoExtractor):          # We will extract some from the video web page instead          video_page_url = 'http://' + mobj.group('url')          video_page = self._download_webpage(video_page_url, video_id, u'Downloading video page') -         + +        # Warning if video is unavailable +        warning = self._html_search_regex( +            r'<div class="videoUnModer">(.*?)</div>', video_page, +            u'warning message', default=None) +        if warning is not None: +            self._downloader.report_warning( +                u'Video %s may not be available; smotri said: %s ' % +                (video_id, warning)) +          # Adult content          if re.search(u'EroConfirmText">', video_page) is not None:              self.report_age_confirmation() @@ -148,38 +159,44 @@ class SmotriIE(InfoExtractor):          # Extract the rest of meta data          video_title = self._search_meta(u'name', video_page, u'title')          if not video_title: -            video_title = video_url.rsplit('/', 1)[-1] +            video_title = os.path.splitext(url_basename(video_url))[0]          video_description = self._search_meta(u'description', video_page)          END_TEXT = u' на сайте Smotri.com' -        if video_description.endswith(END_TEXT): +        if video_description and video_description.endswith(END_TEXT):              video_description = video_description[:-len(END_TEXT)]          START_TEXT = u'Смотреть онлайн ролик ' -        if video_description.startswith(START_TEXT): +        if video_description and video_description.startswith(START_TEXT):              video_description = video_description[len(START_TEXT):]          video_thumbnail = self._search_meta(u'thumbnail', video_page)          upload_date_str = self._search_meta(u'uploadDate', video_page, u'upload date') -        upload_date_m = re.search(r'(?P<year>\d{4})\.(?P<month>\d{2})\.(?P<day>\d{2})T', upload_date_str) -        video_upload_date = ( -            ( -                upload_date_m.group('year') + -                upload_date_m.group('month') + -                upload_date_m.group('day') +        if upload_date_str: +            upload_date_m = re.search(r'(?P<year>\d{4})\.(?P<month>\d{2})\.(?P<day>\d{2})T', upload_date_str) +            video_upload_date = ( +                ( +                    upload_date_m.group('year') + +                    upload_date_m.group('month') + +                    upload_date_m.group('day') +                ) +                if upload_date_m else None              ) -            if upload_date_m else None -        ) +        else: +            video_upload_date = None          duration_str = self._search_meta(u'duration', video_page) -        duration_m = re.search(r'T(?P<hours>[0-9]{2})H(?P<minutes>[0-9]{2})M(?P<seconds>[0-9]{2})S', duration_str) -        video_duration = ( -            ( -                (int(duration_m.group('hours')) * 60 * 60) + -                (int(duration_m.group('minutes')) * 60) + -                int(duration_m.group('seconds')) +        if duration_str: +            duration_m = re.search(r'T(?P<hours>[0-9]{2})H(?P<minutes>[0-9]{2})M(?P<seconds>[0-9]{2})S', duration_str) +            video_duration = ( +                ( +                    (int(duration_m.group('hours')) * 60 * 60) + +                    (int(duration_m.group('minutes')) * 60) + +                    int(duration_m.group('seconds')) +                ) +                if duration_m else None              ) -            if duration_m else None -        ) +        else: +            video_duration = None          video_uploader = self._html_search_regex(              u'<div class="DescrUser"><div>Автор.*?onmouseover="popup_user_info[^"]+">(.*?)</a>', diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index e22ff9c38..951e977bd 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -29,7 +29,7 @@ class SoundcloudIE(InfoExtractor):                              (?!sets/)(?P<title>[\w\d-]+)/?                              (?P<token>[^?]+?)?(?:[?].*)?$)                         |(?:api\.soundcloud\.com/tracks/(?P<track_id>\d+)) -                       |(?P<widget>w\.soundcloud\.com/player/?.*?url=.*) +                       |(?P<player>(?:w|player|p.)\.soundcloud\.com/player/?.*?url=.*)                      )                      '''      IE_NAME = u'soundcloud' @@ -193,7 +193,7 @@ class SoundcloudIE(InfoExtractor):          if track_id is not None:              info_json_url = 'http://api.soundcloud.com/tracks/' + track_id + '.json?client_id=' + self._CLIENT_ID              full_title = track_id -        elif mobj.group('widget'): +        elif mobj.group('player'):              query = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)              return self.url_result(query['url'][0], ie='Soundcloud')          else: diff --git a/youtube_dl/extractor/spiegel.py b/youtube_dl/extractor/spiegel.py index 695520524..051a34d5b 100644 --- a/youtube_dl/extractor/spiegel.py +++ b/youtube_dl/extractor/spiegel.py @@ -51,9 +51,10 @@ class SpiegelIE(InfoExtractor):              # Blacklist type 6, it's extremely LQ and not available on the same server              if n.tag.startswith('type') and n.tag != 'type6'          ] -        formats.sort(key=lambda f: f['vbr'])          duration = float(idoc[0].findall('./duration')[0].text) +        self._sort_formats(formats) +          info = {              'id': video_id,              'title': video_title, diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index cec65261b..23172143e 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -55,15 +55,21 @@ class ThePlatformIE(InfoExtractor):          formats = []          for f in switch.findall(_x('smil:video')):              attr = f.attrib +            width = int(attr['width']) +            height = int(attr['height']) +            vbr = int(attr['system-bitrate']) // 1000 +            format_id = '%dx%d_%dk' % (width, height, vbr)              formats.append({ +                'format_id': format_id,                  'url': base_url,                  'play_path': 'mp4:' + attr['src'],                  'ext': 'flv', -                'width': int(attr['width']), -                'height': int(attr['height']), -                'vbr': int(attr['system-bitrate']), +                'width': width, +                'height': height, +                'vbr': vbr,              }) -        formats.sort(key=lambda f: (f['height'], f['width'], f['vbr'])) + +        self._sort_formats(formats)          return {              'id': video_id, diff --git a/youtube_dl/extractor/veehd.py b/youtube_dl/extractor/veehd.py index 3cf8c853d..b1c854a64 100644 --- a/youtube_dl/extractor/veehd.py +++ b/youtube_dl/extractor/veehd.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals +  import re  import json @@ -8,16 +10,17 @@ from ..utils import (      clean_html,  ) +  class VeeHDIE(InfoExtractor):      _VALID_URL = r'https?://veehd\.com/video/(?P<id>\d+)'      _TEST = { -        u'url': u'http://veehd.com/video/4686958', -        u'file': u'4686958.mp4', -        u'info_dict': { -            u'title': u'Time Lapse View from Space ( ISS)', -            u'uploader_id': u'spotted', -            u'description': u'md5:f0094c4cf3a72e22bc4e4239ef767ad7', +        'url': 'http://veehd.com/video/4686958', +        'file': '4686958.mp4', +        'info_dict': { +            'title': 'Time Lapse View from Space ( ISS)', +            'uploader_id': 'spotted', +            'description': 'md5:f0094c4cf3a72e22bc4e4239ef767ad7',          },      } @@ -25,24 +28,30 @@ class VeeHDIE(InfoExtractor):          mobj = re.match(self._VALID_URL, url)          video_id = mobj.group('id') +        # VeeHD seems to send garbage on the first request. +        # See https://github.com/rg3/youtube-dl/issues/2102 +        self._download_webpage(url, video_id, 'Requesting webpage')          webpage = self._download_webpage(url, video_id) -        player_path = self._search_regex(r'\$\("#playeriframe"\).attr\({src : "(.+?)"', -            webpage, u'player path') +        player_path = self._search_regex( +            r'\$\("#playeriframe"\).attr\({src : "(.+?)"', +            webpage, 'player path')          player_url = compat_urlparse.urljoin(url, player_path) -        player_page = self._download_webpage(player_url, video_id, -            u'Downloading player page') -        config_json = self._search_regex(r'value=\'config=({.+?})\'', -            player_page, u'config json') + +        self._download_webpage(player_url, video_id, 'Requesting player page') +        player_page = self._download_webpage( +            player_url, video_id, 'Downloading player page') +        config_json = self._search_regex( +            r'value=\'config=({.+?})\'', player_page, 'config json')          config = json.loads(config_json)          video_url = compat_urlparse.unquote(config['clip']['url'])          title = clean_html(get_element_by_id('videoName', webpage).rpartition('|')[0])          uploader_id = self._html_search_regex(r'<a href="/profile/\d+">(.+?)</a>', -            webpage, u'uploader') +            webpage, 'uploader')          thumbnail = self._search_regex(r'<img id="veehdpreview" src="(.+?)"', -            webpage, u'thumbnail') +            webpage, 'thumbnail')          description = self._html_search_regex(r'<td class="infodropdown".*?<div>(.*?)<ul', -            webpage, u'description', flags=re.DOTALL) +            webpage, 'description', flags=re.DOTALL)          return {              '_type': 'video', diff --git a/youtube_dl/extractor/veoh.py b/youtube_dl/extractor/veoh.py index 00672c9e5..baa57f343 100644 --- a/youtube_dl/extractor/veoh.py +++ b/youtube_dl/extractor/veoh.py @@ -1,22 +1,22 @@ +from __future__ import unicode_literals +  import re  import json  from .common import InfoExtractor -from ..utils import ( -    determine_ext, -) +  class VeohIE(InfoExtractor): -    _VALID_URL = r'http://www\.veoh\.com/watch/v(?P<id>\d*)' +    _VALID_URL = r'http://(?:www\.)?veoh\.com/(?:watch|iphone/#_Watch)/v(?P<id>\d*)'      _TEST = { -        u'url': u'http://www.veoh.com/watch/v56314296nk7Zdmz3', -        u'file': u'56314296.mp4', -        u'md5': u'620e68e6a3cff80086df3348426c9ca3', -        u'info_dict': { -            u'title': u'Straight Backs Are Stronger', -            u'uploader': u'LUMOback', -            u'description': u'At LUMOback, we believe straight backs are stronger.  The LUMOback Posture & Movement Sensor:  It gently vibrates when you slouch, inspiring improved posture and mobility.  Use the app to track your data and improve your posture over time. ', +        'url': 'http://www.veoh.com/watch/v56314296nk7Zdmz3', +        'file': '56314296.mp4', +        'md5': '620e68e6a3cff80086df3348426c9ca3', +        'info_dict': { +            'title': 'Straight Backs Are Stronger', +            'uploader': 'LUMOback', +            'description': 'At LUMOback, we believe straight backs are stronger.  The LUMOback Posture & Movement Sensor:  It gently vibrates when you slouch, inspiring improved posture and mobility.  Use the app to track your data and improve your posture over time. ',          }      } @@ -28,20 +28,20 @@ class VeohIE(InfoExtractor):          m_youtube = re.search(r'http://www\.youtube\.com/v/(.*?)(\&|")', webpage)          if m_youtube is not None:              youtube_id = m_youtube.group(1) -            self.to_screen(u'%s: detected Youtube video.' % video_id) +            self.to_screen('%s: detected Youtube video.' % video_id)              return self.url_result(youtube_id, 'Youtube')          self.report_extraction(video_id)          info = self._search_regex(r'videoDetailsJSON = \'({.*?})\';', webpage, 'info')          info = json.loads(info) -        video_url =  info.get('fullPreviewHashHighPath') or info.get('fullPreviewHashLowPath') - -        return {'id': info['videoId'],  -                'title': info['title'], -                'ext': determine_ext(video_url), -                'url': video_url, -                'uploader': info['username'], -                'thumbnail': info.get('highResImage') or info.get('medResImage'), -                'description': info['description'], -                'view_count': info['views'], -                } +        video_url = info.get('fullPreviewHashHighPath') or info.get('fullPreviewHashLowPath') + +        return { +            'id': info['videoId'], +            'title': info['title'], +            'url': video_url, +            'uploader': info['username'], +            'thumbnail': info.get('highResImage') or info.get('medResImage'), +            'description': info['description'], +            'view_count': info['views'], +        } diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index c3623fcbe..ad86d033a 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -1,4 +1,6 @@  # encoding: utf-8 +from __future__ import unicode_literals +  import json  import re  import itertools @@ -31,54 +33,55 @@ class VimeoIE(InfoExtractor):          (?P<id>[0-9]+)          /?(?:[?&].*)?(?:[#].*)?$'''      _NETRC_MACHINE = 'vimeo' -    IE_NAME = u'vimeo' +    IE_NAME = 'vimeo'      _TESTS = [          { -            u'url': u'http://vimeo.com/56015672#at=0', -            u'file': u'56015672.mp4', -            u'md5': u'8879b6cc097e987f02484baf890129e5', -            u'info_dict': { -                u"upload_date": u"20121220",  -                u"description": u"This is a test case for youtube-dl.\nFor more information, see github.com/rg3/youtube-dl\nTest chars: \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550",  -                u"uploader_id": u"user7108434",  -                u"uploader": u"Filippo Valsorda",  -                u"title": u"youtube-dl test video - \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550", +            'url': 'http://vimeo.com/56015672#at=0', +            'file': '56015672.mp4', +            'md5': '8879b6cc097e987f02484baf890129e5', +            'info_dict': { +                "upload_date": "20121220",  +                "description": "This is a test case for youtube-dl.\nFor more information, see github.com/rg3/youtube-dl\nTest chars: \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550",  +                "uploader_id": "user7108434",  +                "uploader": "Filippo Valsorda",  +                "title": "youtube-dl test video - \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550",              },          },          { -            u'url': u'http://vimeopro.com/openstreetmapus/state-of-the-map-us-2013/video/68093876', -            u'file': u'68093876.mp4', -            u'md5': u'3b5ca6aa22b60dfeeadf50b72e44ed82', -            u'note': u'Vimeo Pro video (#1197)', -            u'info_dict': { -                u'uploader_id': u'openstreetmapus',  -                u'uploader': u'OpenStreetMap US',  -                u'title': u'Andy Allan - Putting the Carto into OpenStreetMap Cartography', +            'url': 'http://vimeopro.com/openstreetmapus/state-of-the-map-us-2013/video/68093876', +            'file': '68093876.mp4', +            'md5': '3b5ca6aa22b60dfeeadf50b72e44ed82', +            'note': 'Vimeo Pro video (#1197)', +            'info_dict': { +                'uploader_id': 'openstreetmapus', +                'uploader': 'OpenStreetMap US', +                'title': 'Andy Allan - Putting the Carto into OpenStreetMap Cartography',              },          },          { -            u'url': u'http://player.vimeo.com/video/54469442', -            u'file': u'54469442.mp4', -            u'md5': u'619b811a4417aa4abe78dc653becf511', -            u'note': u'Videos that embed the url in the player page', -            u'info_dict': { -                u'title': u'Kathy Sierra: Building the minimum Badass User, Business of Software', -                u'uploader': u'The BLN & Business of Software', +            'url': 'http://player.vimeo.com/video/54469442', +            'file': '54469442.mp4', +            'md5': '619b811a4417aa4abe78dc653becf511', +            'note': 'Videos that embed the url in the player page', +            'info_dict': { +                'title': 'Kathy Sierra: Building the minimum Badass User, Business of Software', +                'uploader': 'The BLN & Business of Software', +                'uploader_id': 'theblnbusinessofsoftware',              },          },          { -            u'url': u'http://vimeo.com/68375962', -            u'file': u'68375962.mp4', -            u'md5': u'aaf896bdb7ddd6476df50007a0ac0ae7', -            u'note': u'Video protected with password', -            u'info_dict': { -                u'title': u'youtube-dl password protected test video', -                u'upload_date': u'20130614', -                u'uploader_id': u'user18948128', -                u'uploader': u'Jaime Marquínez Ferrándiz', +            'url': 'http://vimeo.com/68375962', +            'file': '68375962.mp4', +            'md5': 'aaf896bdb7ddd6476df50007a0ac0ae7', +            'note': 'Video protected with password', +            'info_dict': { +                'title': 'youtube-dl password protected test video', +                'upload_date': '20130614', +                'uploader_id': 'user18948128', +                'uploader': 'Jaime Marquínez Ferrándiz',              }, -            u'params': { -                u'videopassword': u'youtube-dl', +            'params': { +                'videopassword': 'youtube-dl',              },          },      ] @@ -90,7 +93,7 @@ class VimeoIE(InfoExtractor):          self.report_login()          login_url = 'https://vimeo.com/log_in'          webpage = self._download_webpage(login_url, None, False) -        token = re.search(r'xsrft: \'(.*?)\'', webpage).group(1) +        token = self._search_regex(r'xsrft: \'(.*?)\'', webpage, 'login token')          data = compat_urllib_parse.urlencode({'email': username,                                                'password': password,                                                'action': 'login', @@ -100,13 +103,13 @@ class VimeoIE(InfoExtractor):          login_request = compat_urllib_request.Request(login_url, data)          login_request.add_header('Content-Type', 'application/x-www-form-urlencoded')          login_request.add_header('Cookie', 'xsrft=%s' % token) -        self._download_webpage(login_request, None, False, u'Wrong login info') +        self._download_webpage(login_request, None, False, 'Wrong login info')      def _verify_video_password(self, url, video_id, webpage):          password = self._downloader.params.get('videopassword', None)          if password is None: -            raise ExtractorError(u'This video is protected by a password, use the --video-password option') -        token = re.search(r'xsrft: \'(.*?)\'', webpage).group(1) +            raise ExtractorError('This video is protected by a password, use the --video-password option') +        token = self._search_regex(r'xsrft: \'(.*?)\'', webpage, 'login token')          data = compat_urllib_parse.urlencode({'password': password,                                                'token': token})          # I didn't manage to use the password with https @@ -118,8 +121,8 @@ class VimeoIE(InfoExtractor):          password_request.add_header('Content-Type', 'application/x-www-form-urlencoded')          password_request.add_header('Cookie', 'xsrft=%s' % token)          self._download_webpage(password_request, video_id, -                               u'Verifying the password', -                               u'Wrong password') +                               'Verifying the password', +                               'Wrong password')      def _real_initialize(self):          self._login() @@ -134,7 +137,7 @@ class VimeoIE(InfoExtractor):          # Extract ID from URL          mobj = re.match(self._VALID_URL, url)          if mobj is None: -            raise ExtractorError(u'Invalid URL: %s' % url) +            raise ExtractorError('Invalid URL: %s' % url)          video_id = mobj.group('id')          if mobj.group('pro') or mobj.group('player'): @@ -155,7 +158,7 @@ class VimeoIE(InfoExtractor):          try:              try:                  config_url = self._html_search_regex( -                    r' data-config-url="(.+?)"', webpage, u'config URL') +                    r' data-config-url="(.+?)"', webpage, 'config URL')                  config_json = self._download_webpage(config_url, video_id)                  config = json.loads(config_json)              except RegexNotFoundError: @@ -166,19 +169,23 @@ class VimeoIE(InfoExtractor):                      config_re = r'%s=({.+?});' % re.escape(m_variable_name.group(1))                  else:                      config_re = [r' = {config:({.+?}),assets:', r'(?:[abc])=({.+?});'] -                config = self._search_regex(config_re, webpage, u'info section', +                config = self._search_regex(config_re, webpage, 'info section',                      flags=re.DOTALL)                  config = json.loads(config)          except Exception as e:              if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage): -                raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option') +                raise ExtractorError('The author has restricted the access to this video, try with the "--referer" option')              if re.search('<form[^>]+?id="pw_form"', webpage) is not None:                  self._verify_video_password(url, video_id, webpage)                  return self._real_extract(url)              else: -                raise ExtractorError(u'Unable to extract info section', +                raise ExtractorError('Unable to extract info section',                                       cause=e) +        else: +            if config.get('view') == 4: +                self._verify_video_password(url, video_id, webpage) +                return self._real_extract(url)          # Extract title          video_title = config["video"]["title"] @@ -212,9 +219,9 @@ class VimeoIE(InfoExtractor):              video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)          try: -            view_count = int(self._search_regex(r'UserPlays:(\d+)', webpage, u'view count')) -            like_count = int(self._search_regex(r'UserLikes:(\d+)', webpage, u'like count')) -            comment_count = int(self._search_regex(r'UserComments:(\d+)', webpage, u'comment count')) +            view_count = int(self._search_regex(r'UserPlays:(\d+)', webpage, 'view count')) +            like_count = int(self._search_regex(r'UserLikes:(\d+)', webpage, 'like count')) +            comment_count = int(self._search_regex(r'UserComments:(\d+)', webpage, 'comment count'))          except RegexNotFoundError:              # This info is only available in vimeo.com/{id} urls              view_count = None @@ -255,7 +262,7 @@ class VimeoIE(InfoExtractor):          for key in ('other', 'sd', 'hd'):              formats += files[key]          if len(formats) == 0: -            raise ExtractorError(u'No known codec found') +            raise ExtractorError('No known codec found')          return {              'id':       video_id, @@ -274,7 +281,7 @@ class VimeoIE(InfoExtractor):  class VimeoChannelIE(InfoExtractor): -    IE_NAME = u'vimeo:channel' +    IE_NAME = 'vimeo:channel'      _VALID_URL = r'(?:https?://)?vimeo.\com/channels/(?P<id>[^/]+)'      _MORE_PAGES_INDICATOR = r'<a.+?rel="next"'      _TITLE_RE = r'<link rel="alternate"[^>]+?title="(.*?)"' @@ -283,14 +290,14 @@ class VimeoChannelIE(InfoExtractor):          return '%s/videos/page:%d/' % (base_url, pagenum)      def _extract_list_title(self, webpage): -        return self._html_search_regex(self._TITLE_RE, webpage, u'list title') +        return self._html_search_regex(self._TITLE_RE, webpage, 'list title')      def _extract_videos(self, list_id, base_url):          video_ids = []          for pagenum in itertools.count(1):              webpage = self._download_webpage(                  self._page_url(base_url, pagenum) ,list_id, -                u'Downloading page %s' % pagenum) +                'Downloading page %s' % pagenum)              video_ids.extend(re.findall(r'id="clip_(\d+?)"', webpage))              if re.search(self._MORE_PAGES_INDICATOR, webpage, re.DOTALL) is None:                  break @@ -310,8 +317,8 @@ class VimeoChannelIE(InfoExtractor):  class VimeoUserIE(VimeoChannelIE): -    IE_NAME = u'vimeo:user' -    _VALID_URL = r'(?:https?://)?vimeo.\com/(?P<name>[^/]+)' +    IE_NAME = 'vimeo:user' +    _VALID_URL = r'(?:https?://)?vimeo.\com/(?P<name>[^/]+)(?:[#?]|$)'      _TITLE_RE = r'<a[^>]+?class="user">([^<>]+?)</a>'      @classmethod @@ -327,7 +334,7 @@ class VimeoUserIE(VimeoChannelIE):  class VimeoAlbumIE(VimeoChannelIE): -    IE_NAME = u'vimeo:album' +    IE_NAME = 'vimeo:album'      _VALID_URL = r'(?:https?://)?vimeo.\com/album/(?P<id>\d+)'      _TITLE_RE = r'<header id="page_header">\n\s*<h1>(.*?)</h1>' @@ -336,12 +343,12 @@ class VimeoAlbumIE(VimeoChannelIE):      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url) -        album_id =  mobj.group('id') +        album_id = mobj.group('id')          return self._extract_videos(album_id, 'http://vimeo.com/album/%s' % album_id)  class VimeoGroupsIE(VimeoAlbumIE): -    IE_NAME = u'vimeo:group' +    IE_NAME = 'vimeo:group'      _VALID_URL = r'(?:https?://)?vimeo.\com/groups/(?P<name>[^/]+)'      def _extract_list_title(self, webpage): @@ -351,3 +358,24 @@ class VimeoGroupsIE(VimeoAlbumIE):          mobj = re.match(self._VALID_URL, url)          name = mobj.group('name')          return self._extract_videos(name, 'http://vimeo.com/groups/%s' % name) + + +class VimeoReviewIE(InfoExtractor): +    IE_NAME = 'vimeo:review' +    IE_DESC = 'Review pages on vimeo' +    _VALID_URL = r'(?:https?://)?vimeo.\com/[^/]+/review/(?P<id>[^/]+)' +    _TEST = { +        'url': 'https://vimeo.com/user21297594/review/75524534/3c257a1b5d', +        'file': '75524534.mp4', +        'md5': 'c507a72f780cacc12b2248bb4006d253', +        'info_dict': { +            'title': "DICK HARDWICK 'Comedian'", +            'uploader': 'Richard Hardwick', +        } +    } + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('id') +        player_url = 'https://player.vimeo.com/player/' + video_id +        return self.url_result(player_url, 'Vimeo', video_id) diff --git a/youtube_dl/extractor/wistia.py b/youtube_dl/extractor/wistia.py index e1748c261..bc31c2e64 100644 --- a/youtube_dl/extractor/wistia.py +++ b/youtube_dl/extractor/wistia.py @@ -44,8 +44,10 @@ class WistiaIE(InfoExtractor):                  'height': a['height'],                  'filesize': a['size'],                  'ext': a['ext'], +                'preference': 1 if atype == 'original' else None,              }) -        formats.sort(key=lambda a: a['filesize']) + +        self._sort_formats(formats)          return {              'id': video_id, diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py index 5c9c361b9..e17a39782 100644 --- a/youtube_dl/extractor/yahoo.py +++ b/youtube_dl/extractor/yahoo.py @@ -6,8 +6,8 @@ from .common import InfoExtractor, SearchInfoExtractor  from ..utils import (      compat_urllib_parse,      compat_urlparse, -    determine_ext,      clean_html, +    int_or_none,  ) @@ -68,9 +68,9 @@ class YahooIE(InfoExtractor):          formats = []          for s in info['streams']:              format_info = { -                'width': s.get('width'), -                'height': s.get('height'), -                'bitrate': s.get('bitrate'), +                'width': int_or_none(s.get('width')), +                'height': int_or_none(s.get('height')), +                'tbr': int_or_none(s.get('bitrate')),              }              host = s['host'] @@ -84,10 +84,10 @@ class YahooIE(InfoExtractor):              else:                  format_url = compat_urlparse.urljoin(host, path)                  format_info['url'] = format_url -                format_info['ext'] = determine_ext(format_url)              formats.append(format_info) -        formats = sorted(formats, key=lambda f:(f['height'], f['width'])) + +        self._sort_formats(formats)          return {              'id': video_id, diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py index bd0f2cae0..77ad423c4 100644 --- a/youtube_dl/extractor/youporn.py +++ b/youtube_dl/extractor/youporn.py @@ -1,5 +1,4 @@  import json -import os  import re  import sys @@ -16,6 +15,7 @@ from ..aes import (      aes_decrypt_text  ) +  class YouPornIE(InfoExtractor):      _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+))'      _TEST = { @@ -23,9 +23,9 @@ class YouPornIE(InfoExtractor):          u'file': u'505835.mp4',          u'md5': u'71ec5fcfddacf80f495efa8b6a8d9a89',          u'info_dict': { -            u"upload_date": u"20101221",  -            u"description": u"Love & Sex Answers: http://bit.ly/DanAndJenn -- Is It Unhealthy To Masturbate Daily?",  -            u"uploader": u"Ask Dan And Jennifer",  +            u"upload_date": u"20101221", +            u"description": u"Love & Sex Answers: http://bit.ly/DanAndJenn -- Is It Unhealthy To Masturbate Daily?", +            u"uploader": u"Ask Dan And Jennifer",              u"title": u"Sex Ed: Is It Safe To Masturbate Daily?",              u"age_limit": 18,          } @@ -71,38 +71,36 @@ class YouPornIE(InfoExtractor):              link = aes_decrypt_text(encrypted_link, video_title, 32).decode('utf-8')              links.append(link) -        if not links: -            raise ExtractorError(u'ERROR: no known formats available for video') -          formats = []          for link in links: -              # A link looks like this:              # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0              # A path looks like this:              # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4              video_url = unescapeHTML(link)              path = compat_urllib_parse_urlparse(video_url).path -            extension = os.path.splitext(path)[1][1:] -            format = path.split('/')[4].split('_')[:2] +            format_parts = path.split('/')[4].split('_')[:2] -            # size = format[0] -            # bitrate = format[1] -            format = "-".join(format) -            # title = u'%s-%s-%s' % (video_title, size, bitrate) +            dn = compat_urllib_parse_urlparse(video_url).netloc.partition('.')[0] + +            resolution = format_parts[0] +            height = int(resolution[:-len('p')]) +            bitrate = int(format_parts[1][:-len('k')]) +            format = u'-'.join(format_parts) + u'-' + dn              formats.append({                  'url': video_url, -                'ext': extension,                  'format': format,                  'format_id': format, +                'height': height, +                'tbr': bitrate, +                'resolution': resolution,              }) -        # Sort and remove doubles -        formats.sort(key=lambda format: list(map(lambda s: s.zfill(6), format['format'].split('-')))) -        for i in range(len(formats)-1,0,-1): -            if formats[i]['format_id'] == formats[i-1]['format_id']: -                del formats[i] +        self._sort_formats(formats) + +        if not formats: +            raise ExtractorError(u'ERROR: no known formats available for video')          return {              'id': video_id, diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index a68576547..9424d5e26 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -150,168 +150,69 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):                       (?(1).+)?                                                # if we found the ID, everything can follow                       $"""      _NEXT_URL_RE = r'[\?&]next_url=([^&]+)' -    # Listed in order of quality -    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '36', '17', '13', -                          # Apple HTTP Live Streaming -                          '96', '95', '94', '93', '92', '132', '151', -                          # 3D -                          '85', '84', '102', '83', '101', '82', '100', -                          # Dash video -                          '138', '137', '248', '136', '247', '135', '246', -                          '245', '244', '134', '243', '133', '242', '160', -                          # Dash audio -                          '141', '172', '140', '171', '139', -                          ] -    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '36', '17', '13', -                                      # Apple HTTP Live Streaming -                                      '96', '95', '94', '93', '92', '132', '151', -                                      # 3D -                                      '85', '102', '84', '101', '83', '100', '82', -                                      # Dash video -                                      '138', '248', '137', '247', '136', '246', '245', -                                      '244', '135', '243', '134', '242', '133', '160', -                                      # Dash audio -                                      '172', '141', '171', '140', '139', -                                      ] -    _video_formats_map = { -        'flv': ['35', '34', '6', '5'], -        '3gp': ['36', '17', '13'], -        'mp4': ['38', '37', '22', '18'], -        'webm': ['46', '45', '44', '43'], -    } -    _video_extensions = { -        '13': '3gp', -        '17': '3gp', -        '18': 'mp4', -        '22': 'mp4', -        '36': '3gp', -        '37': 'mp4', -        '38': 'mp4', -        '43': 'webm', -        '44': 'webm', -        '45': 'webm', -        '46': 'webm', +    _formats = { +        '5': {'ext': 'flv', 'width': 400, 'height': 240}, +        '6': {'ext': 'flv', 'width': 450, 'height': 270}, +        '13': {'ext': '3gp'}, +        '17': {'ext': '3gp', 'width': 176, 'height': 144}, +        '18': {'ext': 'mp4', 'width': 640, 'height': 360}, +        '22': {'ext': 'mp4', 'width': 1280, 'height': 720}, +        '34': {'ext': 'flv', 'width': 640, 'height': 360}, +        '35': {'ext': 'flv', 'width': 854, 'height': 480}, +        '36': {'ext': '3gp', 'width': 320, 'height': 240}, +        '37': {'ext': 'mp4', 'width': 1920, 'height': 1080}, +        '38': {'ext': 'mp4', 'width': 4096, 'height': 3072}, +        '43': {'ext': 'webm', 'width': 640, 'height': 360}, +        '44': {'ext': 'webm', 'width': 854, 'height': 480}, +        '45': {'ext': 'webm', 'width': 1280, 'height': 720}, +        '46': {'ext': 'webm', 'width': 1920, 'height': 1080}, +          # 3d videos -        '82': 'mp4', -        '83': 'mp4', -        '84': 'mp4', -        '85': 'mp4', -        '100': 'webm', -        '101': 'webm', -        '102': 'webm', +        '82': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': '3D', 'preference': -20}, +        '83': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': '3D', 'preference': -20}, +        '84': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': '3D', 'preference': -20}, +        '85': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': '3D', 'preference': -20}, +        '100': {'ext': 'webm', 'height': 360, 'resolution': '360p', 'format_note': '3D', 'preference': -20}, +        '101': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': '3D', 'preference': -20}, +        '102': {'ext': 'webm', 'height': 720, 'resolution': '720p', 'format_note': '3D', 'preference': -20},          # Apple HTTP Live Streaming -        '92': 'mp4', -        '93': 'mp4', -        '94': 'mp4', -        '95': 'mp4', -        '96': 'mp4', -        '132': 'mp4', -        '151': 'mp4', - -        # Dash mp4 -        '133': 'mp4', -        '134': 'mp4', -        '135': 'mp4', -        '136': 'mp4', -        '137': 'mp4', -        '138': 'mp4', -        '160': 'mp4', +        '92': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'HLS', 'preference': -10}, +        '93': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': 'HLS', 'preference': -10}, +        '94': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': 'HLS', 'preference': -10}, +        '95': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': 'HLS', 'preference': -10}, +        '96': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'HLS', 'preference': -10}, +        '132': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'HLS', 'preference': -10}, +        '151': {'ext': 'mp4', 'height': 72, 'resolution': '72p', 'format_note': 'HLS', 'preference': -10}, + +        # DASH mp4 video +        '133': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'DASH video', 'preference': -40}, +        '134': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': 'DASH video', 'preference': -40}, +        '135': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': 'DASH video', 'preference': -40}, +        '136': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': 'DASH video', 'preference': -40}, +        '137': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH video', 'preference': -40}, +        '138': {'ext': 'mp4', 'height': 1081, 'resolution': '>1080p', 'format_note': 'DASH video', 'preference': -40}, +        '160': {'ext': 'mp4', 'height': 192, 'resolution': '192p', 'format_note': 'DASH video', 'preference': -40}, +        '264': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH video', 'preference': -40},          # Dash mp4 audio -        '139': 'm4a', -        '140': 'm4a', -        '141': 'm4a', +        '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50}, +        '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50}, +        '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},          # Dash webm -        '171': 'webm', -        '172': 'webm', -        '242': 'webm', -        '243': 'webm', -        '244': 'webm', -        '245': 'webm', -        '246': 'webm', -        '247': 'webm', -        '248': 'webm', -    } -    _video_dimensions = { -        '5': '400x240', -        '6': '???', -        '13': '???', -        '17': '176x144', -        '18': '640x360', -        '22': '1280x720', -        '34': '640x360', -        '35': '854x480', -        '36': '320x240', -        '37': '1920x1080', -        '38': '4096x3072', -        '43': '640x360', -        '44': '854x480', -        '45': '1280x720', -        '46': '1920x1080', -        '82': '360p', -        '83': '480p', -        '84': '720p', -        '85': '1080p', -        '92': '240p', -        '93': '360p', -        '94': '480p', -        '95': '720p', -        '96': '1080p', -        '100': '360p', -        '101': '480p', -        '102': '720p', -        '132': '240p', -        '151': '72p', -        '133': '240p', -        '134': '360p', -        '135': '480p', -        '136': '720p', -        '137': '1080p', -        '138': '>1080p', -        '139': '48k', -        '140': '128k', -        '141': '256k', -        '160': '192p', -        '171': '128k', -        '172': '256k', -        '242': '240p', -        '243': '360p', -        '244': '480p', -        '245': '480p', -        '246': '480p', -        '247': '720p', -        '248': '1080p', -    } -    _special_itags = { -        '82': '3D', -        '83': '3D', -        '84': '3D', -        '85': '3D', -        '100': '3D', -        '101': '3D', -        '102': '3D', -        '133': 'DASH Video', -        '134': 'DASH Video', -        '135': 'DASH Video', -        '136': 'DASH Video', -        '137': 'DASH Video', -        '138': 'DASH Video', -        '139': 'DASH Audio', -        '140': 'DASH Audio', -        '141': 'DASH Audio', -        '160': 'DASH Video', -        '171': 'DASH Audio', -        '172': 'DASH Audio', -        '242': 'DASH Video', -        '243': 'DASH Video', -        '244': 'DASH Video', -        '245': 'DASH Video', -        '246': 'DASH Video', -        '247': 'DASH Video', -        '248': 'DASH Video', +        '242': {'ext': 'webm', 'height': 240, 'resolution': '240p', 'format_note': 'DASH webm', 'preference': -40}, +        '243': {'ext': 'webm', 'height': 360, 'resolution': '360p', 'format_note': 'DASH webm', 'preference': -40}, +        '244': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40}, +        '245': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40}, +        '246': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40}, +        '247': {'ext': 'webm', 'height': 720, 'resolution': '720p', 'format_note': 'DASH webm', 'preference': -40}, +        '248': {'ext': 'webm', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH webm', 'preference': -40}, + +        # Dash webm audio +        '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH webm audio', 'abr': 48, 'preference': -50}, +        '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH webm audio', 'abr': 256, 'preference': -50},      }      IE_NAME = u'youtube' @@ -1153,13 +1054,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):              self._downloader.report_warning(err_msg)              return {} -    def _print_formats(self, formats): -        print('Available formats:') -        for x in formats: -            print('%s\t:\t%s\t[%s]%s' %(x, self._video_extensions.get(x, 'flv'), -                                        self._video_dimensions.get(x, '???'), -                                        ' ('+self._special_itags[x]+')' if x in self._special_itags else '')) -      def _extract_id(self, url):          mobj = re.match(self._VALID_URL, url, re.VERBOSE)          if mobj is None: @@ -1172,48 +1066,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):          Transform a dictionary in the format {itag:url} to a list of (itag, url)          with the requested formats.          """ -        req_format = self._downloader.params.get('format', None) -        format_limit = self._downloader.params.get('format_limit', None) -        available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats -        if format_limit is not None and format_limit in available_formats: -            format_list = available_formats[available_formats.index(format_limit):] -        else: -            format_list = available_formats -        existing_formats = [x for x in format_list if x in url_map] +        existing_formats = [x for x in self._formats if x in url_map]          if len(existing_formats) == 0:              raise ExtractorError(u'no known formats available for video') -        if self._downloader.params.get('listformats', None): -            self._print_formats(existing_formats) -            return -        if req_format is None or req_format == 'best': -            video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality -        elif req_format == 'worst': -            video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality -        elif req_format in ('-1', 'all'): -            video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats -        else: -            # Specific formats. We pick the first in a slash-delimeted sequence. -            # Format can be specified as itag or 'mp4' or 'flv' etc. We pick the highest quality -            # available in the specified format. For example, -            # if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'. -            # if '1/mp4/3/4' is requested and '1' and '5' (is a mp4) are available, we pick '1'. -            # if '1/mp4/3/4' is requested and '4' and '5' (is a mp4) are available, we pick '5'. -            req_formats = req_format.split('/') -            video_url_list = None -            for rf in req_formats: -                if rf in url_map: -                    video_url_list = [(rf, url_map[rf])] -                    break -                if rf in self._video_formats_map: -                    for srf in self._video_formats_map[rf]: -                        if srf in url_map: -                            video_url_list = [(srf, url_map[srf])] -                            break -                    else: -                        continue -                    break -            if video_url_list is None: -                raise ExtractorError(u'requested format not available') +        video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats +        video_url_list.reverse() # order worst to best          return video_url_list      def _extract_from_m3u8(self, manifest_url, video_id): @@ -1462,50 +1319,43 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):                          url += '&ratebypass=yes'                      url_map[url_data['itag'][0]] = url              video_url_list = self._get_video_url_list(url_map) -            if not video_url_list: -                return          elif video_info.get('hlsvp'):              manifest_url = video_info['hlsvp'][0]              url_map = self._extract_from_m3u8(manifest_url, video_id)              video_url_list = self._get_video_url_list(url_map) -            if not video_url_list: -                return -          else:              raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info') -        results = [] +        formats = []          for itag, video_real_url in video_url_list: -            # Extension -            video_extension = self._video_extensions.get(itag, 'flv') - -            video_format = '{0} - {1}{2}'.format(itag if itag else video_extension, -                                              self._video_dimensions.get(itag, '???'), -                                              ' ('+self._special_itags[itag]+')' if itag in self._special_itags else '') - -            results.append({ -                'id':       video_id, -                'url':      video_real_url, -                'uploader': video_uploader, -                'uploader_id': video_uploader_id, -                'upload_date':  upload_date, -                'title':    video_title, -                'ext':      video_extension, -                'format':   video_format, +            dct = {                  'format_id': itag, -                'thumbnail':    video_thumbnail, -                'description':  video_description, -                'player_url':   player_url, -                'subtitles':    video_subtitles, -                'duration':     video_duration, -                'age_limit':    18 if age_gate else 0, -                'annotations':  video_annotations, -                'webpage_url': 'https://www.youtube.com/watch?v=%s' % video_id, -                'view_count': view_count, -                'like_count': like_count, -                'dislike_count': dislike_count, -            }) -        return results +                'url': video_real_url, +                'player_url': player_url, +            } +            dct.update(self._formats[itag]) +            formats.append(dct) + +        self._sort_formats(formats) + +        return { +            'id':           video_id, +            'uploader':     video_uploader, +            'uploader_id':  video_uploader_id, +            'upload_date':  upload_date, +            'title':        video_title, +            'thumbnail':    video_thumbnail, +            'description':  video_description, +            'subtitles':    video_subtitles, +            'duration':     video_duration, +            'age_limit':    18 if age_gate else 0, +            'annotations':  video_annotations, +            'webpage_url': 'https://www.youtube.com/watch?v=%s' % video_id, +            'view_count':   view_count, +            'like_count': like_count, +            'dislike_count': dislike_count, +            'formats':      formats, +        }  class YoutubePlaylistIE(YoutubeBaseInfoExtractor):      IE_DESC = u'YouTube.com playlists' diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py index 35ece354a..829f002cf 100644 --- a/youtube_dl/extractor/zdf.py +++ b/youtube_dl/extractor/zdf.py @@ -1,10 +1,10 @@  # coding: utf-8 -import operator  import re  from .common import InfoExtractor  from ..utils import ( +    int_or_none,      unified_strdate,  ) @@ -67,29 +67,13 @@ class ZDFIE(InfoExtractor):              ''', format_id)              ext = format_m.group('container') -            is_supported = ext != 'f4f' - -            PROTO_ORDER = ['http', 'rtmp', 'rtsp'] -            try: -                proto_pref = -PROTO_ORDER.index(format_m.group('proto')) -            except ValueError: -                proto_pref = -999 +            proto = format_m.group('proto').lower()              quality = fnode.find('./quality').text -            QUALITY_ORDER = ['veryhigh', '300', 'high', 'med', 'low'] -            try: -                quality_pref = -QUALITY_ORDER.index(quality) -            except ValueError: -                quality_pref = -999 -              abr = int(fnode.find('./audioBitrate').text) // 1000              vbr = int(fnode.find('./videoBitrate').text) // 1000 -            pref = (is_available, is_supported, -                    proto_pref, quality_pref, vbr, abr)              format_note = u'' -            if not is_supported: -                format_note += u'(unsupported)'              if not format_note:                  format_note = None @@ -101,18 +85,20 @@ class ZDFIE(InfoExtractor):                  'vcodec': format_m.group('vcodec'),                  'abr': abr,                  'vbr': vbr, -                'width': int(fnode.find('./width').text), -                'height': int(fnode.find('./height').text), -                'filesize': int(fnode.find('./filesize').text), +                'width': int_or_none(fnode.find('./width').text), +                'height': int_or_none(fnode.find('./height').text), +                'filesize': int_or_none(fnode.find('./filesize').text),                  'format_note': format_note, -                '_pref': pref, +                'protocol': proto,                  '_available': is_available,              }          format_nodes = doc.findall('.//formitaeten/formitaet') -        formats = sorted(filter(lambda f: f['_available'], -                                map(xml_to_format, format_nodes)), -                         key=operator.itemgetter('_pref')) +        formats = list(filter( +            lambda f: f['_available'], +            map(xml_to_format, format_nodes))) + +        self._sort_formats(formats)          return {              'id': video_id, diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 20ebea38c..a509f8e2f 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -500,12 +500,13 @@ def unescapeHTML(s):      result = re.sub(u'(?u)&(.+?);', htmlentity_transform, s)      return result -def encodeFilename(s): + +def encodeFilename(s, for_subprocess=False):      """      @param s The name of the file      """ -    assert type(s) == type(u'') +    assert type(s) == compat_str      # Python 3 has a Unicode API      if sys.version_info >= (3, 0): @@ -515,12 +516,18 @@ def encodeFilename(s):          # Pass u'' directly to use Unicode APIs on Windows 2000 and up          # (Detecting Windows NT 4 is tricky because 'major >= 4' would          # match Windows 9x series as well. Besides, NT 4 is obsolete.) -        return s +        if not for_subprocess: +            return s +        else: +            # For subprocess calls, encode with locale encoding +            # Refer to http://stackoverflow.com/a/9951851/35070 +            encoding = preferredencoding()      else:          encoding = sys.getfilesystemencoding() -        if encoding is None: -            encoding = 'utf-8' -        return s.encode(encoding, 'ignore') +    if encoding is None: +        encoding = 'utf-8' +    return s.encode(encoding, 'ignore') +  def decodeOption(optval):      if optval is None: @@ -539,7 +546,8 @@ def formatSeconds(secs):      else:          return '%d' % secs -def make_HTTPS_handler(opts_no_check_certificate): + +def make_HTTPS_handler(opts_no_check_certificate, **kwargs):      if sys.version_info < (3, 2):          import httplib @@ -560,7 +568,7 @@ def make_HTTPS_handler(opts_no_check_certificate):          class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):              def https_open(self, req):                  return self.do_open(HTTPSConnectionV3, req) -        return HTTPSHandlerV3() +        return HTTPSHandlerV3(**kwargs)      else:          context = ssl.SSLContext(ssl.PROTOCOL_SSLv3)          context.verify_mode = (ssl.CERT_NONE @@ -571,7 +579,7 @@ def make_HTTPS_handler(opts_no_check_certificate):              context.load_default_certs()          except AttributeError:              pass  # Python < 3.4 -        return compat_urllib_request.HTTPSHandler(context=context) +        return compat_urllib_request.HTTPSHandler(context=context, **kwargs)  class ExtractorError(Exception):      """Error during info extraction.""" @@ -756,6 +764,7 @@ def unified_strdate(date_str):          '%Y-%m-%d',          '%d/%m/%Y',          '%Y/%m/%d %H:%M:%S', +        '%Y-%m-%d %H:%M:%S',          '%d.%m.%Y %H:%M',          '%Y-%m-%dT%H:%M:%SZ',          '%Y-%m-%dT%H:%M:%S.%fZ', @@ -858,12 +867,22 @@ def platform_name():  def write_string(s, out=None):      if out is None:          out = sys.stderr -    assert type(s) == type(u'') +    assert type(s) == compat_str      if ('b' in getattr(out, 'mode', '') or              sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr          s = s.encode(preferredencoding(), 'ignore') -    out.write(s) +    try: +        out.write(s) +    except UnicodeEncodeError: +        # In Windows shells, this can fail even when the codec is just charmap!? +        # See https://wiki.python.org/moin/PrintFails#Issue +        if sys.platform == 'win32' and hasattr(out, 'encoding'): +            s = s.encode(out.encoding, 'ignore').decode(out.encoding) +            out.write(s) +        else: +            raise +      out.flush() @@ -1017,9 +1036,9 @@ def smuggle_url(url, data):      return url + u'#' + sdata -def unsmuggle_url(smug_url): +def unsmuggle_url(smug_url, default=None):      if not '#__youtubedl_smuggle' in smug_url: -        return smug_url, None +        return smug_url, default      url, _, sdata = smug_url.rpartition(u'#')      jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]      data = json.loads(jsond) @@ -1079,7 +1098,7 @@ def fix_xml_all_ampersand(xml_str):  def setproctitle(title): -    assert isinstance(title, type(u'')) +    assert isinstance(title, compat_str)      try:          libc = ctypes.cdll.LoadLibrary("libc.so.6")      except OSError: @@ -1107,3 +1126,28 @@ def url_basename(url):  class HEADRequest(compat_urllib_request.Request):      def get_method(self):          return "HEAD" + + +def int_or_none(v): +    return v if v is None else int(v) + + +def parse_duration(s): +    if s is None: +        return None + +    m = re.match( +        r'(?:(?:(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)$', s) +    if not m: +        return None +    res = int(m.group('secs')) +    if m.group('mins'): +        res += int(m.group('mins')) * 60 +        if m.group('hours'): +            res += int(m.group('hours')) * 60 * 60 +    return res + + +def prepend_extension(filename, ext): +    name, real_ext = os.path.splitext(filename)  +    return u'{0}.{1}{2}'.format(name, ext, real_ext) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index c13af8abd..246233e7e 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.12.20' +__version__ = '2014.01.07' | 
