diff options
79 files changed, 1805 insertions, 617 deletions
| diff --git a/.gitignore b/.gitignore index 0422adf44..26dbde73d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@  *.pyc  *.pyo +*.class  *~  *.DS_Store  wine-py2exe/ @@ -32,4 +33,4 @@ test/testdata  .tox  youtube-dl.zsh  .idea -.idea/*
\ No newline at end of file +.idea/* @@ -161,3 +161,5 @@ Jens Wille  Robin Houtevelts  Patrick Griffis  Aidan Rowe +mutantmonkey +Ben Congdon @@ -3,6 +3,7 @@ all: youtube-dl README.md CONTRIBUTING.md README.txt youtube-dl.1 youtube-dl.bas  clean:  	rm -rf youtube-dl.1.temp.md youtube-dl.1 youtube-dl.bash-completion README.txt MANIFEST build/ dist/ .coverage cover/ youtube-dl.tar.gz youtube-dl.zsh youtube-dl.fish *.dump *.part *.info.json *.mp4 *.flv *.mp3 *.avi CONTRIBUTING.md.tmp youtube-dl youtube-dl.exe  	find . -name "*.pyc" -delete +	find . -name "*.class" -delete  PREFIX ?= /usr/local  BINDIR ?= $(PREFIX)/bin @@ -44,7 +45,7 @@ test:  ot: offlinetest  offlinetest: codetest -	nosetests --verbose test --exclude test_download.py --exclude test_age_restriction.py --exclude test_subtitles.py --exclude test_write_annotations.py --exclude test_youtube_lists.py --exclude test_iqiyi_sdk_interpreter.py +	$(PYTHON) -m nose --verbose test --exclude test_download.py --exclude test_age_restriction.py --exclude test_subtitles.py --exclude test_write_annotations.py --exclude test_youtube_lists.py --exclude test_iqiyi_sdk_interpreter.py  tar: youtube-dl.tar.gz @@ -80,6 +80,8 @@ which means you can modify it, redistribute it or use it however you like.                                       on Windows)      --flat-playlist                  Do not extract the videos of a playlist,                                       only list them. +    --mark-watched                   Mark videos watched (YouTube only) +    --no-mark-watched                Do not mark videos watched (YouTube only)      --no-color                       Do not emit color codes in output  ## Network Options: @@ -179,7 +181,7 @@ which means you can modify it, redistribute it or use it however you like.                                       to play it)      --external-downloader COMMAND    Use the specified external downloader.                                       
Currently supports -                                     aria2c,axel,curl,httpie,wget +                                     aria2c,avconv,axel,curl,ffmpeg,httpie,wget      --external-downloader-args ARGS  Give these arguments to the external                                       downloader diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 43403233d..a6dcc2576 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -54,6 +54,7 @@   - **AtresPlayer**   - **ATTTechChannel**   - **AudiMedia** + - **AudioBoom**   - **audiomack**   - **audiomack:album**   - **Azubu** @@ -167,6 +168,8 @@   - **Dump**   - **Dumpert**   - **dvtv**: http://video.aktualne.cz/ + - **dw** + - **dw:article**   - **EaglePlatform**   - **EbaumsWorld**   - **EchoMsk** @@ -190,10 +193,10 @@   - **ExpoTV**   - **ExtremeTube**   - **facebook** - - **facebook:post**   - **faz.net**   - **fc2**   - **Fczenit** + - **features.aol.com**   - **fernsehkritik.tv**   - **Firstpost**   - **FiveTV** @@ -293,6 +296,7 @@   - **kontrtube**: KontrTube.ru - Труба зовёт   - **KrasView**: Красвью   - **Ku6** + - **KUSI**   - **kuwo:album**: 酷我音乐 - 专辑   - **kuwo:category**: 酷我音乐 - 分类   - **kuwo:chart**: 酷我音乐 - 排行榜 @@ -301,12 +305,11 @@   - **kuwo:song**: 酷我音乐   - **la7.tv**   - **Laola1Tv** + - **Le**: 乐视网   - **Lecture2Go**   - **Lemonde** - - **Letv**: 乐视网 + - **LePlaylist**   - **LetvCloud**: 乐视云 - - **LetvPlaylist** - - **LetvTv**   - **Libsyn**   - **life:embed**   - **lifenews**: LIFE | NEWS @@ -324,6 +327,7 @@   - **m6**   - **macgamestore**: MacGameStore trailers   - **mailru**: Видео@Mail.Ru + - **MakersChannel**   - **MakerTV**   - **Malemotion**   - **MatchTV** @@ -334,6 +338,7 @@   - **Mgoon**   - **Minhateca**   - **MinistryGrid** + - **Minoto**   - **miomio.tv**   - **MiTele**: mitele.es   - **mixcloud** @@ -421,6 +426,7 @@   - **Npr**   - **NRK**   - **NRKPlaylist** + - **NRKSkole**: NRK Skole   - **NRKTV**: NRK TV and NRK Radio   - **ntv.ru**   - **Nuvid** @@ -669,6 +675,7 
@@   - **UDNEmbed**: 聯合影音   - **Unistra**   - **Urort**: NRK P3 Urørt + - **USAToday**   - **ustream**   - **ustream:channel**   - **Ustudio** @@ -682,6 +689,7 @@   - **VGTV**: VGTV, BTTV, FTV, Aftenposten and Aftonbladet   - **vh1.com**   - **Vice** + - **ViceShow**   - **Viddler**   - **video.google:search**: Google Video search   - **video.mit.edu** @@ -709,6 +717,7 @@   - **vimeo:channel**   - **vimeo:group**   - **vimeo:likes**: Vimeo user likes + - **vimeo:ondemand**   - **vimeo:review**: Review pages on vimeo   - **vimeo:user**   - **vimeo:watchlater**: Vimeo watch later list, "vimeowatchlater" keyword (requires authentication) diff --git a/test/helper.py b/test/helper.py index bdd7acca4..f2d878212 100644 --- a/test/helper.py +++ b/test/helper.py @@ -11,8 +11,11 @@ import sys  import youtube_dl.extractor  from youtube_dl import YoutubeDL -from youtube_dl.utils import ( +from youtube_dl.compat import ( +    compat_os_name,      compat_str, +) +from youtube_dl.utils import (      preferredencoding,      write_string,  ) @@ -42,7 +45,7 @@ def report_warning(message):      Print the message to stderr, it will be prefixed with 'WARNING:'      If stderr is a tty file the 'WARNING:' will be colored      ''' -    if sys.stderr.isatty() and os.name != 'nt': +    if sys.stderr.isatty() and compat_os_name != 'nt':          _msg_header = '\033[0;33mWARNING:\033[0m'      else:          _msg_header = 'WARNING:' diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 59f7ab49d..efbee3b71 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -502,6 +502,9 @@ class TestYoutubeDL(unittest.TestCase):          assertRegexpMatches(self, ydl._format_note({              'vbr': 10,          }), '^\s*10k$') +        assertRegexpMatches(self, ydl._format_note({ +            'fps': 30, +        }), '^30fps$')      def test_postprocessors(self):          filename = 'post-processor-testfile.mp4' diff --git a/test/test_http.py b/test/test_http.py index 
f2e305b6f..fc59b1aed 100644 --- a/test/test_http.py +++ b/test/test_http.py @@ -52,7 +52,12 @@ class TestHTTP(unittest.TestCase):              ('localhost', 0), HTTPTestRequestHandler)          self.httpd.socket = ssl.wrap_socket(              self.httpd.socket, certfile=certfn, server_side=True) -        self.port = self.httpd.socket.getsockname()[1] +        if os.name == 'java': +            # In Jython SSLSocket is not a subclass of socket.socket +            sock = self.httpd.socket.sock +        else: +            sock = self.httpd.socket +        self.port = sock.getsockname()[1]          self.server_thread = threading.Thread(target=self.httpd.serve_forever)          self.server_thread.daemon = True          self.server_thread.start() diff --git a/test/test_utils.py b/test/test_utils.py index cb85e18f0..5a0109977 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -42,6 +42,7 @@ from youtube_dl.utils import (      orderedSet,      parse_duration,      parse_filesize, +    parse_count,      parse_iso8601,      read_batch_urls,      sanitize_filename, @@ -62,6 +63,7 @@ from youtube_dl.utils import (      lowercase_escape,      url_basename,      urlencode_postdata, +    update_url_query,      version_tuple,      xpath_with_ns,      xpath_element, @@ -78,6 +80,8 @@ from youtube_dl.utils import (  from youtube_dl.compat import (      compat_chr,      compat_etree_fromstring, +    compat_urlparse, +    compat_parse_qs,  ) @@ -456,6 +460,40 @@ class TestUtil(unittest.TestCase):          data = urlencode_postdata({'username': 'foo@bar.com', 'password': '1234'})          self.assertTrue(isinstance(data, bytes)) +    def test_update_url_query(self): +        def query_dict(url): +            return compat_parse_qs(compat_urlparse.urlparse(url).query) +        self.assertEqual(query_dict(update_url_query( +            'http://example.com/path', {'quality': ['HD'], 'format': ['mp4']})), +            query_dict('http://example.com/path?quality=HD&format=mp4')) + 
       self.assertEqual(query_dict(update_url_query( +            'http://example.com/path', {'system': ['LINUX', 'WINDOWS']})), +            query_dict('http://example.com/path?system=LINUX&system=WINDOWS')) +        self.assertEqual(query_dict(update_url_query( +            'http://example.com/path', {'fields': 'id,formats,subtitles'})), +            query_dict('http://example.com/path?fields=id,formats,subtitles')) +        self.assertEqual(query_dict(update_url_query( +            'http://example.com/path', {'fields': ('id,formats,subtitles', 'thumbnails')})), +            query_dict('http://example.com/path?fields=id,formats,subtitles&fields=thumbnails')) +        self.assertEqual(query_dict(update_url_query( +            'http://example.com/path?manifest=f4m', {'manifest': []})), +            query_dict('http://example.com/path')) +        self.assertEqual(query_dict(update_url_query( +            'http://example.com/path?system=LINUX&system=WINDOWS', {'system': 'LINUX'})), +            query_dict('http://example.com/path?system=LINUX')) +        self.assertEqual(query_dict(update_url_query( +            'http://example.com/path', {'fields': b'id,formats,subtitles'})), +            query_dict('http://example.com/path?fields=id,formats,subtitles')) +        self.assertEqual(query_dict(update_url_query( +            'http://example.com/path', {'width': 1080, 'height': 720})), +            query_dict('http://example.com/path?width=1080&height=720')) +        self.assertEqual(query_dict(update_url_query( +            'http://example.com/path', {'bitrate': 5020.43})), +            query_dict('http://example.com/path?bitrate=5020.43')) +        self.assertEqual(query_dict(update_url_query( +            'http://example.com/path', {'test': '第二行тест'})), +            query_dict('http://example.com/path?test=%E7%AC%AC%E4%BA%8C%E8%A1%8C%D1%82%D0%B5%D1%81%D1%82')) +      def test_dict_get(self):          FALSE_VALUES = {              'none': None, @@ -656,6 +694,15 @@ 
class TestUtil(unittest.TestCase):          self.assertEqual(parse_filesize('1.2Tb'), 1200000000000)          self.assertEqual(parse_filesize('1,24 KB'), 1240) +    def test_parse_count(self): +        self.assertEqual(parse_count(None), None) +        self.assertEqual(parse_count(''), None) +        self.assertEqual(parse_count('0'), 0) +        self.assertEqual(parse_count('1000'), 1000) +        self.assertEqual(parse_count('1.000'), 1000) +        self.assertEqual(parse_count('1.1k'), 1100) +        self.assertEqual(parse_count('1.1kk'), 1100000) +      def test_version_tuple(self):          self.assertEqual(version_tuple('1'), (1,))          self.assertEqual(version_tuple('10.23.344'), (10, 23, 344)) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 8f3a8b9e3..8c651cd52 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -24,9 +24,6 @@ import time  import tokenize  import traceback -if os.name == 'nt': -    import ctypes -  from .compat import (      compat_basestring,      compat_cookiejar, @@ -34,6 +31,7 @@ from .compat import (      compat_get_terminal_size,      compat_http_client,      compat_kwargs, +    compat_os_name,      compat_str,      compat_tokenize_tokenize,      compat_urllib_error, @@ -87,6 +85,7 @@ from .extractor import get_info_extractor, gen_extractors  from .downloader import get_suitable_downloader  from .downloader.rtmp import rtmpdump_version  from .postprocessor import ( +    FFmpegFixupM3u8PP,      FFmpegFixupM4aPP,      FFmpegFixupStretchedPP,      FFmpegMergerPP, @@ -95,6 +94,9 @@ from .postprocessor import (  )  from .version import __version__ +if compat_os_name == 'nt': +    import ctypes +  class YoutubeDL(object):      """YoutubeDL class. 
@@ -450,7 +452,7 @@ class YoutubeDL(object):      def to_console_title(self, message):          if not self.params.get('consoletitle', False):              return -        if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow(): +        if compat_os_name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():              # c_wchar_p() might not be necessary if `message` is              # already of type unicode()              ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message)) @@ -521,7 +523,7 @@ class YoutubeDL(object):          else:              if self.params.get('no_warnings'):                  return -            if not self.params.get('no_color') and self._err_file.isatty() and os.name != 'nt': +            if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt':                  _msg_header = '\033[0;33mWARNING:\033[0m'              else:                  _msg_header = 'WARNING:' @@ -533,7 +535,7 @@ class YoutubeDL(object):          Do the same as trouble, but prefixes the message with 'ERROR:', colored          in red if stderr is a tty file.          ''' -        if not self.params.get('no_color') and self._err_file.isatty() and os.name != 'nt': +        if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt':              _msg_header = '\033[0;31mERROR:\033[0m'          else:              _msg_header = 'ERROR:' @@ -566,7 +568,7 @@ class YoutubeDL(object):                  elif template_dict.get('height'):                      template_dict['resolution'] = '%sp' % template_dict['height']                  elif template_dict.get('width'): -                    template_dict['resolution'] = '?x%d' % template_dict['width'] +                    template_dict['resolution'] = '%dx?' 
% template_dict['width']              sanitize = lambda k, v: sanitize_filename(                  compat_str(v), @@ -1232,6 +1234,10 @@ class YoutubeDL(object):                  if t.get('id') is None:                      t['id'] = '%d' % i +        if self.params.get('list_thumbnails'): +            self.list_thumbnails(info_dict) +            return +          if thumbnails and 'thumbnail' not in info_dict:              info_dict['thumbnail'] = thumbnails[-1]['url'] @@ -1333,9 +1339,6 @@ class YoutubeDL(object):          if self.params.get('listformats'):              self.list_formats(info_dict)              return -        if self.params.get('list_thumbnails'): -            self.list_thumbnails(info_dict) -            return          req_format = self.params.get('format')          if req_format is None: @@ -1637,6 +1640,8 @@ class YoutubeDL(object):                  if fixup_policy is None:                      fixup_policy = 'detect_or_warn' +                INSTALL_FFMPEG_MESSAGE = 'Install ffmpeg or avconv to fix this automatically.' +                  stretched_ratio = info_dict.get('stretched_ratio')                  if stretched_ratio is not None and stretched_ratio != 1:                      if fixup_policy == 'warn': @@ -1649,15 +1654,18 @@ class YoutubeDL(object):                              info_dict['__postprocessors'].append(stretched_pp)                          else:                              self.report_warning( -                                '%s: Non-uniform pixel ratio (%s). Install ffmpeg or avconv to fix this automatically.' % ( -                                    info_dict['id'], stretched_ratio)) +                                '%s: Non-uniform pixel ratio (%s). 
%s' +                                % (info_dict['id'], stretched_ratio, INSTALL_FFMPEG_MESSAGE))                      else:                          assert fixup_policy in ('ignore', 'never') -                if info_dict.get('requested_formats') is None and info_dict.get('container') == 'm4a_dash': +                if (info_dict.get('requested_formats') is None and +                        info_dict.get('container') == 'm4a_dash'):                      if fixup_policy == 'warn': -                        self.report_warning('%s: writing DASH m4a. Only some players support this container.' % ( -                            info_dict['id'])) +                        self.report_warning( +                            '%s: writing DASH m4a. ' +                            'Only some players support this container.' +                            % info_dict['id'])                      elif fixup_policy == 'detect_or_warn':                          fixup_pp = FFmpegFixupM4aPP(self)                          if fixup_pp.available: @@ -1665,8 +1673,27 @@ class YoutubeDL(object):                              info_dict['__postprocessors'].append(fixup_pp)                          else:                              self.report_warning( -                                '%s: writing DASH m4a. Only some players support this container. Install ffmpeg or avconv to fix this automatically.' % ( -                                    info_dict['id'])) +                                '%s: writing DASH m4a. ' +                                'Only some players support this container. 
%s' +                                % (info_dict['id'], INSTALL_FFMPEG_MESSAGE)) +                    else: +                        assert fixup_policy in ('ignore', 'never') + +                if (info_dict.get('protocol') == 'm3u8_native' or +                        info_dict.get('protocol') == 'm3u8' and +                        self.params.get('hls_prefer_native')): +                    if fixup_policy == 'warn': +                        self.report_warning('%s: malformated aac bitstream.' % ( +                            info_dict['id'])) +                    elif fixup_policy == 'detect_or_warn': +                        fixup_pp = FFmpegFixupM3u8PP(self) +                        if fixup_pp.available: +                            info_dict.setdefault('__postprocessors', []) +                            info_dict['__postprocessors'].append(fixup_pp) +                        else: +                            self.report_warning( +                                '%s: malformated aac bitstream. 
%s' +                                % (info_dict['id'], INSTALL_FFMPEG_MESSAGE))                      else:                          assert fixup_policy in ('ignore', 'never') @@ -1830,7 +1857,9 @@ class YoutubeDL(object):          if fdict.get('vbr') is not None:              res += '%4dk' % fdict['vbr']          if fdict.get('fps') is not None: -            res += ', %sfps' % fdict['fps'] +            if res: +                res += ', ' +            res += '%sfps' % fdict['fps']          if fdict.get('acodec') is not None:              if res:                  res += ', ' @@ -1873,13 +1902,8 @@ class YoutubeDL(object):      def list_thumbnails(self, info_dict):          thumbnails = info_dict.get('thumbnails')          if not thumbnails: -            tn_url = info_dict.get('thumbnail') -            if tn_url: -                thumbnails = [{'id': '0', 'url': tn_url}] -            else: -                self.to_screen( -                    '[info] No thumbnails present for %s' % info_dict['id']) -                return +            self.to_screen('[info] No thumbnails present for %s' % info_dict['id']) +            return          self.to_screen(              '[info] Thumbnails for %s:' % info_dict['id']) diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 7b9afc36d..74702786a 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -331,6 +331,9 @@ def compat_ord(c):          return ord(c) +compat_os_name = os._name if os.name == 'java' else os.name + +  if sys.version_info >= (3, 0):      compat_getenv = os.getenv      compat_expanduser = os.path.expanduser @@ -351,7 +354,7 @@ else:      # The following are os.path.expanduser implementations from cpython 2.7.8 stdlib      # for different platforms with correct environment variables decoding. -    if os.name == 'posix': +    if compat_os_name == 'posix':          def compat_expanduser(path):              """Expand ~ and ~user constructions.  
If user or $HOME is unknown,              do nothing.""" @@ -375,7 +378,7 @@ else:                  userhome = pwent.pw_dir              userhome = userhome.rstrip('/')              return (userhome + path[i:]) or '/' -    elif os.name == 'nt' or os.name == 'ce': +    elif compat_os_name == 'nt' or compat_os_name == 'ce':          def compat_expanduser(path):              """Expand ~ and ~user constructs. @@ -562,6 +565,7 @@ __all__ = [      'compat_itertools_count',      'compat_kwargs',      'compat_ord', +    'compat_os_name',      'compat_parse_qs',      'compat_print',      'compat_shlex_split', diff --git a/youtube_dl/downloader/__init__.py b/youtube_dl/downloader/__init__.py index dccc59212..73b34fdae 100644 --- a/youtube_dl/downloader/__init__.py +++ b/youtube_dl/downloader/__init__.py @@ -1,14 +1,16 @@  from __future__ import unicode_literals  from .common import FileDownloader -from .external import get_external_downloader  from .f4m import F4mFD  from .hls import HlsFD -from .hls import NativeHlsFD  from .http import HttpFD -from .rtsp import RtspFD  from .rtmp import RtmpFD  from .dash import DashSegmentsFD +from .rtsp import RtspFD +from .external import ( +    get_external_downloader, +    FFmpegFD, +)  from ..utils import (      determine_protocol, @@ -16,8 +18,8 @@ from ..utils import (  PROTOCOL_MAP = {      'rtmp': RtmpFD, -    'm3u8_native': NativeHlsFD, -    'm3u8': HlsFD, +    'm3u8_native': HlsFD, +    'm3u8': FFmpegFD,      'mms': RtspFD,      'rtsp': RtspFD,      'f4m': F4mFD, @@ -30,14 +32,17 @@ def get_suitable_downloader(info_dict, params={}):      protocol = determine_protocol(info_dict)      info_dict['protocol'] = protocol +    # if (info_dict.get('start_time') or info_dict.get('end_time')) and not info_dict.get('requested_formats') and FFmpegFD.can_download(info_dict): +    #     return FFmpegFD +      external_downloader = params.get('external_downloader')      if external_downloader is not None:          ed = 
get_external_downloader(external_downloader) -        if ed.supports(info_dict): +        if ed.can_download(info_dict):              return ed      if protocol == 'm3u8' and params.get('hls_prefer_native'): -        return NativeHlsFD +        return HlsFD      return PROTOCOL_MAP.get(protocol, HttpFD) diff --git a/youtube_dl/downloader/common.py b/youtube_dl/downloader/common.py index 2d5154051..f39db58f6 100644 --- a/youtube_dl/downloader/common.py +++ b/youtube_dl/downloader/common.py @@ -5,6 +5,7 @@ import re  import sys  import time +from ..compat import compat_os_name  from ..utils import (      encodeFilename,      error_to_compat_str, @@ -219,7 +220,7 @@ class FileDownloader(object):          if self.params.get('progress_with_newline', False):              self.to_screen(fullmsg)          else: -            if os.name == 'nt': +            if compat_os_name == 'nt':                  prev_len = getattr(self, '_report_progress_prev_line_length',                                     0)                  if prev_len > len(fullmsg): diff --git a/youtube_dl/downloader/external.py b/youtube_dl/downloader/external.py index 2bc011266..30277dc20 100644 --- a/youtube_dl/downloader/external.py +++ b/youtube_dl/downloader/external.py @@ -2,8 +2,11 @@ from __future__ import unicode_literals  import os.path  import subprocess +import sys +import re  from .common import FileDownloader +from ..postprocessor.ffmpeg import FFmpegPostProcessor, EXT_TO_OUT_FORMATS  from ..utils import (      cli_option,      cli_valueless_option, @@ -11,6 +14,8 @@ from ..utils import (      cli_configuration_args,      encodeFilename,      encodeArgument, +    handle_youtubedl_headers, +    check_executable,  ) @@ -46,9 +51,17 @@ class ExternalFD(FileDownloader):          return self.params.get('external_downloader')      @classmethod +    def available(cls): +        return check_executable(cls.get_basename(), [cls.AVAILABLE_OPT]) + +    @classmethod      def supports(cls, info_dict):          
return info_dict['protocol'] in ('http', 'https', 'ftp', 'ftps') +    @classmethod +    def can_download(cls, info_dict): +        return cls.available() and cls.supports(info_dict) +      def _option(self, command_option, param):          return cli_option(self.params, command_option, param) @@ -76,6 +89,8 @@ class ExternalFD(FileDownloader):  class CurlFD(ExternalFD): +    AVAILABLE_OPT = '-V' +      def _make_cmd(self, tmpfilename, info_dict):          cmd = [self.exe, '--location', '-o', tmpfilename]          for key, val in info_dict['http_headers'].items(): @@ -89,6 +104,8 @@ class CurlFD(ExternalFD):  class AxelFD(ExternalFD): +    AVAILABLE_OPT = '-V' +      def _make_cmd(self, tmpfilename, info_dict):          cmd = [self.exe, '-o', tmpfilename]          for key, val in info_dict['http_headers'].items(): @@ -99,6 +116,8 @@ class AxelFD(ExternalFD):  class WgetFD(ExternalFD): +    AVAILABLE_OPT = '--version' +      def _make_cmd(self, tmpfilename, info_dict):          cmd = [self.exe, '-O', tmpfilename, '-nv', '--no-cookies']          for key, val in info_dict['http_headers'].items(): @@ -112,6 +131,8 @@ class WgetFD(ExternalFD):  class Aria2cFD(ExternalFD): +    AVAILABLE_OPT = '-v' +      def _make_cmd(self, tmpfilename, info_dict):          cmd = [self.exe, '-c']          cmd += self._configuration_args([ @@ -130,12 +151,112 @@ class Aria2cFD(ExternalFD):  class HttpieFD(ExternalFD): +    @classmethod +    def available(cls): +        return check_executable('http', ['--version']) +      def _make_cmd(self, tmpfilename, info_dict):          cmd = ['http', '--download', '--output', tmpfilename, info_dict['url']]          for key, val in info_dict['http_headers'].items():              cmd += ['%s:%s' % (key, val)]          return cmd + +class FFmpegFD(ExternalFD): +    @classmethod +    def supports(cls, info_dict): +        return info_dict['protocol'] in ('http', 'https', 'ftp', 'ftps', 'm3u8', 'rtsp', 'rtmp', 'mms') + +    @classmethod +    def 
available(cls): +        return FFmpegPostProcessor().available + +    def _call_downloader(self, tmpfilename, info_dict): +        url = info_dict['url'] +        ffpp = FFmpegPostProcessor(downloader=self) +        if not ffpp.available: +            self.report_error('m3u8 download detected but ffmpeg or avconv could not be found. Please install one.') +            return False +        ffpp.check_version() + +        args = [ffpp.executable, '-y'] + +        args += self._configuration_args() + +        # start_time = info_dict.get('start_time') or 0 +        # if start_time: +        #     args += ['-ss', compat_str(start_time)] +        # end_time = info_dict.get('end_time') +        # if end_time: +        #     args += ['-t', compat_str(end_time - start_time)] + +        if info_dict['http_headers'] and re.match(r'^https?://', url): +            # Trailing \r\n after each HTTP header is important to prevent warning from ffmpeg/avconv: +            # [http @ 00000000003d2fa0] No trailing CRLF found in HTTP header. 
+            headers = handle_youtubedl_headers(info_dict['http_headers']) +            args += [ +                '-headers', +                ''.join('%s: %s\r\n' % (key, val) for key, val in headers.items())] + +        protocol = info_dict.get('protocol') + +        if protocol == 'rtmp': +            player_url = info_dict.get('player_url') +            page_url = info_dict.get('page_url') +            app = info_dict.get('app') +            play_path = info_dict.get('play_path') +            tc_url = info_dict.get('tc_url') +            flash_version = info_dict.get('flash_version') +            live = info_dict.get('rtmp_live', False) +            if player_url is not None: +                args += ['-rtmp_swfverify', player_url] +            if page_url is not None: +                args += ['-rtmp_pageurl', page_url] +            if app is not None: +                args += ['-rtmp_app', app] +            if play_path is not None: +                args += ['-rtmp_playpath', play_path] +            if tc_url is not None: +                args += ['-rtmp_tcurl', tc_url] +            if flash_version is not None: +                args += ['-rtmp_flashver', flash_version] +            if live: +                args += ['-rtmp_live', 'live'] + +        args += ['-i', url, '-c', 'copy'] +        if protocol == 'm3u8': +            if self.params.get('hls_use_mpegts', False): +                args += ['-f', 'mpegts'] +            else: +                args += ['-f', 'mp4', '-bsf:a', 'aac_adtstoasc'] +        elif protocol == 'rtmp': +            args += ['-f', 'flv'] +        else: +            args += ['-f', EXT_TO_OUT_FORMATS.get(info_dict['ext'], info_dict['ext'])] + +        args = [encodeArgument(opt) for opt in args] +        args.append(encodeFilename(ffpp._ffmpeg_filename_argument(tmpfilename), True)) + +        self._debug_cmd(args) + +        proc = subprocess.Popen(args, stdin=subprocess.PIPE) +        try: +            retval = proc.wait() +        
except KeyboardInterrupt: +            # subprocces.run would send the SIGKILL signal to ffmpeg and the +            # mp4 file couldn't be played, but if we ask ffmpeg to quit it +            # produces a file that is playable (this is mostly useful for live +            # streams). Note that Windows is not affected and produces playable +            # files (see https://github.com/rg3/youtube-dl/issues/8300). +            if sys.platform != 'win32': +                proc.communicate(b'q') +            raise +        return retval + + +class AVconvFD(FFmpegFD): +    pass +  _BY_NAME = dict(      (klass.get_basename(), klass)      for name, klass in globals().items() diff --git a/youtube_dl/downloader/fragment.py b/youtube_dl/downloader/fragment.py index 5bc99492b..a5bae9669 100644 --- a/youtube_dl/downloader/fragment.py +++ b/youtube_dl/downloader/fragment.py @@ -99,7 +99,8 @@ class FragmentFD(FileDownloader):                      state['eta'] = self.calc_eta(                          start, time_now, estimated_size,                          state['downloaded_bytes']) -                state['speed'] = s.get('speed') +                state['speed'] = s.get('speed') or ctx.get('speed') +                ctx['speed'] = state['speed']                  ctx['prev_frag_downloaded_bytes'] = frag_downloaded_bytes              self._hook_progress(state) diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py index 2a775bf00..a01dac031 100644 --- a/youtube_dl/downloader/hls.py +++ b/youtube_dl/downloader/hls.py @@ -1,87 +1,19 @@  from __future__ import unicode_literals -import os +import os.path  import re -import subprocess -import sys -from .common import FileDownloader  from .fragment import FragmentFD  from ..compat import compat_urlparse -from ..postprocessor.ffmpeg import FFmpegPostProcessor  from ..utils import ( -    encodeArgument,      encodeFilename,      sanitize_open, -    handle_youtubedl_headers,  ) -class HlsFD(FileDownloader): -    def 
real_download(self, filename, info_dict): -        url = info_dict['url'] -        self.report_destination(filename) -        tmpfilename = self.temp_name(filename) - -        ffpp = FFmpegPostProcessor(downloader=self) -        if not ffpp.available: -            self.report_error('m3u8 download detected but ffmpeg or avconv could not be found. Please install one.') -            return False -        ffpp.check_version() - -        args = [ffpp.executable, '-y'] - -        if info_dict['http_headers'] and re.match(r'^https?://', url): -            # Trailing \r\n after each HTTP header is important to prevent warning from ffmpeg/avconv: -            # [http @ 00000000003d2fa0] No trailing CRLF found in HTTP header. -            headers = handle_youtubedl_headers(info_dict['http_headers']) -            args += [ -                '-headers', -                ''.join('%s: %s\r\n' % (key, val) for key, val in headers.items())] - -        args += ['-i', url, '-c', 'copy'] -        if self.params.get('hls_use_mpegts', False): -            args += ['-f', 'mpegts'] -        else: -            args += ['-f', 'mp4', '-bsf:a', 'aac_adtstoasc'] - -        args = [encodeArgument(opt) for opt in args] -        args.append(encodeFilename(ffpp._ffmpeg_filename_argument(tmpfilename), True)) - -        self._debug_cmd(args) - -        proc = subprocess.Popen(args, stdin=subprocess.PIPE) -        try: -            retval = proc.wait() -        except KeyboardInterrupt: -            # subprocces.run would send the SIGKILL signal to ffmpeg and the -            # mp4 file couldn't be played, but if we ask ffmpeg to quit it -            # produces a file that is playable (this is mostly useful for live -            # streams). Note that Windows is not affected and produces playable -            # files (see https://github.com/rg3/youtube-dl/issues/8300). 
-            if sys.platform != 'win32': -                proc.communicate(b'q') -            raise -        if retval == 0: -            fsize = os.path.getsize(encodeFilename(tmpfilename)) -            self.to_screen('\r[%s] %s bytes' % (args[0], fsize)) -            self.try_rename(tmpfilename, filename) -            self._hook_progress({ -                'downloaded_bytes': fsize, -                'total_bytes': fsize, -                'filename': filename, -                'status': 'finished', -            }) -            return True -        else: -            self.to_stderr('\n') -            self.report_error('%s exited with code %d' % (ffpp.basename, retval)) -            return False - - -class NativeHlsFD(FragmentFD): -    """ A more limited implementation that does not require ffmpeg """ +class HlsFD(FragmentFD): +    """ A limited implementation that does not require ffmpeg """      FD_NAME = 'hlsnative' diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 037654a23..9502d07a4 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -23,7 +23,10 @@ from .alphaporno import AlphaPornoIE  from .animeondemand import AnimeOnDemandIE  from .anitube import AnitubeIE  from .anysex import AnySexIE -from .aol import AolIE +from .aol import ( +    AolIE, +    AolFeaturesIE, +)  from .allocine import AllocineIE  from .aparat import AparatIE  from .appleconnect import AppleConnectIE @@ -51,6 +54,7 @@ from .arte import (  from .atresplayer import AtresPlayerIE  from .atttechchannel import ATTTechChannelIE  from .audimedia import AudiMediaIE +from .audioboom import AudioBoomIE  from .audiomack import AudiomackIE, AudiomackAlbumIE  from .azubu import AzubuIE, AzubuLiveIE  from .baidu import BaiduVideoIE @@ -185,6 +189,10 @@ from .dumpert import DumpertIE  from .defense import DefenseGouvFrIE  from .discovery import DiscoveryIE  from .dropbox import DropboxIE +from .dw import ( +    DWIE, +    
DWArticleIE, +)  from .eagleplatform import EaglePlatformIE  from .ebaumsworld import EbaumsWorldIE  from .echomsk import EchoMskIE @@ -209,10 +217,7 @@ from .everyonesmixtape import EveryonesMixtapeIE  from .exfm import ExfmIE  from .expotv import ExpoTVIE  from .extremetube import ExtremeTubeIE -from .facebook import ( -    FacebookIE, -    FacebookPostIE, -) +from .facebook import FacebookIE  from .faz import FazIE  from .fc2 import FC2IE  from .fczenit import FczenitIE @@ -340,6 +345,7 @@ from .konserthusetplay import KonserthusetPlayIE  from .kontrtube import KontrTubeIE  from .krasview import KrasViewIE  from .ku6 import Ku6IE +from .kusi import KUSIIE  from .kuwo import (      KuwoIE,      KuwoAlbumIE, @@ -383,6 +389,7 @@ from .lynda import (  from .m6 import M6IE  from .macgamestore import MacGameStoreIE  from .mailru import MailRuIE +from .makerschannel import MakersChannelIE  from .makertv import MakerTVIE  from .malemotion import MalemotionIE  from .matchtv import MatchTVIE @@ -392,6 +399,7 @@ from .metacritic import MetacriticIE  from .mgoon import MgoonIE  from .minhateca import MinhatecaIE  from .ministrygrid import MinistryGridIE +from .minoto import MinotoIE  from .miomio import MioMioIE  from .mit import TechTVMITIE, MITIE, OCWMITIE  from .mitele import MiTeleIE @@ -590,6 +598,7 @@ from .regiotv import RegioTVIE  from .restudy import RestudyIE  from .reverbnation import ReverbNationIE  from .revision3 import Revision3IE +from .rice import RICEIE  from .ringtv import RingTVIE  from .ro220 import Ro220IE  from .rottentomatoes import RottenTomatoesIE @@ -728,6 +737,7 @@ from .theplatform import (      ThePlatformFeedIE,  )  from .thesixtyone import TheSixtyOneIE +from .thestar import TheStarIE  from .thisamericanlife import ThisAmericanLifeIE  from .thisav import ThisAVIE  from .tinypic import TinyPicIE @@ -774,6 +784,7 @@ from .tv2 import (      TV2IE,      TV2ArticleIE,  ) +from .tv3 import TV3IE  from .tv4 import TV4IE  from .tvc import (      
TVCIE, @@ -813,6 +824,7 @@ from .udn import UDNEmbedIE  from .digiteka import DigitekaIE  from .unistra import UnistraIE  from .urort import UrortIE +from .usatoday import USATodayIE  from .ustream import UstreamIE, UstreamChannelIE  from .ustudio import UstudioIE  from .varzesh3 import Varzesh3IE @@ -828,7 +840,10 @@ from .vgtv import (      VGTVIE,  )  from .vh1 import VH1IE -from .vice import ViceIE +from .vice import ( +    ViceIE, +    ViceShowIE, +)  from .viddler import ViddlerIE  from .videodetective import VideoDetectiveIE  from .videofyme import VideofyMeIE @@ -855,6 +870,7 @@ from .vimeo import (      VimeoChannelIE,      VimeoGroupsIE,      VimeoLikesIE, +    VimeoOndemandIE,      VimeoReviewIE,      VimeoUserIE,      VimeoWatchLaterIE, diff --git a/youtube_dl/extractor/aljazeera.py b/youtube_dl/extractor/aljazeera.py index 5b2c0dc9a..cddcaa489 100644 --- a/youtube_dl/extractor/aljazeera.py +++ b/youtube_dl/extractor/aljazeera.py @@ -13,24 +13,18 @@ class AlJazeeraIE(InfoExtractor):              'ext': 'mp4',              'title': 'The Slum - Episode 1: Deliverance',              'description': 'As a birth attendant advocating for family planning, Remy is on the frontline of Tondo\'s battle with overcrowding.', -            'uploader': 'Al Jazeera English', +            'uploader_id': '665003303001', +            'timestamp': 1411116829, +            'upload_date': '20140919',          }, -        'add_ie': ['BrightcoveLegacy'], +        'add_ie': ['BrightcoveNew'],          'skip': 'Not accessible from Travis CI server',      } +    BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/665003303001/default_default/index.html?videoId=%s'      def _real_extract(self, url):          program_name = self._match_id(url)          webpage = self._download_webpage(url, program_name)          brightcove_id = self._search_regex(              r'RenderPagesVideo\(\'(.+?)\'', webpage, 'brightcove id') - -        return { -            '_type': 'url', -            
'url': ( -                'brightcove:' -                'playerKey=AQ~~%2CAAAAmtVJIFk~%2CTVGOQ5ZTwJbeMWnq5d_H4MOM57xfzApc' -                '&%40videoPlayer={0}'.format(brightcove_id) -            ), -            'ie_key': 'BrightcoveLegacy', -        } +        return self.url_result(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', brightcove_id) diff --git a/youtube_dl/extractor/aol.py b/youtube_dl/extractor/aol.py index b51eafc45..b761b2cc4 100644 --- a/youtube_dl/extractor/aol.py +++ b/youtube_dl/extractor/aol.py @@ -1,24 +1,11 @@  from __future__ import unicode_literals -import re -  from .common import InfoExtractor  class AolIE(InfoExtractor):      IE_NAME = 'on.aol.com' -    _VALID_URL = r'''(?x) -        (?: -            aol-video:| -            http://on\.aol\.com/ -            (?: -                video/.*-| -                playlist/(?P<playlist_display_id>[^/?#]+?)-(?P<playlist_id>[0-9]+)[?#].*_videoid= -            ) -        ) -        (?P<id>[0-9]+) -        (?:$|\?) -    ''' +    _VALID_URL = r'(?:aol-video:|http://on\.aol\.com/video/.*-)(?P<id>[0-9]+)(?:$|\?)'      _TESTS = [{          'url': 'http://on.aol.com/video/u-s--official-warns-of-largest-ever-irs-phone-scam-518167793?icid=OnHomepageC2Wide_MustSee_Img', @@ -29,42 +16,31 @@ class AolIE(InfoExtractor):              'title': 'U.S. 
Official Warns Of \'Largest Ever\' IRS Phone Scam',          },          'add_ie': ['FiveMin'], -    }, { -        'url': 'http://on.aol.com/playlist/brace-yourself---todays-weirdest-news-152147?icid=OnHomepageC4_Omg_Img#_videoid=518184316', -        'info_dict': { -            'id': '152147', -            'title': 'Brace Yourself - Today\'s Weirdest News', -        }, -        'playlist_mincount': 10,      }]      def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        video_id = mobj.group('id') -        playlist_id = mobj.group('playlist_id') -        if not playlist_id or self._downloader.params.get('noplaylist'): -            return self.url_result('5min:%s' % video_id) +        video_id = self._match_id(url) +        return self.url_result('5min:%s' % video_id) -        self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id)) -        webpage = self._download_webpage(url, playlist_id) -        title = self._html_search_regex( -            r'<h1 class="video-title[^"]*">(.+?)</h1>', webpage, 'title') -        playlist_html = self._search_regex( -            r"(?s)<ul\s+class='video-related[^']*'>(.*?)</ul>", webpage, -            'playlist HTML') -        entries = [{ -            '_type': 'url', -            'url': 'aol-video:%s' % m.group('id'), -            'ie_key': 'Aol', -        } for m in re.finditer( -            r"<a\s+href='.*videoid=(?P<id>[0-9]+)'\s+class='video-thumb'>", -            playlist_html)] +class AolFeaturesIE(InfoExtractor): +    IE_NAME = 'features.aol.com' +    _VALID_URL = r'http://features\.aol\.com/video/(?P<id>[^/?#]+)' -        return { -            '_type': 'playlist', -            'id': playlist_id, -            'display_id': mobj.group('playlist_display_id'), -            'title': title, -            'entries': entries, -        } +    _TESTS = [{ +        'url': 
'http://features.aol.com/video/behind-secret-second-careers-late-night-talk-show-hosts', +        'md5': '7db483bb0c09c85e241f84a34238cc75', +        'info_dict': { +            'id': '519507715', +            'ext': 'mp4', +            'title': 'What To Watch - February 17, 2016', +        }, +        'add_ie': ['FiveMin'], +    }] + +    def _real_extract(self, url): +        display_id = self._match_id(url) +        webpage = self._download_webpage(url, display_id) +        return self.url_result(self._search_regex( +            r'<script type="text/javascript" src="(https?://[^/]*?5min\.com/Scripts/PlayerSeed\.js[^"]+)"', +            webpage, '5min embed url'), 'FiveMin') diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index efde7e207..3e119e21b 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -121,15 +121,18 @@ class ArteTVPlus7IE(InfoExtractor):                  json_url = compat_parse_qs(                      compat_urllib_parse_urlparse(iframe_url).query)['json_url'][0]          if json_url: -            return self._extract_from_json_url(json_url, video_id, lang) -        # Differend kind of embed URL (e.g. +            title = self._search_regex( +                r'<h3[^>]+title=(["\'])(?P<title>.+?)\1', +                webpage, 'title', default=None, group='title') +            return self._extract_from_json_url(json_url, video_id, lang, title=title) +        # Different kind of embed URL (e.g.          
# http://www.arte.tv/magazine/trepalium/fr/episode-0406-replay-trepalium)          embed_url = self._search_regex(              r'<iframe[^>]+src=(["\'])(?P<url>.+?)\1',              webpage, 'embed url', group='url')          return self.url_result(embed_url) -    def _extract_from_json_url(self, json_url, video_id, lang): +    def _extract_from_json_url(self, json_url, video_id, lang, title=None):          info = self._download_json(json_url, video_id)          player_info = info['videoJsonPlayer'] @@ -137,7 +140,7 @@ class ArteTVPlus7IE(InfoExtractor):          if not upload_date_str:              upload_date_str = (player_info.get('VRA') or player_info.get('VDA') or '').split(' ')[0] -        title = player_info['VTI'].strip() +        title = (player_info.get('VTI') or title or player_info['VID']).strip()          subtitle = player_info.get('VSU', '').strip()          if subtitle:              title += ' - %s' % subtitle diff --git a/youtube_dl/extractor/audimedia.py b/youtube_dl/extractor/audimedia.py index 3b2effa15..aa6925623 100644 --- a/youtube_dl/extractor/audimedia.py +++ b/youtube_dl/extractor/audimedia.py @@ -10,9 +10,9 @@ from ..utils import (  class AudiMediaIE(InfoExtractor): -    _VALID_URL = r'https?://(?:www\.)?audimedia\.tv/(?:en|de)/vid/(?P<id>[^/?#]+)' +    _VALID_URL = r'https?://(?:www\.)?audi-mediacenter\.com/(?:en|de)/audimediatv/(?P<id>[^/?#]+)'      _TEST = { -        'url': 'https://audimedia.tv/en/vid/60-seconds-of-audi-sport-104-2015-wec-bahrain-rookie-test', +        'url': 'https://www.audi-mediacenter.com/en/audimediatv/60-seconds-of-audi-sport-104-2015-wec-bahrain-rookie-test-1467',          'md5': '79a8b71c46d49042609795ab59779b66',          'info_dict': {              'id': '1565', @@ -32,7 +32,10 @@ class AudiMediaIE(InfoExtractor):          display_id = self._match_id(url)          webpage = self._download_webpage(url, display_id) -        raw_payload = self._search_regex(r'<script[^>]+class="amtv-embed"[^>]+id="([^"]+)"', 
webpage, 'raw payload') +        raw_payload = self._search_regex([ +            r'class="amtv-embed"[^>]+id="([^"]+)"', +            r'class=\\"amtv-embed\\"[^>]+id=\\"([^"]+)\\"', +        ], webpage, 'raw payload')          _, stage_mode, video_id, lang = raw_payload.split('-')          # TODO: handle s and e stage_mode (live streams and ended live streams) @@ -59,13 +62,19 @@ class AudiMediaIE(InfoExtractor):                  video_version_url = video_version.get('download_url') or video_version.get('stream_url')                  if not video_version_url:                      continue -                formats.append({ +                f = {                      'url': video_version_url,                      'width': int_or_none(video_version.get('width')),                      'height': int_or_none(video_version.get('height')),                      'abr': int_or_none(video_version.get('audio_bitrate')),                      'vbr': int_or_none(video_version.get('video_bitrate')), -                }) +                } +                bitrate = self._search_regex(r'(\d+)k', video_version_url, 'bitrate', default=None) +                if bitrate: +                    f.update({ +                        'format_id': 'http-%s' % bitrate, +                    }) +                formats.append(f)              self._sort_formats(formats)              return { diff --git a/youtube_dl/extractor/audioboom.py b/youtube_dl/extractor/audioboom.py new file mode 100644 index 000000000..2ec2d7092 --- /dev/null +++ b/youtube_dl/extractor/audioboom.py @@ -0,0 +1,66 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import float_or_none + + +class AudioBoomIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?audioboom\.com/boos/(?P<id>[0-9]+)' +    _TEST = { +        'url': 'https://audioboom.com/boos/4279833-3-09-2016-czaban-hour-3?t=0', +        'md5': '63a8d73a055c6ed0f1e51921a10a5a76', +        'info_dict': { 
+            'id': '4279833', +            'ext': 'mp3', +            'title': '3/09/2016 Czaban Hour 3', +            'description': 'Guest:   Nate Davis - NFL free agency,   Guest:   Stan Gans', +            'duration': 2245.72, +            'uploader': 'Steve Czaban', +            'uploader_url': 're:https?://(?:www\.)?audioboom\.com/channel/steveczabanyahoosportsradio', +        } +    } + +    def _real_extract(self, url): +        video_id = self._match_id(url) + +        webpage = self._download_webpage(url, video_id) + +        clip = None + +        clip_store = self._parse_json( +            self._search_regex( +                r'data-new-clip-store=(["\'])(?P<json>{.*?"clipId"\s*:\s*%s.*?})\1' % video_id, +                webpage, 'clip store', default='{}', group='json'), +            video_id, fatal=False) +        if clip_store: +            clips = clip_store.get('clips') +            if clips and isinstance(clips, list) and isinstance(clips[0], dict): +                clip = clips[0] + +        def from_clip(field): +            if clip: +                return clip.get(field)  # must return, otherwise the clip store data is never used + +        audio_url = from_clip('clipURLPriorToLoading') or self._og_search_property( +            'audio', webpage, 'audio url') +        title = from_clip('title') or self._og_search_title(webpage) +        description = from_clip('description') or self._og_search_description(webpage) + +        duration = float_or_none(from_clip('duration') or self._html_search_meta( +            'weibo:audio:duration', webpage)) + +        uploader = from_clip('author') or self._og_search_property( +            'audio:artist', webpage, 'uploader', fatal=False) +        uploader_url = from_clip('author_url') or self._html_search_meta( +            'audioboo:channel', webpage, 'uploader url') + +        return { +            'id': video_id, +            'url': audio_url, +            'title': title, +            'description': description, +            'duration': duration, +            
'uploader': uploader, +            'uploader_url': uploader_url, +        } diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 9d0dfb961..e62b3860e 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -10,7 +10,6 @@ from ..utils import (      int_or_none,      parse_duration,      parse_iso8601, -    remove_end,      unescapeHTML,  )  from ..compat import ( @@ -561,7 +560,7 @@ class BBCIE(BBCCoUkIE):          'url': 'http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460',          'info_dict': {              'id': '3662a707-0af9-3149-963f-47bea720b460', -            'title': 'BBC Blogs - Adam Curtis - BUGGER', +            'title': 'BUGGER',          },          'playlist_count': 18,      }, { @@ -670,10 +669,18 @@ class BBCIE(BBCCoUkIE):          'url': 'http://www.bbc.com/sport/0/football/34475836',          'info_dict': {              'id': '34475836', -            'title': 'What Liverpool can expect from Klopp', +            'title': 'Jurgen Klopp: Furious football from a witty and winning coach',          },          'playlist_count': 3,      }, { +        # school report article with single video +        'url': 'http://www.bbc.co.uk/schoolreport/35744779', +        'info_dict': { +            'id': '35744779', +            'title': 'School which breaks down barriers in Jerusalem', +        }, +        'playlist_count': 1, +    }, {          # single video with playlist URL from weather section          'url': 'http://www.bbc.com/weather/features/33601775',          'only_matching': True, @@ -735,8 +742,17 @@ class BBCIE(BBCCoUkIE):          json_ld_info = self._search_json_ld(webpage, playlist_id, default=None)          timestamp = json_ld_info.get('timestamp') +          playlist_title = json_ld_info.get('title') -        playlist_description = json_ld_info.get('description') +        if not playlist_title: +            playlist_title = self._og_search_title( +                
webpage, default=None) or self._html_search_regex( +                r'<title>(.+?)</title>', webpage, 'playlist title', default=None) +            if playlist_title: +                playlist_title = re.sub(r'(.+)\s*-\s*BBC.*?$', r'\1', playlist_title).strip() + +        playlist_description = json_ld_info.get( +            'description') or self._og_search_description(webpage, default=None)          if not timestamp:              timestamp = parse_iso8601(self._search_regex( @@ -797,8 +813,6 @@ class BBCIE(BBCCoUkIE):                                  playlist.get('progressiveDownloadUrl'), playlist_id, timestamp))          if entries: -            playlist_title = playlist_title or remove_end(self._og_search_title(webpage), ' - BBC News') -            playlist_description = playlist_description or self._og_search_description(webpage, default=None)              return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)          # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret) @@ -829,10 +843,6 @@ class BBCIE(BBCCoUkIE):                  'subtitles': subtitles,              } -        playlist_title = self._html_search_regex( -            r'<title>(.*?)(?:\s*-\s*BBC [^ ]+)?</title>', webpage, 'playlist title') -        playlist_description = self._og_search_description(webpage, default=None) -          def extract_all(pattern):              return list(filter(None, map(                  lambda s: self._parse_json(s, playlist_id, fatal=False), diff --git a/youtube_dl/extractor/bleacherreport.py b/youtube_dl/extractor/bleacherreport.py index 38bda3af5..7a8e1f60b 100644 --- a/youtube_dl/extractor/bleacherreport.py +++ b/youtube_dl/extractor/bleacherreport.py @@ -28,10 +28,10 @@ class BleacherReportIE(InfoExtractor):          'add_ie': ['Ooyala'],      }, {          'url': 'http://bleacherreport.com/articles/2586817-aussie-golfers-get-fright-of-their-lives-after-being-chased-by-angry-kangaroo', -   
     'md5': 'af5f90dc9c7ba1c19d0a3eac806bbf50', +        'md5': '6a5cd403418c7b01719248ca97fb0692',          'info_dict': {              'id': '2586817', -            'ext': 'mp4', +            'ext': 'webm',              'title': 'Aussie Golfers Get Fright of Their Lives After Being Chased by Angry Kangaroo',              'timestamp': 1446839961,              'uploader': 'Sean Fay', @@ -93,10 +93,14 @@ class BleacherReportCMSIE(AMPIE):          'md5': '8c2c12e3af7805152675446c905d159b',          'info_dict': {              'id': '8fd44c2f-3dc5-4821-9118-2c825a98c0e1', -            'ext': 'flv', +            'ext': 'mp4',              'title': 'Cena vs. Rollins Would Expose the Heavyweight Division',              'description': 'md5:984afb4ade2f9c0db35f3267ed88b36e',          }, +        'params': { +            # m3u8 download +            'skip_download': True, +        },      }]      def _real_extract(self, url): diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index c947337f9..3ab383461 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -13,6 +13,7 @@ from ..compat import (      compat_urllib_parse_urlparse,      compat_urlparse,      compat_xml_parse_error, +    compat_HTTPError,  )  from ..utils import (      determine_ext, @@ -23,16 +24,16 @@ from ..utils import (      js_to_json,      int_or_none,      parse_iso8601, -    sanitized_Request,      unescapeHTML,      unsmuggle_url, +    update_url_query,  )  class BrightcoveLegacyIE(InfoExtractor):      IE_NAME = 'brightcove:legacy'      _VALID_URL = r'(?:https?://.*brightcove\.com/(services|viewer).*?\?|brightcove:)(?P<query>.*)' -    _FEDERATED_URL_TEMPLATE = 'http://c.brightcove.com/services/viewer/htmlFederated?%s' +    _FEDERATED_URL = 'http://c.brightcove.com/services/viewer/htmlFederated'      _TESTS = [          { @@ -155,8 +156,8 @@ class BrightcoveLegacyIE(InfoExtractor):          # Not all pages define this value          
if playerKey is not None:              params['playerKey'] = playerKey -        # The three fields hold the id of the video -        videoPlayer = find_param('@videoPlayer') or find_param('videoId') or find_param('videoID') +        # These fields hold the id of the video +        videoPlayer = find_param('@videoPlayer') or find_param('videoId') or find_param('videoID') or find_param('@videoList')          if videoPlayer is not None:              params['@videoPlayer'] = videoPlayer          linkBase = find_param('linkBaseURL') @@ -184,8 +185,7 @@ class BrightcoveLegacyIE(InfoExtractor):      @classmethod      def _make_brightcove_url(cls, params): -        data = compat_urllib_parse.urlencode(params) -        return cls._FEDERATED_URL_TEMPLATE % data +        return update_url_query(cls._FEDERATED_URL, params)      @classmethod      def _extract_brightcove_url(cls, webpage): @@ -239,7 +239,7 @@ class BrightcoveLegacyIE(InfoExtractor):              # We set the original url as the default 'Referer' header              referer = smuggled_data.get('Referer', url)              return self._get_video_info( -                videoPlayer[0], query_str, query, referer=referer) +                videoPlayer[0], query, referer=referer)          elif 'playerKey' in query:              player_key = query['playerKey']              return self._get_playlist_info(player_key[0]) @@ -248,15 +248,14 @@ class BrightcoveLegacyIE(InfoExtractor):                  'Cannot find playerKey= variable. 
Did you forget quotes in a shell invocation?',                  expected=True) -    def _get_video_info(self, video_id, query_str, query, referer=None): -        request_url = self._FEDERATED_URL_TEMPLATE % query_str -        req = sanitized_Request(request_url) +    def _get_video_info(self, video_id, query, referer=None): +        headers = {}          linkBase = query.get('linkBaseURL')          if linkBase is not None:              referer = linkBase[0]          if referer is not None: -            req.add_header('Referer', referer) -        webpage = self._download_webpage(req, video_id) +            headers['Referer'] = referer +        webpage = self._download_webpage(self._FEDERATED_URL, video_id, headers=headers, query=query)          error_msg = self._html_search_regex(              r"<h1>We're sorry.</h1>([\s\n]*<p>.*?</p>)+", webpage, @@ -355,7 +354,7 @@ class BrightcoveLegacyIE(InfoExtractor):  class BrightcoveNewIE(InfoExtractor):      IE_NAME = 'brightcove:new' -    _VALID_URL = r'https?://players\.brightcove\.net/(?P<account_id>\d+)/(?P<player_id>[^/]+)_(?P<embed>[^/]+)/index\.html\?.*videoId=(?P<video_id>(?:ref:)?\d+)' +    _VALID_URL = r'https?://players\.brightcove\.net/(?P<account_id>\d+)/(?P<player_id>[^/]+)_(?P<embed>[^/]+)/index\.html\?.*videoId=(?P<video_id>\d+|ref:[^&]+)'      _TESTS = [{          'url': 'http://players.brightcove.net/929656772001/e41d32dc-ec74-459e-a845-6c69f7b724ea_default/index.html?videoId=4463358922001',          'md5': 'c8100925723840d4b0d243f7025703be', @@ -391,6 +390,10 @@ class BrightcoveNewIE(InfoExtractor):          # ref: prefixed video id          'url': 'http://players.brightcove.net/3910869709001/21519b5c-4b3b-4363-accb-bdc8f358f823_default/index.html?videoId=ref:7069442',          'only_matching': True, +    }, { +        # non numeric ref: prefixed video id +        'url': 'http://players.brightcove.net/710858724001/default_default/index.html?videoId=ref:event-stream-356', +        'only_matching': True,    
  }]      @staticmethod @@ -410,8 +413,8 @@ class BrightcoveNewIE(InfoExtractor):          # Look for iframe embeds [1]          for _, url in re.findall( -                r'<iframe[^>]+src=(["\'])((?:https?:)//players\.brightcove\.net/\d+/[^/]+/index\.html.+?)\1', webpage): -            entries.append(url) +                r'<iframe[^>]+src=(["\'])((?:https?:)?//players\.brightcove\.net/\d+/[^/]+/index\.html.+?)\1', webpage): +            entries.append(url if url.startswith('http') else 'http:' + url)          # Look for embed_in_page embeds [2]          for video_id, account_id, player_id, embed in re.findall( @@ -420,11 +423,11 @@ class BrightcoveNewIE(InfoExtractor):                  # According to [4] data-video-id may be prefixed with ref:                  r'''(?sx)                      <video[^>]+ -                        data-video-id=["\']((?:ref:)?\d+)["\'][^>]*>.*? +                        data-video-id=["\'](\d+|ref:[^"\']+)["\'][^>]*>.*?                      </video>.*?                      <script[^>]+                          src=["\'](?:https?:)?//players\.brightcove\.net/ -                        (\d+)/([\da-f-]+)_([^/]+)/index\.min\.js +                        (\d+)/([\da-f-]+)_([^/]+)/index(?:\.min)?\.js                  ''', webpage):              entries.append(                  'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s' @@ -454,24 +457,33 @@ class BrightcoveNewIE(InfoExtractor):                  r'policyKey\s*:\s*(["\'])(?P<pk>.+?)\1',                  webpage, 'policy key', group='pk') -        req = sanitized_Request( -            'https://edge.api.brightcove.com/playback/v1/accounts/%s/videos/%s' -            % (account_id, video_id), -            headers={'Accept': 'application/json;pk=%s' % policy_key}) -        json_data = self._download_json(req, video_id) +        api_url = 'https://edge.api.brightcove.com/playback/v1/accounts/%s/videos/%s' % (account_id, video_id) +        try: +            json_data = 
self._download_json(api_url, video_id, headers={ +                'Accept': 'application/json;pk=%s' % policy_key +            }) +        except ExtractorError as e: +            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: +                json_data = self._parse_json(e.cause.read().decode(), video_id) +                raise ExtractorError(json_data[0]['message'], expected=True) +            raise          title = json_data['name']          formats = []          for source in json_data.get('sources', []): +            container = source.get('container')              source_type = source.get('type')              src = source.get('src') -            if source_type == 'application/x-mpegURL': +            if source_type == 'application/x-mpegURL' or container == 'M2TS':                  if not src:                      continue                  formats.extend(self._extract_m3u8_formats( -                    src, video_id, 'mp4', entry_protocol='m3u8_native', -                    m3u8_id='hls', fatal=False)) +                    src, video_id, 'mp4', m3u8_id='hls', fatal=False)) +            elif source_type == 'application/dash+xml': +                if not src: +                    continue +                formats.extend(self._extract_mpd_formats(src, video_id, 'dash', fatal=False))              else:                  streaming_src = source.get('streaming_src')                  stream_name, app_name = source.get('stream_name'), source.get('app_name') @@ -479,15 +491,23 @@ class BrightcoveNewIE(InfoExtractor):                      continue                  tbr = float_or_none(source.get('avg_bitrate'), 1000)                  height = int_or_none(source.get('height')) +                width = int_or_none(source.get('width'))                  f = {                      'tbr': tbr, -                    'width': int_or_none(source.get('width')), -                    'height': height,                      'filesize': int_or_none(source.get('size')), 
-                    'container': source.get('container'), -                    'vcodec': source.get('codec'), -                    'ext': source.get('container').lower(), +                    'container': container, +                    'ext': container.lower(),                  } +                if width == 0 and height == 0: +                    f.update({ +                        'vcodec': 'none', +                    }) +                else: +                    f.update({ +                        'width': width, +                        'height': height, +                        'vcodec': source.get('codec'), +                    })                  def build_format_id(kind):                      format_id = kind diff --git a/youtube_dl/extractor/cinemassacre.py b/youtube_dl/extractor/cinemassacre.py index 6d9cd8abd..042c4f2f1 100644 --- a/youtube_dl/extractor/cinemassacre.py +++ b/youtube_dl/extractor/cinemassacre.py @@ -21,6 +21,10 @@ class CinemassacreIE(InfoExtractor):                  'title': '“Angry Video Game Nerd: The Movie” – Trailer',                  'description': 'md5:fb87405fcb42a331742a0dce2708560b',              }, +            'params': { +                # m3u8 download +                'skip_download': True, +            },          },          {              'url': 'http://cinemassacre.com/2013/10/02/the-mummys-hand-1940', @@ -31,14 +35,18 @@ class CinemassacreIE(InfoExtractor):                  'upload_date': '20131002',                  'title': 'The Mummy’s Hand (1940)',              }, +            'params': { +                # m3u8 download +                'skip_download': True, +            },          },          {              # Youtube embedded video              'url': 'http://cinemassacre.com/2006/12/07/chronologically-confused-about-bad-movie-and-video-game-sequel-titles/', -            'md5': 'df4cf8a1dcedaec79a73d96d83b99023', +            'md5': 'ec9838a5520ef5409b3e4e42fcb0a3b9',              'info_dict': {             
     'id': 'OEVzPCY2T-g', -                'ext': 'mp4', +                'ext': 'webm',                  'title': 'AVGN: Chronologically Confused about Bad Movie and Video Game Sequel Titles',                  'upload_date': '20061207',                  'uploader': 'Cinemassacre', @@ -49,12 +57,12 @@ class CinemassacreIE(InfoExtractor):          {              # Youtube embedded video              'url': 'http://cinemassacre.com/2006/09/01/mckids/', -            'md5': '6eb30961fa795fedc750eac4881ad2e1', +            'md5': '7393c4e0f54602ad110c793eb7a6513a',              'info_dict': {                  'id': 'FnxsNhuikpo', -                'ext': 'mp4', +                'ext': 'webm',                  'upload_date': '20060901', -                'uploader': 'Cinemassacre Extras', +                'uploader': 'Cinemassacre Extra',                  'description': 'md5:de9b751efa9e45fbaafd9c8a1123ed53',                  'uploader_id': 'Cinemassacre',                  'title': 'AVGN: McKids', @@ -69,7 +77,11 @@ class CinemassacreIE(InfoExtractor):                  'description': 'Let’s Play Mario Kart 64 !! Mario Kart 64 is a classic go-kart racing game released for the Nintendo 64 (N64). 
Today James & Mike do 4 player Battle Mode with Kyle and Bootsy!',                  'title': 'Mario Kart 64 (Nintendo 64) James & Mike Mondays',                  'upload_date': '20150525', -            } +            }, +            'params': { +                # m3u8 download +                'skip_download': True, +            },          }      ] diff --git a/youtube_dl/extractor/cnet.py b/youtube_dl/extractor/cnet.py index 5c3908f72..3cf0bf95b 100644 --- a/youtube_dl/extractor/cnet.py +++ b/youtube_dl/extractor/cnet.py @@ -51,9 +51,7 @@ class CNETIE(ThePlatformIE):              uploader = None              uploader_id = None -        mpx_account = data['config']['uvpConfig']['default']['mpx_account'] - -        metadata = self.get_metadata('%s/%s' % (mpx_account, list(vdata['files'].values())[0]), video_id) +        metadata = self.get_metadata('kYEXFC/%s' % list(vdata['files'].values())[0], video_id)          description = vdata.get('description') or metadata.get('description')          duration = int_or_none(vdata.get('duration')) or metadata.get('duration') @@ -62,7 +60,7 @@ class CNETIE(ThePlatformIE):          for (fkey, vid) in vdata['files'].items():              if fkey == 'hls_phone' and 'hls_tablet' in vdata['files']:                  continue -            release_url = 'http://link.theplatform.com/s/%s/%s?format=SMIL&mbr=true' % (mpx_account, vid) +            release_url = 'http://link.theplatform.com/s/kYEXFC/%s?format=SMIL&mbr=true' % vid              if fkey == 'hds':                  release_url += '&manifest=f4m'              tp_formats, tp_subtitles = self._extract_theplatform_smil(release_url, video_id, 'Downloading %s SMIL data' % fkey) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 402f2f436..770105a5b 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -15,13 +15,14 @@ import math  from ..compat import (      compat_cookiejar,      compat_cookies, +    
compat_etree_fromstring,      compat_getpass,      compat_http_client, +    compat_os_name, +    compat_str,      compat_urllib_error,      compat_urllib_parse,      compat_urlparse, -    compat_str, -    compat_etree_fromstring,  )  from ..utils import (      NO_DEFAULT, @@ -47,6 +48,7 @@ from ..utils import (      determine_protocol,      parse_duration,      mimetype2ext, +    update_url_query,  ) @@ -104,7 +106,7 @@ class InfoExtractor(object):                      * protocol   The protocol that will be used for the actual                                   download, lower-case.                                   "http", "https", "rtsp", "rtmp", "rtmpe", -                                 "m3u8", or "m3u8_native". +                                 "m3u8", "m3u8_native" or "http_dash_segments".                      * preference Order number of this format. If this field is                                   present and not None, the formats get sorted                                   by this field, regardless of all other values. 
@@ -344,7 +346,7 @@ class InfoExtractor(object):      def IE_NAME(self):          return compat_str(type(self).__name__[:-2]) -    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True): +    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers=None, query=None):          """ Returns the response handle """          if note is None:              self.report_download_webpage(video_id) @@ -353,6 +355,12 @@ class InfoExtractor(object):                  self.to_screen('%s' % (note,))              else:                  self.to_screen('%s: %s' % (video_id, note)) +        # data, headers and query params will be ignored for `Request` objects +        if isinstance(url_or_request, compat_str): +            if query: +                url_or_request = update_url_query(url_or_request, query) +            if data or headers: +                url_or_request = sanitized_Request(url_or_request, data, headers or {})          try:              return self._downloader.urlopen(url_or_request)          except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: @@ -368,13 +376,13 @@ class InfoExtractor(object):                  self._downloader.report_warning(errmsg)                  return False -    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None): +    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers=None, query=None):          """ Returns a tuple (page content as string, URL handle) """          # Strip hashes from the URL (#1038)          if isinstance(url_or_request, (compat_str, str)):              url_or_request = url_or_request.partition('#')[0] -        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal) +        urlh = self._request_webpage(url_or_request, video_id, note, errnote, 
fatal, data=data, headers=headers, query=query)          if urlh is False:              assert not fatal              return False @@ -427,7 +435,7 @@ class InfoExtractor(object):              self.to_screen('Saving request to ' + filename)              # Working around MAX_PATH limitation on Windows (see              # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx) -            if os.name == 'nt': +            if compat_os_name == 'nt':                  absfilepath = os.path.abspath(filename)                  if len(absfilepath) > 259:                      filename = '\\\\?\\' + absfilepath @@ -461,13 +469,13 @@ class InfoExtractor(object):          return content -    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None): +    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers=None, query=None):          """ Returns the data of the page as a string """          success = False          try_count = 0          while success is False:              try: -                res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding) +                res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data, headers=headers, query=query)                  success = True              except compat_http_client.IncompleteRead as e:                  try_count += 1 @@ -482,10 +490,10 @@ class InfoExtractor(object):      def _download_xml(self, url_or_request, video_id,                        note='Downloading XML', errnote='Unable to download XML', -                      transform_source=None, fatal=True, encoding=None): +                      transform_source=None, fatal=True, encoding=None, data=None, headers=None, query=None):          """Return the xml as an 
xml.etree.ElementTree.Element"""          xml_string = self._download_webpage( -            url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding) +            url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding, data=data, headers=headers, query=query)          if xml_string is False:              return xml_string          if transform_source: @@ -496,10 +504,10 @@ class InfoExtractor(object):                         note='Downloading JSON metadata',                         errnote='Unable to download JSON metadata',                         transform_source=None, -                       fatal=True, encoding=None): +                       fatal=True, encoding=None, data=None, headers=None, query=None):          json_string = self._download_webpage(              url_or_request, video_id, note, errnote, fatal=fatal, -            encoding=encoding) +            encoding=encoding, data=data, headers=headers, query=query)          if (not fatal) and json_string is False:              return None          return self._parse_json( @@ -596,7 +604,7 @@ class InfoExtractor(object):                  if mobj:                      break -        if not self._downloader.params.get('no_color') and os.name != 'nt' and sys.stderr.isatty(): +        if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():              _name = '\033[0;34m%s\033[0m' % name          else:              _name = name @@ -854,6 +862,7 @@ class InfoExtractor(object):              proto_preference = 0 if determine_protocol(f) in ['http', 'https'] else -0.1              if f.get('vcodec') == 'none':  # audio only +                preference -= 50                  if self._downloader.params.get('prefer_free_formats'):                      ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']                  else: @@ -864,6 +873,8 @@ class InfoExtractor(object):                  except ValueError:                      
audio_ext_preference = -1              else: +                if f.get('acodec') == 'none':  # video only +                    preference -= 40                  if self._downloader.params.get('prefer_free_formats'):                      ORDER = ['flv', 'mp4', 'webm']                  else: @@ -965,6 +976,13 @@ class InfoExtractor(object):          if manifest is False:              return [] +        return self._parse_f4m_formats( +            manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id, +            transform_source=transform_source, fatal=fatal) + +    def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None, +                           transform_source=lambda s: fix_xml_ampersands(s).strip(), +                           fatal=True):          formats = []          manifest_version = '1.0'          media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media') @@ -990,7 +1008,8 @@ class InfoExtractor(object):                  # bitrate in f4m downloader                  if determine_ext(manifest_url) == 'f4m':                      formats.extend(self._extract_f4m_formats( -                        manifest_url, video_id, preference, f4m_id, fatal=fatal)) +                        manifest_url, video_id, preference=preference, f4m_id=f4m_id, +                        transform_source=transform_source, fatal=fatal))                      continue              tbr = int_or_none(media_el.attrib.get('bitrate'))              formats.append({ @@ -1139,8 +1158,8 @@ class InfoExtractor(object):                  out.append('{%s}%s' % (namespace, c))          return '/'.join(out) -    def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None): -        smil = self._download_smil(smil_url, video_id, fatal=fatal) +    def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None): +        smil = self._download_smil(smil_url, video_id, fatal=fatal, 
transform_source=transform_source)          if smil is False:              assert not fatal @@ -1157,10 +1176,10 @@ class InfoExtractor(object):              return {}          return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params) -    def _download_smil(self, smil_url, video_id, fatal=True): +    def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):          return self._download_xml(              smil_url, video_id, 'Downloading SMIL file', -            'Unable to download SMIL file', fatal=fatal) +            'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)      def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):          namespace = self._parse_smil_namespace(smil) @@ -1446,8 +1465,9 @@ class InfoExtractor(object):                          continue                      representation_attrib = adaptation_set.attrib.copy()                      representation_attrib.update(representation.attrib) -                    mime_type = representation_attrib.get('mimeType') -                    content_type = mime_type.split('/')[0] if mime_type else representation_attrib.get('contentType') +                    # According to page 41 of ISO/IEC 29001-1:2014, @mimeType is mandatory +                    mime_type = representation_attrib['mimeType'] +                    content_type = mime_type.split('/')[0]                      if content_type == 'text':                          # TODO implement WebVTT downloading                          pass @@ -1470,6 +1490,7 @@ class InfoExtractor(object):                          f = {                              'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,                              'url': base_url, +                            'ext': mimetype2ext(mime_type),                              'width': int_or_none(representation_attrib.get('width')),                              'height': 
int_or_none(representation_attrib.get('height')),                              'tbr': int_or_none(representation_attrib.get('bandwidth'), 1000), diff --git a/youtube_dl/extractor/douyutv.py b/youtube_dl/extractor/douyutv.py index 373b3b4b4..bdc768c78 100644 --- a/youtube_dl/extractor/douyutv.py +++ b/youtube_dl/extractor/douyutv.py @@ -18,7 +18,7 @@ class DouyuTVIE(InfoExtractor):              'display_id': 'iseven',              'ext': 'flv',              'title': 're:^清晨醒脑!T-ara根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', -            'description': 'md5:c93d6692dde6fe33809a46edcbecca44', +            'description': 'md5:f34981259a03e980a3c6404190a3ed61',              'thumbnail': 're:^https?://.*\.jpg$',              'uploader': '7师傅',              'uploader_id': '431925', @@ -26,7 +26,7 @@ class DouyuTVIE(InfoExtractor):          },          'params': {              'skip_download': True, -        } +        },      }, {          'url': 'http://www.douyutv.com/85982',          'info_dict': { @@ -42,7 +42,24 @@ class DouyuTVIE(InfoExtractor):          },          'params': {              'skip_download': True, -        } +        }, +        'skip': 'Romm not found', +    }, { +        'url': 'http://www.douyutv.com/17732', +        'info_dict': { +            'id': '17732', +            'display_id': '17732', +            'ext': 'flv', +            'title': 're:^清晨醒脑!T-ara根本停不下来! 
[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', +            'description': 'md5:f34981259a03e980a3c6404190a3ed61', +            'thumbnail': 're:^https?://.*\.jpg$', +            'uploader': '7师傅', +            'uploader_id': '431925', +            'is_live': True, +        }, +        'params': { +            'skip_download': True, +        },      }]      def _real_extract(self, url): diff --git a/youtube_dl/extractor/dw.py b/youtube_dl/extractor/dw.py new file mode 100644 index 000000000..b6c985547 --- /dev/null +++ b/youtube_dl/extractor/dw.py @@ -0,0 +1,85 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import int_or_none +from ..compat import compat_urlparse + + +class DWIE(InfoExtractor): +    IE_NAME = 'dw' +    _VALID_URL = r'https?://(?:www\.)?dw\.com/(?:[^/]+/)+av-(?P<id>\d+)' +    _TESTS = [{ +        # video +        'url': 'http://www.dw.com/en/intelligent-light/av-19112290', +        'md5': '7372046e1815c5a534b43f3c3c36e6e9', +        'info_dict': { +            'id': '19112290', +            'ext': 'mp4', +            'title': 'Intelligent light', +            'description': 'md5:90e00d5881719f2a6a5827cb74985af1', +            'upload_date': '20160311', +        } +    }, { +        # audio +        'url': 'http://www.dw.com/en/worldlink-my-business/av-19111941', +        'md5': '2814c9a1321c3a51f8a7aeb067a360dd', +        'info_dict': { +            'id': '19111941', +            'ext': 'mp3', +            'title': 'WorldLink: My business', +            'description': 'md5:bc9ca6e4e063361e21c920c53af12405', +            'upload_date': '20160311', +        } +    }] + +    def _real_extract(self, url): +        media_id = self._match_id(url) +        webpage = self._download_webpage(url, media_id) +        hidden_inputs = self._hidden_inputs(webpage) +        title = hidden_inputs['media_title'] + +        formats = [] +        if hidden_inputs.get('player_type') == 'video' and 
hidden_inputs.get('stream_file') == '1': +            formats = self._extract_smil_formats( +                'http://www.dw.com/smil/v-%s' % media_id, media_id, +                transform_source=lambda s: s.replace( +                    'rtmp://tv-od.dw.de/flash/', +                    'http://tv-download.dw.de/dwtv_video/flv/')) +        else: +            formats = [{'url': hidden_inputs['file_name']}] + +        return { +            'id': media_id, +            'title': title, +            'description': self._og_search_description(webpage), +            'thumbnail': hidden_inputs.get('preview_image'), +            'duration': int_or_none(hidden_inputs.get('file_duration')), +            'upload_date': hidden_inputs.get('display_date'), +            'formats': formats, +        } + + +class DWArticleIE(InfoExtractor): +    IE_NAME = 'dw:article' +    _VALID_URL = r'https?://(?:www\.)?dw\.com/(?:[^/]+/)+a-(?P<id>\d+)' +    _TEST = { +        'url': 'http://www.dw.com/en/no-hope-limited-options-for-refugees-in-idomeni/a-19111009', +        'md5': '8ca657f9d068bbef74d6fc38b97fc869', +        'info_dict': { +            'id': '19105868', +            'ext': 'mp4', +            'title': 'The harsh life of refugees in Idomeni', +            'description': 'md5:196015cc7e48ebf474db9399420043c7', +            'upload_date': '20160310', +        } +    } + +    def _real_extract(self, url): +        article_id = self._match_id(url) +        webpage = self._download_webpage(url, article_id) +        hidden_inputs = self._hidden_inputs(webpage) +        media_id = hidden_inputs['media_id'] +        media_path = self._search_regex(r'href="([^"]+av-%s)"\s+class="overlayLink"' % media_id, webpage, 'media url') +        media_url = compat_urlparse.urljoin(url, media_path) +        return self.url_result(media_url, 'DW', media_id) diff --git a/youtube_dl/extractor/elpais.py b/youtube_dl/extractor/elpais.py index 00a69e631..8c725a4e6 100644 --- a/youtube_dl/extractor/elpais.py 
+++ b/youtube_dl/extractor/elpais.py @@ -9,7 +9,7 @@ class ElPaisIE(InfoExtractor):      _VALID_URL = r'https?://(?:[^.]+\.)?elpais\.com/.*/(?P<id>[^/#?]+)\.html(?:$|[?#])'      IE_DESC = 'El País' -    _TEST = { +    _TESTS = [{          'url': 'http://blogs.elpais.com/la-voz-de-inaki/2014/02/tiempo-nuevo-recetas-viejas.html',          'md5': '98406f301f19562170ec071b83433d55',          'info_dict': { @@ -19,30 +19,41 @@ class ElPaisIE(InfoExtractor):              'description': 'De lunes a viernes, a partir de las ocho de la mañana, Iñaki Gabilondo nos cuenta su visión de la actualidad nacional e internacional.',              'upload_date': '20140206',          } -    } +    }, { +        'url': 'http://elcomidista.elpais.com/elcomidista/2016/02/24/articulo/1456340311_668921.html#?id_externo_nwl=newsletter_diaria20160303t', +        'md5': '3bd5b09509f3519d7d9e763179b013de', +        'info_dict': { +            'id': '1456340311_668921', +            'ext': 'mp4', +            'title': 'Cómo hacer el mejor café con cafetera italiana', +            'description': 'Que sí, que las cápsulas son cómodas. Pero si le pides algo más a la vida, quizá deberías aprender a usar bien la cafetera italiana. 
No tienes más que ver este vídeo y seguir sus siete normas básicas.', +            'upload_date': '20160303', +        } +    }]      def _real_extract(self, url):          video_id = self._match_id(url)          webpage = self._download_webpage(url, video_id)          prefix = self._html_search_regex( -            r'var url_cache = "([^"]+)";', webpage, 'URL prefix') +            r'var\s+url_cache\s*=\s*"([^"]+)";', webpage, 'URL prefix')          video_suffix = self._search_regex( -            r"URLMediaFile = url_cache \+ '([^']+)'", webpage, 'video URL') +            r"(?:URLMediaFile|urlVideo_\d+)\s*=\s*url_cache\s*\+\s*'([^']+)'", webpage, 'video URL')          video_url = prefix + video_suffix          thumbnail_suffix = self._search_regex( -            r"URLMediaStill = url_cache \+ '([^']+)'", webpage, 'thumbnail URL', -            fatal=False) +            r"(?:URLMediaStill|urlFotogramaFijo_\d+)\s*=\s*url_cache\s*\+\s*'([^']+)'", +            webpage, 'thumbnail URL', fatal=False)          thumbnail = (              None if thumbnail_suffix is None              else prefix + thumbnail_suffix)          title = self._html_search_regex( -            '<h2 class="entry-header entry-title.*?>(.*?)</h2>', +            (r"tituloVideo\s*=\s*'([^']+)'", webpage, 'title', +             r'<h2 class="entry-header entry-title.*?>(.*?)</h2>'),              webpage, 'title') -        date_str = self._search_regex( +        upload_date = unified_strdate(self._search_regex(              r'<p class="date-header date-int updated"\s+title="([^"]+)">', -            webpage, 'upload date', fatal=False) -        upload_date = (None if date_str is None else unified_strdate(date_str)) +            webpage, 'upload date', default=None) or self._html_search_meta( +            'datePublished', webpage, 'timestamp'))          return {              'id': video_id, diff --git a/youtube_dl/extractor/engadget.py b/youtube_dl/extractor/engadget.py index e4180701d..e5e57d485 100644 --- 
a/youtube_dl/extractor/engadget.py +++ b/youtube_dl/extractor/engadget.py @@ -1,21 +1,13 @@  from __future__ import unicode_literals -import re -  from .common import InfoExtractor -from ..utils import ( -    url_basename, -)  class EngadgetIE(InfoExtractor): -    _VALID_URL = r'''(?x)https?://www.engadget.com/ -        (?:video(?:/5min)?/(?P<id>\d+)| -            [\d/]+/.*?) -        ''' +    _VALID_URL = r'https?://www.engadget.com/video/(?P<id>\d+)'      _TEST = { -        'url': 'http://www.engadget.com/video/5min/518153925/', +        'url': 'http://www.engadget.com/video/518153925/',          'md5': 'c6820d4828a5064447a4d9fc73f312c9',          'info_dict': {              'id': '518153925', @@ -27,15 +19,4 @@ class EngadgetIE(InfoExtractor):      def _real_extract(self, url):          video_id = self._match_id(url) - -        if video_id is not None: -            return self.url_result('5min:%s' % video_id) -        else: -            title = url_basename(url) -            webpage = self._download_webpage(url, title) -            ids = re.findall(r'<iframe[^>]+?playList=(\d+)', webpage) -            return { -                '_type': 'playlist', -                'title': title, -                'entries': [self.url_result('5min:%s' % vid) for vid in ids] -            } +        return self.url_result('5min:%s' % video_id) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 6c6c3b1bd..f5bbd39d2 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -37,7 +37,9 @@ class FacebookIE(InfoExtractor):                                  video/embed|                                  story\.php                              )\?(?:.*?)(?:v|video_id|story_fbid)=| -                            [^/]+/videos/(?:[^/]+/)? 
+                            [^/]+/videos/(?:[^/]+/)?| +                            [^/]+/posts/| +                            groups/[^/]+/permalink/                          )|                      facebook:                  ) @@ -50,6 +52,8 @@ class FacebookIE(InfoExtractor):      _CHROME_USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.97 Safari/537.36' +    _VIDEO_PAGE_TEMPLATE = 'https://www.facebook.com/video/video.php?v=%s' +      _TESTS = [{          'url': 'https://www.facebook.com/video.php?v=637842556329505&fref=nf',          'md5': '6a40d33c0eccbb1af76cf0485a052659', @@ -82,6 +86,33 @@ class FacebookIE(InfoExtractor):              'uploader': 'Demy de Zeeuw',          },      }, { +        'url': 'https://www.facebook.com/maxlayn/posts/10153807558977570', +        'md5': '037b1fa7f3c2d02b7a0d7bc16031ecc6', +        'info_dict': { +            'id': '544765982287235', +            'ext': 'mp4', +            'title': '"What are you doing running in the snow?"', +            'uploader': 'FailArmy', +        } +    }, { +        'url': 'https://m.facebook.com/story.php?story_fbid=1035862816472149&id=116132035111903', +        'md5': '1deb90b6ac27f7efcf6d747c8a27f5e3', +        'info_dict': { +            'id': '1035862816472149', +            'ext': 'mp4', +            'title': 'What the Flock Is Going On In New Zealand  Credit: ViralHog', +            'uploader': 'S. 
Saint', +        }, +    }, { +        'note': 'swf params escaped', +        'url': 'https://www.facebook.com/barackobama/posts/10153664894881749', +        'md5': '97ba073838964d12c70566e0085c2b91', +        'info_dict': { +            'id': '10153664894881749', +            'ext': 'mp4', +            'title': 'Facebook video #10153664894881749', +        }, +    }, {          'url': 'https://www.facebook.com/video.php?v=10204634152394104',          'only_matching': True,      }, { @@ -94,7 +125,7 @@ class FacebookIE(InfoExtractor):          'url': 'facebook:544765982287235',          'only_matching': True,      }, { -        'url': 'https://m.facebook.com/story.php?story_fbid=1035862816472149&id=116132035111903', +        'url': 'https://www.facebook.com/groups/164828000315060/permalink/764967300301124/',          'only_matching': True,      }] @@ -164,19 +195,19 @@ class FacebookIE(InfoExtractor):      def _real_initialize(self):          self._login() -    def _real_extract(self, url): -        video_id = self._match_id(url) -        req = sanitized_Request('https://www.facebook.com/video/video.php?v=%s' % video_id) +    def _extract_from_url(self, url, video_id, fatal_if_no_video=True): +        req = sanitized_Request(url)          req.add_header('User-Agent', self._CHROME_USER_AGENT)          webpage = self._download_webpage(req, video_id)          video_data = None -        BEFORE = '{swf.addParam(param[0], param[1]);});\n' +        BEFORE = '{swf.addParam(param[0], param[1]);});'          AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});' -        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage) +        m = re.search(re.escape(BEFORE) + '(?:\n|\\\\n)(.*?)' + re.escape(AFTER), webpage)          if m: -            data = dict(json.loads(m.group(1))) +            swf_params = m.group(1).replace('\\\\', '\\').replace('\\"', '"') +            data = dict(json.loads(swf_params))              params_raw = 
compat_urllib_parse_unquote(data['params'])              video_data = json.loads(params_raw)['video_data'] @@ -189,13 +220,15 @@ class FacebookIE(InfoExtractor):          if not video_data:              server_js_data = self._parse_json(self._search_regex( -                r'handleServerJS\(({.+})\);', webpage, 'server js data'), video_id) +                r'handleServerJS\(({.+})\);', webpage, 'server js data', default='{}'), video_id)              for item in server_js_data.get('instances', []):                  if item[1][0] == 'VideoConfig':                      video_data = video_data_list2dict(item[2][0]['videoData'])                      break          if not video_data: +            if not fatal_if_no_video: +                return webpage, False              m_msg = re.search(r'class="[^"]*uiInterstitialContent[^"]*"><div>(.*?)</div>', webpage)              if m_msg is not None:                  raise ExtractorError( @@ -241,39 +274,36 @@ class FacebookIE(InfoExtractor):              video_title = 'Facebook video #%s' % video_id          uploader = clean_html(get_element_by_id('fbPhotoPageAuthorName', webpage)) -        return { +        info_dict = {              'id': video_id,              'title': video_title,              'formats': formats,              'uploader': uploader,          } - -class FacebookPostIE(InfoExtractor): -    IE_NAME = 'facebook:post' -    _VALID_URL = r'https?://(?:\w+\.)?facebook\.com/[^/]+/posts/(?P<id>\d+)' -    _TEST = { -        'url': 'https://www.facebook.com/maxlayn/posts/10153807558977570', -        'md5': '037b1fa7f3c2d02b7a0d7bc16031ecc6', -        'info_dict': { -            'id': '544765982287235', -            'ext': 'mp4', -            'title': '"What are you doing running in the snow?"', -            'uploader': 'FailArmy', -        } -    } +        return webpage, info_dict      def _real_extract(self, url): -        post_id = self._match_id(url) +        video_id = self._match_id(url) + +        real_url = 
self._VIDEO_PAGE_TEMPLATE % video_id if url.startswith('facebook:') else url +        webpage, info_dict = self._extract_from_url(real_url, video_id, fatal_if_no_video=False) -        webpage = self._download_webpage(url, post_id) +        if info_dict: +            return info_dict -        entries = [ -            self.url_result('facebook:%s' % video_id, FacebookIE.ie_key()) -            for video_id in self._parse_json( -                self._search_regex( -                    r'(["\'])video_ids\1\s*:\s*(?P<ids>\[.+?\])', -                    webpage, 'video ids', group='ids'), -                post_id)] +        if '/posts/' in url: +            entries = [ +                self.url_result('facebook:%s' % vid, FacebookIE.ie_key()) +                for vid in self._parse_json( +                    self._search_regex( +                        r'(["\'])video_ids\1\s*:\s*(?P<ids>\[.+?\])', +                        webpage, 'video ids', group='ids'), +                    video_id)] -        return self.playlist_result(entries, post_id) +            return self.playlist_result(entries, video_id) +        else: +            _, info_dict = self._extract_from_url( +                self._VIDEO_PAGE_TEMPLATE % video_id, +                video_id, fatal_if_no_video=True) +            return info_dict diff --git a/youtube_dl/extractor/fivemin.py b/youtube_dl/extractor/fivemin.py index 2955965d9..67d50a386 100644 --- a/youtube_dl/extractor/fivemin.py +++ b/youtube_dl/extractor/fivemin.py @@ -1,5 +1,7 @@  from __future__ import unicode_literals +import re +  from .common import InfoExtractor  from ..compat import (      compat_urllib_parse, @@ -16,12 +18,7 @@ from ..utils import (  class FiveMinIE(InfoExtractor):      IE_NAME = '5min' -    _VALID_URL = r'''(?x) -        (?:https?://[^/]*?5min\.com/Scripts/PlayerSeed\.js\?(?:.*?&)?playList=| -            https?://(?:(?:massively|www)\.)?joystiq\.com/video/| -            5min:) -        (?P<id>\d+) -        ''' +    _VALID_URL 
= r'(?:5min:(?P<id>\d+)(?::(?P<sid>\d+))?|https?://[^/]*?5min\.com/Scripts/PlayerSeed\.js\?(?P<query>.*))'      _TESTS = [          { @@ -45,6 +42,7 @@ class FiveMinIE(InfoExtractor):                  'title': 'How to Make a Next-Level Fruit Salad',                  'duration': 184,              }, +            'skip': 'no longer available',          },      ]      _ERRORS = { @@ -91,20 +89,33 @@ class FiveMinIE(InfoExtractor):      }      def _real_extract(self, url): -        video_id = self._match_id(url) +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('id') +        sid = mobj.group('sid') + +        if mobj.group('query'): +            qs = compat_parse_qs(mobj.group('query')) +            if not qs.get('playList'): +                raise ExtractorError('Invalid URL', expected=True) +            video_id = qs['playList'][0] +            if qs.get('sid'): +                sid = qs['sid'][0] +          embed_url = 'https://embed.5min.com/playerseed/?playList=%s' % video_id -        embed_page = self._download_webpage(embed_url, video_id, -                                            'Downloading embed page') -        sid = self._search_regex(r'sid=(\d+)', embed_page, 'sid') -        query = compat_urllib_parse.urlencode({ -            'func': 'GetResults', -            'playlist': video_id, -            'sid': sid, -            'isPlayerSeed': 'true', -            'url': embed_url, -        }) +        if not sid: +            embed_page = self._download_webpage(embed_url, video_id, +                                                'Downloading embed page') +            sid = self._search_regex(r'sid=(\d+)', embed_page, 'sid') +          response = self._download_json( -            'https://syn.5min.com/handlers/SenseHandler.ashx?' + query, +            'https://syn.5min.com/handlers/SenseHandler.ashx?' 
+ +            compat_urllib_parse.urlencode({ +                'func': 'GetResults', +                'playlist': video_id, +                'sid': sid, +                'isPlayerSeed': 'true', +                'url': embed_url, +            }),              video_id)          if not response['success']:              raise ExtractorError( @@ -118,9 +129,7 @@ class FiveMinIE(InfoExtractor):          parsed_video_url = compat_urllib_parse_urlparse(compat_parse_qs(              compat_urllib_parse_urlparse(info['EmbededURL']).query)['videoUrl'][0])          for rendition in info['Renditions']: -            if rendition['RenditionType'] == 'm3u8': -                formats.extend(self._extract_m3u8_formats(rendition['Url'], video_id, m3u8_id='hls')) -            elif rendition['RenditionType'] == 'aac': +            if rendition['RenditionType'] == 'aac' or rendition['RenditionType'] == 'm3u8':                  continue              else:                  rendition_url = compat_urlparse.urlunparse(parsed_video_url._replace(path=replace_extension(parsed_video_url.path.replace('//', '/%s/' % rendition['ID']), rendition['RenditionType']))) diff --git a/youtube_dl/extractor/foxnews.py b/youtube_dl/extractor/foxnews.py index 318ac013d..1dc50318c 100644 --- a/youtube_dl/extractor/foxnews.py +++ b/youtube_dl/extractor/foxnews.py @@ -36,6 +36,10 @@ class FoxNewsIE(AMPIE):                  # 'upload_date': '20141204',                  'thumbnail': 're:^https?://.*\.jpg$',              }, +            'params': { +                # m3u8 download +                'skip_download': True, +            },          },          {              'url': 'http://video.foxnews.com/v/video-embed.html?video_id=3937480&d=video.foxnews.com', diff --git a/youtube_dl/extractor/freespeech.py b/youtube_dl/extractor/freespeech.py index c210177f7..1477708bb 100644 --- a/youtube_dl/extractor/freespeech.py +++ b/youtube_dl/extractor/freespeech.py @@ -14,7 +14,7 @@ class FreespeechIE(InfoExtractor):      
    'url': 'https://www.freespeech.org/video/obama-romney-campaign-colorado-ahead-debate-0',          'info_dict': {              'id': 'poKsVCZ64uU', -            'ext': 'mp4', +            'ext': 'webm',              'title': 'Obama, Romney Campaign in Colorado Ahead of Debate',              'description': 'Obama, Romney Campaign in Colorado Ahead of Debate',              'uploader': 'freespeechtv', diff --git a/youtube_dl/extractor/gameinformer.py b/youtube_dl/extractor/gameinformer.py index 25870c131..a66e309de 100644 --- a/youtube_dl/extractor/gameinformer.py +++ b/youtube_dl/extractor/gameinformer.py @@ -2,42 +2,27 @@  from __future__ import unicode_literals  from .common import InfoExtractor -from ..compat import compat_str -from ..utils import int_or_none  class GameInformerIE(InfoExtractor):      _VALID_URL = r'https?://(?:www\.)?gameinformer\.com/(?:[^/]+/)*(?P<id>.+)\.aspx'      _TEST = {          'url': 'http://www.gameinformer.com/b/features/archive/2015/09/26/replay-animal-crossing.aspx', +        'md5': '292f26da1ab4beb4c9099f1304d2b071',          'info_dict': {              'id': '4515472681001', -            'ext': 'm3u8', +            'ext': 'mp4',              'title': 'Replay - Animal Crossing',              'description': 'md5:2e211891b215c85d061adc7a4dd2d930', -            'timestamp': 1443457610706, -        }, -        'params': { -            # m3u8 download -            'skip_download': True, +            'timestamp': 1443457610, +            'upload_date': '20150928', +            'uploader_id': '694940074001',          },      } +    BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/694940074001/default_default/index.html?videoId=%s'      def _real_extract(self, url):          display_id = self._match_id(url)          webpage = self._download_webpage(url, display_id) - -        bc_api_url = self._search_regex(r"getVideo\('([^']+)'", webpage, 'brightcove api url') -        json_data = self._download_json( -            bc_api_url + 
'&video_fields=id,name,shortDescription,publishedDate,videoStillURL,length,IOSRenditions', -            display_id) - -        return { -            'id': compat_str(json_data['id']), -            'display_id': display_id, -            'url': json_data['IOSRenditions'][0]['url'], -            'title': json_data['name'], -            'description': json_data.get('shortDescription'), -            'timestamp': int_or_none(json_data.get('publishedDate')), -            'duration': int_or_none(json_data.get('length')), -        } +        brightcove_id = self._search_regex(r"getVideo\('[^']+video_id=(\d+)", webpage, 'brightcove id') +        return self.url_result(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', brightcove_id) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index ca745ae41..8121f04a5 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1242,28 +1242,34 @@ class GenericIE(InfoExtractor):              full_response = self._request_webpage(request, video_id)              head_response = full_response +        info_dict = { +            'id': video_id, +            'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]), +        } +          # Check for direct link to a video          content_type = head_response.headers.get('Content-Type', '')          m = re.match(r'^(?P<type>audio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P<format_id>.+)$', content_type)          if m:              upload_date = unified_strdate(                  head_response.headers.get('Last-Modified')) -            formats = [] -            if m.group('format_id').endswith('mpegurl'): +            format_id = m.group('format_id') +            if format_id.endswith('mpegurl'):                  formats = self._extract_m3u8_formats(url, video_id, 'mp4') +            elif format_id == 'f4m': +                formats = self._extract_f4m_formats(url, video_id)              
else:                  formats = [{                      'format_id': m.group('format_id'),                      'url': url,                      'vcodec': 'none' if m.group('type') == 'audio' else None                  }] -            return { -                'id': video_id, -                'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]), +            info_dict.update({                  'direct': True,                  'formats': formats,                  'upload_date': upload_date, -            } +            }) +            return info_dict          if not self._downloader.params.get('test', False) and not is_intentional:              force = self._downloader.params.get('force_generic_extractor', False) @@ -1291,13 +1297,12 @@ class GenericIE(InfoExtractor):                  'URL could be a direct video link, returning it as such.')              upload_date = unified_strdate(                  head_response.headers.get('Last-Modified')) -            return { -                'id': video_id, -                'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]), +            info_dict.update({                  'direct': True,                  'url': url,                  'upload_date': upload_date, -            } +            }) +            return info_dict          webpage = self._webpage_read_content(              full_response, url, video_id, prefix=first_bytes) @@ -1314,12 +1319,12 @@ class GenericIE(InfoExtractor):              elif doc.tag == '{http://xspf.org/ns/0/}playlist':                  return self.playlist_result(self._parse_xspf(doc, video_id), video_id)              elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag): -                return { -                    'id': video_id, -                    'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]), -                    'formats': self._parse_mpd_formats( -                        doc, video_id, 
mpd_base_url=url.rpartition('/')[0]), -                } +                info_dict['formats'] = self._parse_mpd_formats( +                    doc, video_id, mpd_base_url=url.rpartition('/')[0]) +                return info_dict +            elif re.match(r'^{http://ns\.adobe\.com/f4m/[12]\.0}manifest$', doc.tag): +                info_dict['formats'] = self._parse_f4m_formats(doc, url, video_id) +                return info_dict          except compat_xml_parse_error:              pass @@ -1985,6 +1990,8 @@ class GenericIE(InfoExtractor):                  entry_info_dict['formats'] = self._extract_m3u8_formats(video_url, video_id, ext='mp4')              elif ext == 'mpd':                  entry_info_dict['formats'] = self._extract_mpd_formats(video_url, video_id) +            elif ext == 'f4m': +                entry_info_dict['formats'] = self._extract_f4m_formats(video_url, video_id)              else:                  entry_info_dict['url'] = video_url diff --git a/youtube_dl/extractor/googledrive.py b/youtube_dl/extractor/googledrive.py index 37be34091..766fc26d0 100644 --- a/youtube_dl/extractor/googledrive.py +++ b/youtube_dl/extractor/googledrive.py @@ -10,8 +10,8 @@ from ..utils import (  class GoogleDriveIE(InfoExtractor): -    _VALID_URL = r'https?://(?:(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/)|video\.google\.com/get_player\?.*?docid=)(?P<id>[a-zA-Z0-9_-]{28})' -    _TEST = { +    _VALID_URL = r'https?://(?:(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/)|video\.google\.com/get_player\?.*?docid=)(?P<id>[a-zA-Z0-9_-]{28,})' +    _TESTS = [{          'url': 'https://drive.google.com/file/d/0ByeS4oOUV-49Zzh4R1J6R09zazQ/edit?pli=1',          'md5': '881f7700aec4f538571fa1e0eed4a7b6',          'info_dict': { @@ -20,7 +20,11 @@ class GoogleDriveIE(InfoExtractor):              'title': 'Big Buck Bunny.mp4',              'duration': 46,          } -    } +    }, { +        # video id is longer than 28 characters +        'url': 
'https://drive.google.com/file/d/1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ/edit', +        'only_matching': True, +    }]      _FORMATS_EXT = {          '5': 'flv',          '6': 'flv', @@ -43,7 +47,7 @@ class GoogleDriveIE(InfoExtractor):      @staticmethod      def _extract_url(webpage):          mobj = re.search( -            r'<iframe[^>]+src="https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9_-]{28})', +            r'<iframe[^>]+src="https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9_-]{28,})',              webpage)          if mobj:              return 'https://drive.google.com/file/d/%s' % mobj.group('id') diff --git a/youtube_dl/extractor/indavideo.py b/youtube_dl/extractor/indavideo.py index 12fb5e8e1..9622f198a 100644 --- a/youtube_dl/extractor/indavideo.py +++ b/youtube_dl/extractor/indavideo.py @@ -73,7 +73,7 @@ class IndavideoEmbedIE(InfoExtractor):              'url': self._proto_relative_url(thumbnail)          } for thumbnail in video.get('thumbnails', [])] -        tags = [tag['title'] for tag in video.get('tags', [])] +        tags = [tag['title'] for tag in video.get('tags') or []]          return {              'id': video.get('id') or video_id, diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py index d3bee3a19..e7c0cb3f6 100644 --- a/youtube_dl/extractor/iqiyi.py +++ b/youtube_dl/extractor/iqiyi.py @@ -501,7 +501,7 @@ class IqiyiIE(InfoExtractor):      def get_enc_key(self, video_id):          # TODO: automatic key extraction          # last update at 2016-01-22 for Zombie::bite -        enc_key = '6ab6d0280511493ba85594779759d4ed' +        enc_key = '8ed797d224d043e7ac23d95b70227d32'          return enc_key      def _extract_playlist(self, webpage): diff --git a/youtube_dl/extractor/jeuxvideo.py b/youtube_dl/extractor/jeuxvideo.py index eef7daa29..137db873c 100644 --- a/youtube_dl/extractor/jeuxvideo.py +++ 
b/youtube_dl/extractor/jeuxvideo.py @@ -30,7 +30,7 @@ class JeuxVideoIE(InfoExtractor):          webpage = self._download_webpage(url, title)          title = self._html_search_meta('name', webpage) or self._og_search_title(webpage)          config_url = self._html_search_regex( -            r'data-src="(/contenu/medias/video.php.*?)"', +            r'data-src(?:set-video)?="(/contenu/medias/video.php.*?)"',              webpage, 'config URL')          config_url = 'http://www.jeuxvideo.com' + config_url diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py index ccbc39c66..44d7c84a1 100644 --- a/youtube_dl/extractor/kaltura.py +++ b/youtube_dl/extractor/kaltura.py @@ -8,6 +8,7 @@ from .common import InfoExtractor  from ..compat import (      compat_urllib_parse,      compat_urlparse, +    compat_parse_qs,  )  from ..utils import (      clean_html, @@ -20,21 +21,17 @@ from ..utils import (  class KalturaIE(InfoExtractor):      _VALID_URL = r'''(?x)                  (?: -                    kaltura:(?P<partner_id_s>\d+):(?P<id_s>[0-9a-z_]+)| +                    kaltura:(?P<partner_id>\d+):(?P<id>[0-9a-z_]+)|                      https?://                          (:?(?:www|cdnapi(?:sec)?)\.)?kaltura\.com/                          (?:                              (?:                                  # flash player -                                index\.php/kwidget/ -                                (?:[^/]+/)*?wid/_(?P<partner_id>\d+)/ -                                (?:[^/]+/)*?entry_id/(?P<id>[0-9a-z_]+)| +                                index\.php/kwidget|                                  # html5 player -                                html5/html5lib/ -                                (?:[^/]+/)*?entry_id/(?P<id_html5>[0-9a-z_]+) -                                .*\?.*\bwid=_(?P<partner_id_html5>\d+) +                                html5/html5lib/[^/]+/mwEmbedFrame\.php                              ) -                        ) +          
              )(?:/(?P<path>[^?]+))?(?:\?(?P<query>.*))?                  )                  '''      _API_BASE = 'http://cdnapi.kaltura.com/api_v3/index.php?' @@ -127,10 +124,41 @@ class KalturaIE(InfoExtractor):          url, smuggled_data = unsmuggle_url(url, {})          mobj = re.match(self._VALID_URL, url) -        partner_id = mobj.group('partner_id_s') or mobj.group('partner_id') or mobj.group('partner_id_html5') -        entry_id = mobj.group('id_s') or mobj.group('id') or mobj.group('id_html5') - -        info, flavor_assets = self._get_video_info(entry_id, partner_id) +        partner_id, entry_id = mobj.group('partner_id', 'id') +        ks = None +        if partner_id and entry_id: +            info, flavor_assets = self._get_video_info(entry_id, partner_id) +        else: +            path, query = mobj.group('path', 'query') +            if not path and not query: +                raise ExtractorError('Invalid URL', expected=True) +            params = {} +            if query: +                params = compat_parse_qs(query) +            if path: +                splitted_path = path.split('/') +                params.update(dict((zip(splitted_path[::2], [[v] for v in splitted_path[1::2]])))) +            if 'wid' in params: +                partner_id = params['wid'][0][1:] +            elif 'p' in params: +                partner_id = params['p'][0] +            else: +                raise ExtractorError('Invalid URL', expected=True) +            if 'entry_id' in params: +                entry_id = params['entry_id'][0] +                info, flavor_assets = self._get_video_info(entry_id, partner_id) +            elif 'uiconf_id' in params and 'flashvars[referenceId]' in params: +                reference_id = params['flashvars[referenceId]'][0] +                webpage = self._download_webpage(url, reference_id) +                entry_data = self._parse_json(self._search_regex( +                    
r'window\.kalturaIframePackageData\s*=\s*({.*});', +                    webpage, 'kalturaIframePackageData'), +                    reference_id)['entryResult'] +                info, flavor_assets = entry_data['meta'], entry_data['contextData']['flavorAssets'] +                entry_id = info['id'] +            else: +                raise ExtractorError('Invalid URL', expected=True) +            ks = params.get('flashvars[ks]', [None])[0]          source_url = smuggled_data.get('source_url')          if source_url: @@ -140,14 +168,19 @@ class KalturaIE(InfoExtractor):          else:              referrer = None +        def sign_url(unsigned_url): +            if ks: +                unsigned_url += '/ks/%s' % ks +            if referrer: +                unsigned_url += '?referrer=%s' % referrer +            return unsigned_url +          formats = []          for f in flavor_assets:              # Continue if asset is not ready              if f['status'] != 2:                  continue -            video_url = '%s/flavorId/%s' % (info['dataUrl'], f['id']) -            if referrer: -                video_url += '?referrer=%s' % referrer +            video_url = sign_url('%s/flavorId/%s' % (info['dataUrl'], f['id']))              formats.append({                  'format_id': '%(fileExt)s-%(bitrate)s' % f,                  'ext': f.get('fileExt'), @@ -160,9 +193,7 @@ class KalturaIE(InfoExtractor):                  'width': int_or_none(f.get('width')),                  'url': video_url,              }) -        m3u8_url = info['dataUrl'].replace('format/url', 'format/applehttp') -        if referrer: -            m3u8_url += '?referrer=%s' % referrer +        m3u8_url = sign_url(info['dataUrl'].replace('format/url', 'format/applehttp'))          formats.extend(self._extract_m3u8_formats(              m3u8_url, entry_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) diff --git a/youtube_dl/extractor/khanacademy.py b/youtube_dl/extractor/khanacademy.py index 
08a671fa8..61739efa7 100644 --- a/youtube_dl/extractor/khanacademy.py +++ b/youtube_dl/extractor/khanacademy.py @@ -14,10 +14,10 @@ class KhanAcademyIE(InfoExtractor):      _TESTS = [{          'url': 'http://www.khanacademy.org/video/one-time-pad', -        'md5': '7021db7f2d47d4fff89b13177cb1e8f4', +        'md5': '7b391cce85e758fb94f763ddc1bbb979',          'info_dict': {              'id': 'one-time-pad', -            'ext': 'mp4', +            'ext': 'webm',              'title': 'The one-time pad',              'description': 'The perfect cipher',              'duration': 176, diff --git a/youtube_dl/extractor/kusi.py b/youtube_dl/extractor/kusi.py new file mode 100644 index 000000000..931f34c9b --- /dev/null +++ b/youtube_dl/extractor/kusi.py @@ -0,0 +1,99 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import random +import re + +from .common import InfoExtractor +from ..compat import compat_urllib_parse_unquote_plus +from ..utils import ( +    int_or_none, +    float_or_none, +    timeconvert, +    update_url_query, +    xpath_text, +) + + +class KUSIIE(InfoExtractor): +    _VALID_URL = r'http://(?:www\.)?kusi\.com/(?P<path>story/.+|video\?clipId=(?P<clipId>\d+))' +    _TESTS = [{ +        'url': 'http://www.kusi.com/story/31183873/turko-files-case-closed-put-on-hold', +        'md5': 'f926e7684294cf8cb7bdf8858e1b3988', +        'info_dict': { +            'id': '12203019', +            'ext': 'mp4', +            'title': 'Turko Files: Case Closed! & Put On Hold!', +            'duration': 231.0, +            'upload_date': '20160210', +            'timestamp': 1455087571, +            'thumbnail': 're:^https?://.*\.jpg$' +        }, +    }, { +        'url': 'http://kusi.com/video?clipId=12203019', +        'info_dict': { +            'id': '12203019', +            'ext': 'mp4', +            'title': 'Turko Files: Case Closed! 
& Put On Hold!', +            'duration': 231.0, +            'upload_date': '20160210', +            'timestamp': 1455087571, +            'thumbnail': 're:^https?://.*\.jpg$' +        }, +        'params': { +            'skip_download': True,  # Same as previous one +        }, +    }] + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        clip_id = mobj.group('clipId') +        video_id = clip_id or mobj.group('path') + +        webpage = self._download_webpage(url, video_id) + +        if clip_id is None: +            video_id = clip_id = self._html_search_regex( +                r'"clipId"\s*,\s*"(\d+)"', webpage, 'clip id') + +        affiliate_id = self._search_regex( +            r'affiliateId\s*:\s*\'([^\']+)\'', webpage, 'affiliate id') + +        # See __Packages/worldnow/model/GalleryModel.as of WNGallery.swf +        xml_url = update_url_query('http://www.kusi.com/build.asp', { +            'buildtype': 'buildfeaturexmlrequest', +            'featureType': 'Clip', +            'featureid': clip_id, +            'affiliateno': affiliate_id, +            'clientgroupid': '1', +            'rnd': int(round(random.random() * 1000000)), +        }) + +        doc = self._download_xml(xml_url, video_id) + +        video_title = xpath_text(doc, 'HEADLINE', fatal=True) +        duration = float_or_none(xpath_text(doc, 'DURATION'), scale=1000) +        description = xpath_text(doc, 'ABSTRACT') +        thumbnail = xpath_text(doc, './THUMBNAILIMAGE/FILENAME') +        createtion_time = timeconvert(xpath_text(doc, 'rfc822creationdate')) + +        quality_options = doc.find('{http://search.yahoo.com/mrss/}group').findall('{http://search.yahoo.com/mrss/}content') +        formats = [] +        for quality in quality_options: +            formats.append({ +                'url': compat_urllib_parse_unquote_plus(quality.attrib['url']), +                'height': int_or_none(quality.attrib.get('height')), +                
'width': int_or_none(quality.attrib.get('width')), +                'vbr': float_or_none(quality.attrib.get('bitratebits'), scale=1000), +            }) +        self._sort_formats(formats) + +        return { +            'id': video_id, +            'title': video_title, +            'description': description, +            'duration': duration, +            'formats': formats, +            'thumbnail': thumbnail, +            'timestamp': createtion_time, +        } diff --git a/youtube_dl/extractor/kuwo.py b/youtube_dl/extractor/kuwo.py index 700e44b63..f94804d06 100644 --- a/youtube_dl/extractor/kuwo.py +++ b/youtube_dl/extractor/kuwo.py @@ -23,7 +23,7 @@ class KuwoBaseIE(InfoExtractor):          {'format': 'aac', 'ext': 'aac', 'abr': 48, 'preference': 10}      ] -    def _get_formats(self, song_id): +    def _get_formats(self, song_id, tolerate_ip_deny=False):          formats = []          for file_format in self._FORMATS:              song_url = self._download_webpage( @@ -32,7 +32,7 @@ class KuwoBaseIE(InfoExtractor):                  song_id, note='Download %s url info' % file_format['format'],              ) -            if song_url == 'IPDeny': +            if song_url == 'IPDeny' and not tolerate_ip_deny:                  raise ExtractorError('This song is blocked in this region', expected=True)              if song_url.startswith('http://') or song_url.startswith('https://'): @@ -43,7 +43,12 @@ class KuwoBaseIE(InfoExtractor):                      'preference': file_format['preference'],                      'abr': file_format.get('abr'),                  }) -        self._sort_formats(formats) + +        # XXX _sort_formats fails if there are not formats, while it's not the +        # desired behavior if 'IPDeny' is ignored +        # This check can be removed if https://github.com/rg3/youtube-dl/pull/8051 is merged +        if not tolerate_ip_deny: +            self._sort_formats(formats)          return formats @@ -288,10 +293,16 @@ class 
KuwoMvIE(KuwoBaseIE):          'url': 'http://www.kuwo.cn/mv/6480076/',          'info_dict': {              'id': '6480076', -            'ext': 'mkv', -            'title': '我们家MV', +            'ext': 'mp4', +            'title': 'My HouseMV',              'creator': '2PM',          }, +        # In this video, music URLs (anti.s) are blocked outside China and +        # USA, while the MV URL (mvurl) is available globally, so force the MV +        # URL for consistent results in different countries +        'params': { +            'format': 'mv', +        },      }      _FORMATS = KuwoBaseIE._FORMATS + [          {'format': 'mkv', 'ext': 'mkv', 'preference': 250}, @@ -313,7 +324,17 @@ class KuwoMvIE(KuwoBaseIE):          else:              raise ExtractorError('Unable to find song or singer names') -        formats = self._get_formats(song_id) +        formats = self._get_formats(song_id, tolerate_ip_deny=True) + +        mv_url = self._download_webpage( +            'http://www.kuwo.cn/yy/st/mvurl?rid=MUSIC_%s' % song_id, +            song_id, note='Download %s MV URL' % song_id) +        formats.append({ +            'url': mv_url, +            'format_id': 'mv', +        }) + +        self._sort_formats(formats)          return {              'id': song_id, diff --git a/youtube_dl/extractor/leeco.py b/youtube_dl/extractor/leeco.py index d0cd3f591..df47e88ba 100644 --- a/youtube_dl/extractor/leeco.py +++ b/youtube_dl/extractor/leeco.py @@ -217,14 +217,8 @@ class LePlaylistIE(InfoExtractor):          'playlist_mincount': 96      }, {          'url': 'http://tv.le.com/pzt/lswjzzjc/index.shtml', -        'info_dict': { -            'id': 'lswjzzjc', -            # The title should be "劲舞青春", but I can't find a simple way to -            # determine the playlist title -            'title': '乐视午间自制剧场', -            'description': 'md5:b1eef244f45589a7b5b1af9ff25a4489' -        }, -        'playlist_mincount': 7 +        # This series is moved to 
http://www.le.com/tv/10005297.html +        'only_matching': True,      }, {          'url': 'http://www.le.com/comic/92063.html',          'only_matching': True, @@ -338,7 +332,7 @@ class LetvCloudIE(InfoExtractor):              formats.append({                  'url': url,                  'ext': determine_ext(decoded_url), -                'format_id': int_or_none(play_url.get('vtype')), +                'format_id': str_or_none(play_url.get('vtype')),                  'format_note': str_or_none(play_url.get('definition')),                  'width': int_or_none(play_url.get('vwidth')),                  'height': int_or_none(play_url.get('vheight')), diff --git a/youtube_dl/extractor/livestream.py b/youtube_dl/extractor/livestream.py index 38fb3d9e4..eada7c299 100644 --- a/youtube_dl/extractor/livestream.py +++ b/youtube_dl/extractor/livestream.py @@ -14,6 +14,7 @@ from ..utils import (      xpath_with_ns,      xpath_text,      orderedSet, +    update_url_query,      int_or_none,      float_or_none,      parse_iso8601, @@ -64,7 +65,7 @@ class LivestreamIE(InfoExtractor):      def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):          base_ele = find_xpath_attr(              smil, self._xpath_ns('.//meta', namespace), 'name', 'httpBase') -        base = base_ele.get('content') if base_ele else 'http://livestreamvod-f.akamaihd.net/' +        base = base_ele.get('content') if base_ele is not None else 'http://livestreamvod-f.akamaihd.net/'          formats = []          video_nodes = smil.findall(self._xpath_ns('.//video', namespace)) @@ -72,7 +73,10 @@ class LivestreamIE(InfoExtractor):          for vn in video_nodes:              tbr = int_or_none(vn.attrib.get('system-bitrate'), 1000)              furl = ( -                '%s%s?v=3.0.3&fp=WIN%%2014,0,0,145' % (base, vn.attrib['src'])) +                update_url_query(compat_urlparse.urljoin(base, vn.attrib['src']), { +                    'v': 
'3.0.3', +                    'fp': 'WIN% 14,0,0,145', +                }))              if 'clipBegin' in vn.attrib:                  furl += '&ssek=' + vn.attrib['clipBegin']              formats.append({ diff --git a/youtube_dl/extractor/makerschannel.py b/youtube_dl/extractor/makerschannel.py new file mode 100644 index 000000000..f5d00e61d --- /dev/null +++ b/youtube_dl/extractor/makerschannel.py @@ -0,0 +1,40 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class MakersChannelIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?makerschannel\.com/.*(?P<id_type>video|production)_id=(?P<id>[0-9]+)' +    _TEST = { +        'url': 'http://makerschannel.com/en/zoomin/community-highlights?video_id=849', +        'md5': '624a512c6969236b5967bf9286345ad1', +        'info_dict': { +            'id': '849', +            'ext': 'mp4', +            'title': 'Landing a bus on a plane is an epic win', +            'uploader': 'ZoomIn', +            'description': 'md5:cd9cca2ea7b69b78be81d07020c97139', +        } +    } + +    def _real_extract(self, url): +        id_type, url_id = re.match(self._VALID_URL, url).groups() +        webpage = self._download_webpage(url, url_id) +        video_data = self._html_search_regex(r'<div([^>]+data-%s-id="%s"[^>]+)>' % (id_type, url_id), webpage, 'video data') + +        def extract_data_val(attr, fatal=False): +            return self._html_search_regex(r'data-%s\s*=\s*"([^"]+)"' % attr, video_data, attr, fatal=fatal) +        minoto_id = self._search_regex(r'/id/([a-zA-Z0-9]+)', extract_data_val('video-src', True), 'minoto id') + +        return { +            '_type': 'url_transparent', +            'url': 'minoto:%s' % minoto_id, +            'id': extract_data_val('video-id', True), +            'title': extract_data_val('title', True), +            'description': extract_data_val('description'), +            'thumbnail': extract_data_val('image'), +          
  'uploader': extract_data_val('channel'), +        } diff --git a/youtube_dl/extractor/minoto.py b/youtube_dl/extractor/minoto.py new file mode 100644 index 000000000..959a10589 --- /dev/null +++ b/youtube_dl/extractor/minoto.py @@ -0,0 +1,56 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import int_or_none + + +class MinotoIE(InfoExtractor): +    _VALID_URL = r'(?:minoto:|https?://(?:play|iframe|embed)\.minoto-video\.com/(?P<player_id>[0-9]+)/)(?P<id>[a-zA-Z0-9]+)' + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        player_id = mobj.group('player_id') or '1' +        video_id = mobj.group('id') +        video_data = self._download_json('http://play.minoto-video.com/%s/%s.js' % (player_id, video_id), video_id) +        video_metadata = video_data['video-metadata'] +        formats = [] +        for fmt in video_data['video-files']: +            fmt_url = fmt.get('url') +            if not fmt_url: +                continue +            container = fmt.get('container') +            if container == 'hls': +                formats.extend(fmt_url, video_id, 'mp4', m3u8_id='hls', fatal=False) +            else: +                fmt_profile = fmt.get('profile') or {} +                f = { +                    'format_id': fmt_profile.get('name-short'), +                    'format_note': fmt_profile.get('name'), +                    'url': fmt_url, +                    'container': container, +                    'tbr': int_or_none(fmt.get('bitrate')), +                    'filesize': int_or_none(fmt.get('filesize')), +                    'width': int_or_none(fmt.get('width')), +                    'height': int_or_none(fmt.get('height')), +                } +                codecs = fmt.get('codecs') +                if codecs: +                    codecs = codecs.split(',') +                    if len(codecs) == 2: +                        f.update({ 
+                            'vcodec': codecs[0], +                            'acodec': codecs[1], +                        }) +                formats.append(f) +        self._sort_formats(formats) + +        return { +            'id': video_id, +            'title': video_metadata['title'], +            'description': video_metadata.get('description'), +            'thumbnail': video_metadata.get('video-poster', {}).get('url'), +            'formats': formats, +        } diff --git a/youtube_dl/extractor/mit.py b/youtube_dl/extractor/mit.py index 29ca45778..819c1b90b 100644 --- a/youtube_dl/extractor/mit.py +++ b/youtube_dl/extractor/mit.py @@ -99,7 +99,7 @@ class OCWMITIE(InfoExtractor):              'url': 'http://ocw.mit.edu/courses/electrical-engineering-and-computer-science/6-041-probabilistic-systems-analysis-and-applied-probability-fall-2010/video-lectures/lecture-7-multiple-variables-expectations-independence/',              'info_dict': {                  'id': 'EObHWIEKGjA', -                'ext': 'mp4', +                'ext': 'webm',                  'title': 'Lecture 7: Multiple Discrete Random Variables: Expectations, Conditioning, Independence',                  'description': 'In this lecture, the professor discussed multiple random variables, expectations, and binomial distribution.',                  'upload_date': '20121109', diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index c2b7ed9ab..101497118 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -7,6 +7,7 @@ from ..compat import compat_urllib_parse_unquote  from ..utils import (      ExtractorError,      HEADRequest, +    parse_count,      str_to_int,  ) @@ -85,8 +86,8 @@ class MixcloudIE(InfoExtractor):          uploader_id = self._search_regex(              r'\s+"profile": "([^"]+)",', webpage, 'uploader id', fatal=False)          description = self._og_search_description(webpage) -        like_count = 
str_to_int(self._search_regex( -            r'\bbutton-favorite\b[^>]+m-ajax-toggle-count="([^"]+)"', +        like_count = parse_count(self._search_regex( +            r'\bbutton-favorite[^>]+>.*?<span[^>]+class=["\']toggle-number[^>]+>\s*([^<]+)',              webpage, 'like count', fatal=False))          view_count = str_to_int(self._search_regex(              [r'<meta itemprop="interactionCount" content="UserPlays:([0-9]+)"', diff --git a/youtube_dl/extractor/noz.py b/youtube_dl/extractor/noz.py index 0ffb44b47..656443c49 100644 --- a/youtube_dl/extractor/noz.py +++ b/youtube_dl/extractor/noz.py @@ -5,7 +5,9 @@ from .common import InfoExtractor  from ..compat import compat_urllib_parse_unquote  from ..utils import (      int_or_none, +    find_xpath_attr,      xpath_text, +    update_url_query,  ) @@ -46,17 +48,32 @@ class NozIE(InfoExtractor):              doc, './/article/movie/file/duration'))          formats = []          for qnode in doc.findall('.//article/movie/file/qualities/qual'): -            video_node = qnode.find('./html_urls/video_url[@format="video/mp4"]') -            if video_node is None: -                continue  # auto -            formats.append({ -                'url': video_node.text, -                'format_name': xpath_text(qnode, './name'), -                'format_id': xpath_text(qnode, './id'), -                'height': int_or_none(xpath_text(qnode, './height')), -                'width': int_or_none(xpath_text(qnode, './width')), -                'tbr': int_or_none(xpath_text(qnode, './bitrate'), scale=1000), -            }) +            http_url_ele = find_xpath_attr( +                qnode, './html_urls/video_url', 'format', 'video/mp4') +            http_url = http_url_ele.text if http_url_ele is not None else None +            if http_url: +                formats.append({ +                    'url': http_url, +                    'format_name': xpath_text(qnode, './name'), +                    'format_id': '%s-%s' % 
('http', xpath_text(qnode, './id')), +                    'height': int_or_none(xpath_text(qnode, './height')), +                    'width': int_or_none(xpath_text(qnode, './width')), +                    'tbr': int_or_none(xpath_text(qnode, './bitrate'), scale=1000), +                }) +            else: +                f4m_url = xpath_text(qnode, 'url_hd2') +                if f4m_url: +                    formats.extend(self._extract_f4m_formats( +                        update_url_query(f4m_url, {'hdcore': '3.4.0'}), +                        video_id, f4m_id='hds', fatal=False)) +                m3u8_url_ele = find_xpath_attr( +                    qnode, './html_urls/video_url', +                    'format', 'application/vnd.apple.mpegurl') +                m3u8_url = m3u8_url_ele.text if m3u8_url_ele is not None else None +                if m3u8_url: +                    formats.extend(self._extract_m3u8_formats( +                        m3u8_url, video_id, 'mp4', 'm3u8_native', +                        m3u8_id='hls', fatal=False))          self._sort_formats(formats)          return { diff --git a/youtube_dl/extractor/pyvideo.py b/youtube_dl/extractor/pyvideo.py index 6d5732d45..30a5f2de4 100644 --- a/youtube_dl/extractor/pyvideo.py +++ b/youtube_dl/extractor/pyvideo.py @@ -12,14 +12,14 @@ class PyvideoIE(InfoExtractor):      _TESTS = [          {              'url': 'http://pyvideo.org/video/1737/become-a-logging-expert-in-30-minutes', -            'md5': 'de317418c8bc76b1fd8633e4f32acbc6', +            'md5': '520915673e53a5c5d487c36e0c4d85b5',              'info_dict': {                  'id': '24_4WWkSmNo', -                'ext': 'mp4', +                'ext': 'webm',                  'title': 'Become a logging expert in 30 minutes',                  'description': 'md5:9665350d466c67fb5b1598de379021f7',                  'upload_date': '20130320', -                'uploader': 'NextDayVideo', +                'uploader': 'Next Day Video',             
     'uploader_id': 'NextDayVideo',              },              'add_ie': ['Youtube'], diff --git a/youtube_dl/extractor/revision3.py b/youtube_dl/extractor/revision3.py index b1b8800b9..99979ebe1 100644 --- a/youtube_dl/extractor/revision3.py +++ b/youtube_dl/extractor/revision3.py @@ -19,7 +19,7 @@ class Revision3IE(InfoExtractor):          'url': 'http://www.revision3.com/technobuffalo/5-google-predictions-for-2016',          'md5': 'd94a72d85d0a829766de4deb8daaf7df',          'info_dict': { -            'id': '73034', +            'id': '71089',              'display_id': 'technobuffalo/5-google-predictions-for-2016',              'ext': 'webm',              'title': '5 Google Predictions for 2016', @@ -31,6 +31,7 @@ class Revision3IE(InfoExtractor):              'uploader_id': 'technobuffalo',          }      }, { +        # Show          'url': 'http://testtube.com/brainstuff',          'info_dict': {              'id': '251', @@ -41,7 +42,7 @@ class Revision3IE(InfoExtractor):      }, {          'url': 'https://testtube.com/dnews/5-weird-ways-plants-can-eat-animals?utm_source=FB&utm_medium=DNews&utm_campaign=DNewsSocial',          'info_dict': { -            'id': '60163', +            'id': '58227',              'display_id': 'dnews/5-weird-ways-plants-can-eat-animals',              'duration': 275,              'ext': 'webm', @@ -52,18 +53,72 @@ class Revision3IE(InfoExtractor):              'uploader': 'DNews',              'uploader_id': 'dnews',          }, +    }, { +        'url': 'http://testtube.com/tt-editors-picks/the-israel-palestine-conflict-explained-in-ten-min', +        'info_dict': { +            'id': '71618', +            'ext': 'mp4', +            'display_id': 'tt-editors-picks/the-israel-palestine-conflict-explained-in-ten-min', +            'title': 'The Israel-Palestine Conflict Explained in Ten Minutes', +            'description': 'If you\'d like to learn about the struggle between Israelis and Palestinians, this video is a great 
place to start', +            'uploader': 'Editors\' Picks', +            'uploader_id': 'tt-editors-picks', +            'timestamp': 1453309200, +            'upload_date': '20160120', +        }, +        'add_ie': ['Youtube'], +    }, { +        # Tag +        'url': 'http://testtube.com/tech-news', +        'info_dict': { +            'id': '21018', +            'title': 'tech news', +        }, +        'playlist_mincount': 9,      }]      _PAGE_DATA_TEMPLATE = 'http://www.%s/apiProxy/ddn/%s?domain=%s'      _API_KEY = 'ba9c741bce1b9d8e3defcc22193f3651b8867e62'      def _real_extract(self, url):          domain, display_id = re.match(self._VALID_URL, url).groups() +        site = domain.split('.')[0]          page_info = self._download_json(              self._PAGE_DATA_TEMPLATE % (domain, display_id, domain), display_id) -        if page_info['data']['type'] == 'episode': -            episode_data = page_info['data'] -            video_id = compat_str(episode_data['video']['data']['id']) +        page_data = page_info['data'] +        page_type = page_data['type'] +        if page_type in ('episode', 'embed'): +            show_data = page_data['show']['data'] +            page_id = compat_str(page_data['id']) +            video_id = compat_str(page_data['video']['data']['id']) + +            preference = qualities(['mini', 'small', 'medium', 'large']) +            thumbnails = [{ +                'url': image_url, +                'id': image_id, +                'preference': preference(image_id) +            } for image_id, image_url in page_data.get('images', {}).items()] + +            info = { +                'id': page_id, +                'display_id': display_id, +                'title': unescapeHTML(page_data['name']), +                'description': unescapeHTML(page_data.get('summary')), +                'timestamp': parse_iso8601(page_data.get('publishTime'), ' '), +                'author': page_data.get('author'), +                'uploader': 
show_data.get('name'), +                'uploader_id': show_data.get('slug'), +                'thumbnails': thumbnails, +                'extractor_key': site, +            } + +            if page_type == 'embed': +                info.update({ +                    '_type': 'url_transparent', +                    'url': page_data['video']['data']['embed'], +                }) +                return info +              video_data = self._download_json(                  'http://revision3.com/api/getPlaylist.json?api_key=%s&codecs=h264,vp8,theora&video_id=%s' % (self._API_KEY, video_id),                  video_id)['items'][0] @@ -84,36 +139,30 @@ class Revision3IE(InfoExtractor):                          })              self._sort_formats(formats) -            preference = qualities(['mini', 'small', 'medium', 'large']) -            thumbnails = [{ -                'url': image_url, -                'id': image_id, -                'preference': preference(image_id) -            } for image_id, image_url in video_data.get('images', {}).items()] - -            return { -                'id': video_id, -                'display_id': display_id, +            info.update({                  'title': unescapeHTML(video_data['title']),                  'description': unescapeHTML(video_data.get('summary')), -                'timestamp': parse_iso8601(episode_data.get('publishTime'), ' '), -                'author': episode_data.get('author'),                  'uploader': video_data.get('show', {}).get('name'),                  'uploader_id': video_data.get('show', {}).get('slug'),                  'duration': int_or_none(video_data.get('duration')), -                'thumbnails': thumbnails,                  'formats': formats, -            } +            }) +            return info          else: -            show_data = page_info['show']['data'] +            list_data = page_info[page_type]['data']              episodes_data = page_info['episodes']['data']              
num_episodes = page_info['meta']['totalEpisodes']              processed_episodes = 0              entries = []              page_num = 1              while True: -                entries.extend([self.url_result( -                    'http://%s/%s/%s' % (domain, display_id, episode['slug'])) for episode in episodes_data]) +                entries.extend([{ +                    '_type': 'url', +                    'url': 'http://%s%s' % (domain, episode['path']), +                    'id': compat_str(episode['id']), +                    'ie_key': 'Revision3', +                    'extractor_key': site, +                } for episode in episodes_data])                  processed_episodes += len(episodes_data)                  if processed_episodes == num_episodes:                      break @@ -123,5 +172,5 @@ class Revision3IE(InfoExtractor):                      display_id)['episodes']['data']              return self.playlist_result( -                entries, compat_str(show_data['id']), -                show_data.get('name'), show_data.get('summary')) +                entries, compat_str(list_data['id']), +                list_data.get('name'), list_data.get('summary')) diff --git a/youtube_dl/extractor/rice.py b/youtube_dl/extractor/rice.py new file mode 100644 index 000000000..f855719ac --- /dev/null +++ b/youtube_dl/extractor/rice.py @@ -0,0 +1,116 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_parse_qs +from ..utils import ( +    xpath_text, +    xpath_element, +    int_or_none, +    parse_iso8601, +    ExtractorError, +) + + +class RICEIE(InfoExtractor): +    _VALID_URL = r'https?://mediahub\.rice\.edu/app/[Pp]ortal/video\.aspx\?(?P<query>.+)' +    _TEST = { +        'url': 'https://mediahub.rice.edu/app/Portal/video.aspx?PortalID=25ffd62c-3d01-4b29-8c70-7c94270efb3e&DestinationID=66bc9434-03bd-4725-b47e-c659d8d809db&ContentID=YEWIvbhb40aqdjMD1ALSqw', +        'md5': 
'9b83b4a2eead4912dc3b7fac7c449b6a', +        'info_dict': { +            'id': 'YEWIvbhb40aqdjMD1ALSqw', +            'ext': 'mp4', +            'title': 'Active Learning in Archeology', +            'upload_date': '20140616', +            'timestamp': 1402926346, +        } +    } +    _NS = 'http://schemas.datacontract.org/2004/07/ensembleVideo.Data.Service.Contracts.Models.Player.Config' + +    def _real_extract(self, url): +        qs = compat_parse_qs(re.match(self._VALID_URL, url).group('query')) +        if not qs.get('PortalID') or not qs.get('DestinationID') or not qs.get('ContentID'): +            raise ExtractorError('Invalid URL', expected=True) + +        portal_id = qs['PortalID'][0] +        playlist_id = qs['DestinationID'][0] +        content_id = qs['ContentID'][0] + +        content_data = self._download_xml('https://mediahub.rice.edu/api/portal/GetContentTitle', content_id, query={ +            'portalId': portal_id, +            'playlistId': playlist_id, +            'contentId': content_id +        }) +        metadata = xpath_element(content_data, './/metaData', fatal=True) +        title = xpath_text(metadata, 'primaryTitle', fatal=True) +        encodings = xpath_element(content_data, './/encodings', fatal=True) +        player_data = self._download_xml('https://mediahub.rice.edu/api/player/GetPlayerConfig', content_id, query={ +            'temporaryLinkId': xpath_text(encodings, 'temporaryLinkId', fatal=True), +            'contentId': content_id, +        }) + +        common_fmt = {} +        dimensions = xpath_text(encodings, 'dimensions') +        if dimensions: +            wh = dimensions.split('x') +            if len(wh) == 2: +                common_fmt.update({ +                    'width': int_or_none(wh[0]), +                    'height': int_or_none(wh[1]), +                }) + +        formats = [] +        rtsp_path = xpath_text(player_data, self._xpath_ns('RtspPath', self._NS)) +        if rtsp_path: +            fmt = { 
+                'url': rtsp_path, +                'format_id': 'rtsp', +            } +            fmt.update(common_fmt) +            formats.append(fmt) +        for source in player_data.findall(self._xpath_ns('.//Source', self._NS)): +            video_url = xpath_text(source, self._xpath_ns('File', self._NS)) +            if not video_url: +                continue +            if '.m3u8' in video_url: +                formats.extend(self._extract_m3u8_formats(video_url, content_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) +            else: +                fmt = { +                    'url': video_url, +                    'format_id': video_url.split(':')[0], +                } +                fmt.update(common_fmt) +                rtmp = re.search(r'^(?P<url>rtmp://[^/]+/(?P<app>.+))/(?P<playpath>mp4:.+)$', video_url) +                if rtmp: +                    fmt.update({ +                        'url': rtmp.group('url'), +                        'play_path': rtmp.group('playpath'), +                        'app': rtmp.group('app'), +                        'ext': 'flv', +                    }) +                formats.append(fmt) +        self._sort_formats(formats) + +        thumbnails = [] +        for content_asset in content_data.findall('.//contentAssets'): +            asset_type = xpath_text(content_asset, 'type') +            if asset_type == 'image': +                image_url = xpath_text(content_asset, 'httpPath') +                if not image_url: +                    continue +                thumbnails.append({ +                    'id': xpath_text(content_asset, 'ID'), +                    'url': image_url, +                }) + +        return { +            'id': content_id, +            'title': title, +            'description': xpath_text(metadata, 'abstract'), +            'duration': int_or_none(xpath_text(metadata, 'duration')), +            'timestamp': parse_iso8601(xpath_text(metadata, 'dateUpdated')), +       
     'thumbnails': thumbnails, +            'formats': formats, +        } diff --git a/youtube_dl/extractor/safari.py b/youtube_dl/extractor/safari.py index 7de7b7273..256396bb8 100644 --- a/youtube_dl/extractor/safari.py +++ b/youtube_dl/extractor/safari.py @@ -4,14 +4,13 @@ from __future__ import unicode_literals  import re  from .common import InfoExtractor -from .brightcove import BrightcoveLegacyIE  from ..utils import (      ExtractorError,      sanitized_Request, -    smuggle_url,      std_headers,      urlencode_postdata, +    update_url_query,  ) @@ -20,28 +19,30 @@ class SafariBaseIE(InfoExtractor):      _SUCCESSFUL_LOGIN_REGEX = r'<a href="/accounts/logout/"[^>]*>Sign Out</a>'      _NETRC_MACHINE = 'safari' -    _API_BASE = 'https://www.safaribooksonline.com/api/v1/book' +    _API_BASE = 'https://www.safaribooksonline.com/api/v1'      _API_FORMAT = 'json'      LOGGED_IN = False      def _real_initialize(self): -        # We only need to log in once for courses or individual videos -        if not self.LOGGED_IN: -            self._login() -            SafariBaseIE.LOGGED_IN = True +        self._login()      def _login(self): +        # We only need to log in once for courses or individual videos +        if self.LOGGED_IN: +            return +          (username, password) = self._get_login_info()          if username is None: -            self.raise_login_required('safaribooksonline.com account is required') +            return -        headers = std_headers +        headers = std_headers.copy()          if 'Referer' not in headers:              headers['Referer'] = self._LOGIN_URL +        login_page_request = sanitized_Request(self._LOGIN_URL, headers=headers)          login_page = self._download_webpage( -            self._LOGIN_URL, None, +            login_page_request, None,              'Downloading login form')          csrf = self._html_search_regex( @@ -66,6 +67,8 @@ class SafariBaseIE(InfoExtractor):                  'Login failed; make 
sure your credentials are correct and try again.',                  expected=True) +        SafariBaseIE.LOGGED_IN = True +          self.to_screen('Login successful') @@ -85,13 +88,15 @@ class SafariIE(SafariBaseIE):      _TESTS = [{          'url': 'https://www.safaribooksonline.com/library/view/hadoop-fundamentals-livelessons/9780133392838/part00.html', -        'md5': '5b0c4cc1b3c1ba15dda7344085aa5592', +        'md5': 'dcc5a425e79f2564148652616af1f2a3',          'info_dict': { -            'id': '2842601850001', +            'id': '0_qbqx90ic',              'ext': 'mp4', -            'title': 'Introduction', +            'title': 'Introduction to Hadoop Fundamentals LiveLessons', +            'timestamp': 1437758058, +            'upload_date': '20150724', +            'uploader_id': 'stork',          }, -        'skip': 'Requires safaribooksonline account credentials',      }, {          'url': 'https://www.safaribooksonline.com/api/v1/book/9780133392838/chapter/part00.html',          'only_matching': True, @@ -106,15 +111,30 @@ class SafariIE(SafariBaseIE):          course_id = mobj.group('course_id')          part = mobj.group('part') -        webpage = self._download_webpage( -            '%s/%s/chapter-content/%s.html' % (self._API_BASE, course_id, part), -            part) +        webpage = self._download_webpage(url, '%s/%s' % (course_id, part)) +        reference_id = self._search_regex(r'data-reference-id="([^"]+)"', webpage, 'kaltura reference id') +        partner_id = self._search_regex(r'data-partner-id="([^"]+)"', webpage, 'kaltura widget id') +        ui_id = self._search_regex(r'data-ui-id="([^"]+)"', webpage, 'kaltura uiconf id') + +        query = { +            'wid': '_%s' % partner_id, +            'uiconf_id': ui_id, +            'flashvars[referenceId]': reference_id, +        } -        bc_url = BrightcoveLegacyIE._extract_brightcove_url(webpage) -        if not bc_url: -            raise ExtractorError('Could not extract Brightcove 
URL from %s' % url, expected=True) +        if self.LOGGED_IN: +            kaltura_session = self._download_json( +                '%s/player/kaltura_session/?reference_id=%s' % (self._API_BASE, reference_id), +                course_id, 'Downloading kaltura session JSON', +                'Unable to download kaltura session JSON', fatal=False) +            if kaltura_session: +                session = kaltura_session.get('session') +                if session: +                    query['flashvars[ks]'] = session -        return self.url_result(smuggle_url(bc_url, {'Referer': url}), 'BrightcoveLegacy') +        return self.url_result(update_url_query( +            'https://cdnapisec.kaltura.com/html5/html5lib/v2.37.1/mwEmbedFrame.php', query), +            'Kaltura')  class SafariCourseIE(SafariBaseIE): @@ -140,7 +160,7 @@ class SafariCourseIE(SafariBaseIE):          course_id = self._match_id(url)          course_json = self._download_json( -            '%s/%s/?override_format=%s' % (self._API_BASE, course_id, self._API_FORMAT), +            '%s/book/%s/?override_format=%s' % (self._API_BASE, course_id, self._API_FORMAT),              course_id, 'Downloading course JSON')          if 'chapters' not in course_json: diff --git a/youtube_dl/extractor/sexu.py b/youtube_dl/extractor/sexu.py index 6365a8779..a99b2a8e7 100644 --- a/youtube_dl/extractor/sexu.py +++ b/youtube_dl/extractor/sexu.py @@ -1,7 +1,5 @@  from __future__ import unicode_literals -import re -  from .common import InfoExtractor @@ -14,7 +12,7 @@ class SexuIE(InfoExtractor):              'id': '961791',              'ext': 'mp4',              'title': 'md5:4d05a19a5fc049a63dbbaf05fb71d91b', -            'description': 'md5:c5ed8625eb386855d5a7967bd7b77a54', +            'description': 'md5:2b75327061310a3afb3fbd7d09e2e403',              'categories': list,  # NSFW              'thumbnail': 're:https?://.*\.jpg$',              'age_limit': 18, @@ -25,13 +23,18 @@ class SexuIE(InfoExtractor):          
video_id = self._match_id(url)          webpage = self._download_webpage(url, video_id) -        quality_arr = self._search_regex( -            r'sources:\s*\[([^\]]+)\]', webpage, 'forrmat string') +        jwvideo = self._parse_json( +            self._search_regex(r'\.setup\(\s*({.+?})\s*\);', webpage, 'jwvideo'), +            video_id) + +        sources = jwvideo['sources'] +          formats = [{ -            'url': fmt[0].replace('\\', ''), -            'format_id': fmt[1], -            'height': int(fmt[1][:3]), -        } for fmt in re.findall(r'"file":"([^"]+)","label":"([^"]+)"', quality_arr)] +            'url': source['file'].replace('\\', ''), +            'format_id': source.get('label'), +            'height': self._search_regex( +                r'^(\d+)[pP]', source.get('label', ''), 'height', default=None), +        } for source in sources if source.get('file')]          self._sort_formats(formats)          title = self._html_search_regex( @@ -40,9 +43,7 @@ class SexuIE(InfoExtractor):          description = self._html_search_meta(              'description', webpage, 'description') -        thumbnail = self._html_search_regex( -            r'image:\s*"([^"]+)"', -            webpage, 'thumbnail', fatal=False) +        thumbnail = jwvideo.get('image')          categories_str = self._html_search_meta(              'keywords', webpage, 'categories') diff --git a/youtube_dl/extractor/svt.py b/youtube_dl/extractor/svt.py index 399c3b8ee..2ab30e45f 100644 --- a/youtube_dl/extractor/svt.py +++ b/youtube_dl/extractor/svt.py @@ -19,20 +19,25 @@ class SVTBaseIE(InfoExtractor):          video_info = info['video']          formats = []          for vr in video_info['videoReferences']: +            player_type = vr.get('playerType')              vurl = vr['url']              ext = determine_ext(vurl)              if ext == 'm3u8':                  formats.extend(self._extract_m3u8_formats(                      vurl, video_id,                      ext='mp4', 
entry_protocol='m3u8_native', -                    m3u8_id=vr.get('playerType'))) +                    m3u8_id=player_type, fatal=False))              elif ext == 'f4m':                  formats.extend(self._extract_f4m_formats(                      vurl + '?hdcore=3.3.0', video_id, -                    f4m_id=vr.get('playerType'))) +                    f4m_id=player_type, fatal=False)) +            elif ext == 'mpd': +                if player_type == 'dashhbbtv': +                    formats.extend(self._extract_mpd_formats( +                        vurl, video_id, mpd_id=player_type, fatal=False))              else:                  formats.append({ -                    'format_id': vr.get('playerType'), +                    'format_id': player_type,                      'url': vurl,                  })          self._sort_formats(formats) diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py index a48d77c30..cf8851438 100644 --- a/youtube_dl/extractor/ted.py +++ b/youtube_dl/extractor/ted.py @@ -73,7 +73,7 @@ class TEDIE(InfoExtractor):          'add_ie': ['Youtube'],          'info_dict': {              'id': '_ZG8HBuDjgc', -            'ext': 'mp4', +            'ext': 'webm',              'title': 'Douglas Adams: Parrots the Universe and Everything',              'description': 'md5:01ad1e199c49ac640cb1196c0e9016af',              'uploader': 'University of California Television (UCTV)', diff --git a/youtube_dl/extractor/thestar.py b/youtube_dl/extractor/thestar.py new file mode 100644 index 000000000..b7e9af2af --- /dev/null +++ b/youtube_dl/extractor/thestar.py @@ -0,0 +1,31 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from .brightcove import BrightcoveLegacyIE +from ..compat import compat_parse_qs + + +class TheStarIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?thestar\.com/(?:[^/]+/)*(?P<id>.+)\.html' +    _TEST = { +        'url': 
'http://www.thestar.com/life/2016/02/01/mankind-why-this-woman-started-a-men-s-skincare-line.html', +        'md5': '2c62dd4db2027e35579fefb97a8b6554', +        'info_dict': { +            'id': '4732393888001', +            'ext': 'mp4', +            'title': 'Mankind: Why this woman started a men\'s skin care line', +            'description': 'Robert Cribb talks to Young Lee, the founder of Uncle Peter\'s MAN.', +            'uploader_id': '794267642001', +            'timestamp': 1454353482, +            'upload_date': '20160201', +        } +    } +    BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/794267642001/default_default/index.html?videoId=%s' + +    def _real_extract(self, url): +        display_id = self._match_id(url) +        webpage = self._download_webpage(url, display_id) +        brightcove_legacy_url = BrightcoveLegacyIE._extract_brightcove_url(webpage) +        brightcove_id = compat_parse_qs(brightcove_legacy_url)['@videoPlayer'][0] +        return self.url_result(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', brightcove_id) diff --git a/youtube_dl/extractor/tlc.py b/youtube_dl/extractor/tlc.py index adc05ed5f..17add9543 100644 --- a/youtube_dl/extractor/tlc.py +++ b/youtube_dl/extractor/tlc.py @@ -4,12 +4,12 @@ import re  from .common import InfoExtractor  from .brightcove import BrightcoveLegacyIE -from ..compat import compat_urlparse +from ..compat import compat_parse_qs  class TlcDeIE(InfoExtractor):      IE_NAME = 'tlc.de' -    _VALID_URL = r'http://www\.tlc\.de/sendungen/[^/]+/videos/(?P<title>[^/?]+)' +    _VALID_URL = r'http://www\.tlc\.de/(?:[^/]+/)*videos/(?P<title>[^/?#]+)?(?:.*#(?P<id>\d+))?'      
_TEST = {          'url': 'http://www.tlc.de/sendungen/breaking-amish/videos/#3235167922001', @@ -17,32 +17,23 @@ class TlcDeIE(InfoExtractor):              'id': '3235167922001',              'ext': 'mp4',              'title': 'Breaking Amish: Die Welt da draußen', -            'uploader': 'Discovery Networks - Germany',              'description': (                  'Vier Amische und eine Mennonitin wagen in New York'                  '  den Sprung in ein komplett anderes Leben. Begleitet sie auf'                  ' ihrem spannenden Weg.'), +            'timestamp': 1396598084, +            'upload_date': '20140404', +            'uploader_id': '1659832546',          },      } +    BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1659832546/default_default/index.html?videoId=%s'      def _real_extract(self, url):          mobj = re.match(self._VALID_URL, url) -        title = mobj.group('title') -        webpage = self._download_webpage(url, title) -        iframe_url = self._search_regex( -            '<iframe src="(http://www\.tlc\.de/wp-content/.+?)"', webpage, -            'iframe url') -        # Otherwise we don't get the correct 'BrightcoveExperience' element, -        # example: http://www.tlc.de/sendungen/cake-boss/videos/cake-boss-cannoli-drama/ -        iframe_url = iframe_url.replace('.htm?', '.php?') -        url_fragment = compat_urlparse.urlparse(url).fragment -        if url_fragment: -            # Since the fragment is not send to the server, we always get the same iframe -            iframe_url = re.sub(r'playlist=(\d+)', 'playlist=%s' % url_fragment, iframe_url) -        iframe = self._download_webpage(iframe_url, title) - -        return { -            '_type': 'url', -            'url': BrightcoveLegacyIE._extract_brightcove_url(iframe), -            'ie': BrightcoveLegacyIE.ie_key(), -        } +        brightcove_id = mobj.group('id') +        if not brightcove_id: +            title = mobj.group('title') +            webpage = 
self._download_webpage(url, title) +            brightcove_legacy_url = BrightcoveLegacyIE._extract_brightcove_url(webpage) +            brightcove_id = compat_parse_qs(brightcove_legacy_url)['@videoPlayer'][0] +        return self.url_result(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', brightcove_id) diff --git a/youtube_dl/extractor/tv3.py b/youtube_dl/extractor/tv3.py new file mode 100644 index 000000000..d3f690dc7 --- /dev/null +++ b/youtube_dl/extractor/tv3.py @@ -0,0 +1,33 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class TV3IE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?tv3\.co\.nz/(?P<id>[^/]+)/tabid/\d+/articleID/\d+/MCat/\d+/Default\.aspx' +    _TEST = { +        'url': 'http://www.tv3.co.nz/MOTORSPORT-SRS-SsangYong-Hampton-Downs-Round-3/tabid/3692/articleID/121615/MCat/2915/Default.aspx', +        'info_dict': { +            'id': '4659127992001', +            'ext': 'mp4', +            'title': 'CRC Motorsport: SRS SsangYong Hampton Downs Round 3 - S2015 Ep3', +            'description': 'SsangYong Racing Series returns for Round 3 with drivers from New Zealand and Australia taking to the grid at Hampton Downs raceway.', +            'uploader_id': '3812193411001', +            'upload_date': '20151213', +            'timestamp': 1449975272, +        }, +        'expected_warnings': [ +            'Failed to download MPD manifest' +        ], +        'params': { +            'skip_download': True, +        }, +    } +    BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/3812193411001/default_default/index.html?videoId=%s' + +    def _real_extract(self, url): +        display_id = self._match_id(url) +        webpage = self._download_webpage(url, display_id) +        brightcove_id = self._search_regex(r'<param\s*name="@videoPlayer"\s*value="(\d+)"', webpage, 'brightcove id') +        return self.url_result(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 
'BrightcoveNew', brightcove_id) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index 5b8586097..d4169ec6d 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -17,6 +17,7 @@ from ..utils import (      encode_dict,      ExtractorError,      int_or_none, +    orderedSet,      parse_duration,      parse_iso8601,      sanitized_Request, @@ -251,6 +252,7 @@ class TwitchVodIE(TwitchItemBaseIE):                  self._USHER_BASE, item_id,                  compat_urllib_parse.urlencode({                      'allow_source': 'true', +                    'allow_audio_only': 'true',                      'allow_spectre': 'true',                      'player': 'twitchweb',                      'nauth': access_token['token'], @@ -281,17 +283,37 @@ class TwitchPlaylistBaseIE(TwitchBaseIE):          entries = []          offset = 0          limit = self._PAGE_LIMIT +        broken_paging_detected = False +        counter_override = None          for counter in itertools.count(1):              response = self._download_json(                  self._PLAYLIST_URL % (channel_id, offset, limit), -                channel_id, 'Downloading %s videos JSON page %d' % (self._PLAYLIST_TYPE, counter)) +                channel_id, +                'Downloading %s videos JSON page %s' +                % (self._PLAYLIST_TYPE, counter_override or counter))              page_entries = self._extract_playlist_page(response)              if not page_entries:                  break +            total = int_or_none(response.get('_total')) +            # Since the beginning of March 2016 twitch's paging mechanism +            # is completely broken on the twitch side. It simply ignores +            # a limit and returns the whole offset number of videos. +            # Working around by just requesting all videos at once. +            # Upd: pagination bug was fixed by twitch on 15.03.2016. 
+            if not broken_paging_detected and total and len(page_entries) > limit: +                self.report_warning( +                    'Twitch pagination is broken on twitch side, requesting all videos at once', +                    channel_id) +                broken_paging_detected = True +                offset = total +                counter_override = '(all at once)' +                continue              entries.extend(page_entries) +            if broken_paging_detected or total and len(page_entries) >= total: +                break              offset += limit          return self.playlist_result( -            [self.url_result(entry) for entry in set(entries)], +            [self.url_result(entry) for entry in orderedSet(entries)],              channel_id, channel_name)      def _extract_playlist_page(self, response): @@ -303,7 +325,6 @@ class TwitchPlaylistBaseIE(TwitchBaseIE):  class TwitchProfileIE(TwitchPlaylistBaseIE): -    _WORKING = False      IE_NAME = 'twitch:profile'      _VALID_URL = r'%s/(?P<id>[^/]+)/profile/?(?:\#.*)?$' % TwitchBaseIE._VALID_URL_BASE      _PLAYLIST_TYPE = 'profile' @@ -319,7 +340,6 @@ class TwitchProfileIE(TwitchPlaylistBaseIE):  class TwitchPastBroadcastsIE(TwitchPlaylistBaseIE): -    _WORKING = False      IE_NAME = 'twitch:past_broadcasts'      _VALID_URL = r'%s/(?P<id>[^/]+)/profile/past_broadcasts/?(?:\#.*)?$' % TwitchBaseIE._VALID_URL_BASE      _PLAYLIST_URL = TwitchPlaylistBaseIE._PLAYLIST_URL + '&broadcasts=true' @@ -336,7 +356,6 @@ class TwitchPastBroadcastsIE(TwitchPlaylistBaseIE):  class TwitchBookmarksIE(TwitchPlaylistBaseIE): -    _WORKING = False      IE_NAME = 'twitch:bookmarks'      _VALID_URL = r'%s/(?P<id>[^/]+)/profile/bookmarks/?(?:\#.*)?$' % TwitchBaseIE._VALID_URL_BASE      _PLAYLIST_URL = '%s/api/bookmark/?user=%%s&offset=%%d&limit=%%d' % TwitchBaseIE._API_BASE @@ -414,6 +433,7 @@ class TwitchStreamIE(TwitchBaseIE):          query = {              'allow_source': 'true', +            
'allow_audio_only': 'true',              'p': random.randint(1000000, 10000000),              'player': 'twitchweb',              'segment_preference': '4', diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index 67762a003..e70b2ab3c 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -102,6 +102,14 @@ class TwitterCardIE(TwitterBaseIE):              r'data-(?:player-)?config="([^"]+)"', webpage, 'data player config'),              video_id) +        def _search_dimensions_in_video_url(a_format, video_url): +            m = re.search(r'/(?P<width>\d+)x(?P<height>\d+)/', video_url) +            if m: +                a_format.update({ +                    'width': int(m.group('width')), +                    'height': int(m.group('height')), +                }) +          playlist = config.get('playlist')          if playlist:              video_url = playlist[0]['source'] @@ -110,12 +118,8 @@ class TwitterCardIE(TwitterBaseIE):                  'url': video_url,              } -            m = re.search(r'/(?P<width>\d+)x(?P<height>\d+)/', video_url) -            if m: -                f.update({ -                    'width': int(m.group('width')), -                    'height': int(m.group('height')), -                }) +            _search_dimensions_in_video_url(f, video_url) +              formats.append(f)          vmap_url = config.get('vmapUrl') or config.get('vmap_url') @@ -148,6 +152,8 @@ class TwitterCardIE(TwitterBaseIE):                      if not a_format['vbr']:                          del a_format['vbr'] +                    _search_dimensions_in_video_url(a_format, media_url) +                      formats.append(a_format)              duration = float_or_none(media_info.get('duration', {}).get('nanos'), scale=1e9) diff --git a/youtube_dl/extractor/usatoday.py b/youtube_dl/extractor/usatoday.py new file mode 100644 index 000000000..e5678dc78 --- /dev/null +++ 
b/youtube_dl/extractor/usatoday.py @@ -0,0 +1,48 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( +    get_element_by_attribute, +    parse_duration, +    update_url_query, +    ExtractorError, +) +from ..compat import compat_str + + +class USATodayIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?usatoday\.com/(?:[^/]+/)*(?P<id>[^?/#]+)' +    _TEST = { +        'url': 'http://www.usatoday.com/media/cinematic/video/81729424/us-france-warn-syrian-regime-ahead-of-new-peace-talks/', +        'md5': '4d40974481fa3475f8bccfd20c5361f8', +        'info_dict': { +            'id': '81729424', +            'ext': 'mp4', +            'title': 'US, France warn Syrian regime ahead of new peace talks', +            'timestamp': 1457891045, +            'description': 'md5:7e50464fdf2126b0f533748d3c78d58f', +            'uploader_id': '29906170001', +            'upload_date': '20160313', +        } +    } +    BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/29906170001/38a9eecc-bdd8-42a3-ba14-95397e48b3f8_default/index.html?videoId=%s' + +    def _real_extract(self, url): +        display_id = self._match_id(url) +        webpage = self._download_webpage(update_url_query(url, {'ajax': 'true'}), display_id) +        ui_video_data = get_element_by_attribute('class', 'ui-video-data', webpage) +        if not ui_video_data: +            raise ExtractorError('no video on the webpage', expected=True) +        video_data = self._parse_json(ui_video_data, display_id) + +        return { +            '_type': 'url_transparent', +            'url': self.BRIGHTCOVE_URL_TEMPLATE % video_data['brightcove_id'], +            'id': compat_str(video_data['id']), +            'title': video_data['title'], +            'thumbnail': video_data.get('thumbnail'), +            'description': video_data.get('description'), +            'duration': parse_duration(video_data.get('length')), +            'ie_key': 
'BrightcoveNew', +        } diff --git a/youtube_dl/extractor/vgtv.py b/youtube_dl/extractor/vgtv.py index 14e945d49..e148b1ef5 100644 --- a/youtube_dl/extractor/vgtv.py +++ b/youtube_dl/extractor/vgtv.py @@ -20,6 +20,7 @@ class VGTVIE(XstreamIE):          'aftenbladet.no/tv': 'satv',          'fvn.no/fvntv': 'fvntv',          'aftenposten.no/webtv': 'aptv', +        'ap.vgtv.no/webtv': 'aptv',      }      _APP_NAME_TO_VENDOR = { @@ -35,7 +36,7 @@ class VGTVIE(XstreamIE):                      (?P<host>                          %s                      ) -                    / +                    /?                      (?:                          \#!/(?:video|live)/|                          embed?.*id= @@ -107,19 +108,27 @@ class VGTVIE(XstreamIE):              'md5': 'fd828cd29774a729bf4d4425fe192972',              'info_dict': {                  'id': '21039', -                'ext': 'mov', +                'ext': 'mp4',                  'title': 'TRAILER: «SWEATSHOP» - I can´t take any more',                  'description': 'md5:21891f2b0dd7ec2f78d84a50e54f8238',                  'duration': 66,                  'timestamp': 1417002452,                  'upload_date': '20141126',                  'view_count': int, -            } +            }, +            'params': { +                # m3u8 download +                'skip_download': True, +            },          },          {              'url': 'http://www.bt.no/tv/#!/video/100250/norling-dette-er-forskjellen-paa-1-divisjon-og-eliteserien',              'only_matching': True,          }, +        { +            'url': 'http://ap.vgtv.no/webtv#!/video/111084/de-nye-bysyklene-lettere-bedre-gir-stoerre-hjul-og-feste-til-mobil', +            'only_matching': True, +        },      ]      def _real_extract(self, url): @@ -144,8 +153,6 @@ class VGTVIE(XstreamIE):          if len(video_id) == 5:              if appname == 'bttv':                  info = self._extract_video_info('btno', video_id) -            
elif appname == 'aptv': -                info = self._extract_video_info('ap', video_id)          streams = data['streamUrls']          stream_type = data.get('streamType') diff --git a/youtube_dl/extractor/vice.py b/youtube_dl/extractor/vice.py index 3db6286e4..46c785ae1 100644 --- a/youtube_dl/extractor/vice.py +++ b/youtube_dl/extractor/vice.py @@ -1,31 +1,37 @@  from __future__ import unicode_literals +import re +  from .common import InfoExtractor  from .ooyala import OoyalaIE  from ..utils import ExtractorError  class ViceIE(InfoExtractor): -    _VALID_URL = r'https?://(?:.+?\.)?vice\.com/(?:[^/]+/)+(?P<id>.+)' - -    _TESTS = [ -        { -            'url': 'http://www.vice.com/Fringes/cowboy-capitalists-part-1', -            'info_dict': { -                'id': '43cW1mYzpia9IlestBjVpd23Yu3afAfp', -                'ext': 'mp4', -                'title': 'VICE_COWBOYCAPITALISTS_PART01_v1_VICE_WM_1080p.mov', -                'duration': 725.983, -            }, -            'params': { -                # Requires ffmpeg (m3u8 manifest) -                'skip_download': True, -            }, -        }, { -            'url': 'https://news.vice.com/video/experimenting-on-animals-inside-the-monkey-lab', -            'only_matching': True, -        } -    ] +    _VALID_URL = r'https?://(?:.+?\.)?vice\.com/(?:[^/]+/)?videos?/(?P<id>[^/?#&]+)' + +    _TESTS = [{ +        'url': 'http://www.vice.com/video/cowboy-capitalists-part-1', +        'info_dict': { +            'id': '43cW1mYzpia9IlestBjVpd23Yu3afAfp', +            'ext': 'mp4', +            'title': 'VICE_COWBOYCAPITALISTS_PART01_v1_VICE_WM_1080p.mov', +            'duration': 725.983, +        }, +        'params': { +            # Requires ffmpeg (m3u8 manifest) +            'skip_download': True, +        }, +    }, { +        'url': 'https://news.vice.com/video/experimenting-on-animals-inside-the-monkey-lab', +        'only_matching': True, +    }, { +        'url': 
'http://www.vice.com/ru/video/big-night-out-ibiza-clive-martin-229', +        'only_matching': True, +    }, { +        'url': 'https://munchies.vice.com/en/videos/watch-the-trailer-for-our-new-series-the-pizza-show', +        'only_matching': True, +    }]      def _real_extract(self, url):          video_id = self._match_id(url) @@ -38,3 +44,35 @@ class ViceIE(InfoExtractor):          except ExtractorError:              raise ExtractorError('The page doesn\'t contain a video', expected=True)          return self.url_result(ooyala_url, ie='Ooyala') + + +class ViceShowIE(InfoExtractor): +    _VALID_URL = r'https?://(?:.+?\.)?vice\.com/(?:[^/]+/)?show/(?P<id>[^/?#&]+)' + +    _TEST = { +        'url': 'https://munchies.vice.com/en/show/fuck-thats-delicious-2', +        'info_dict': { +            'id': 'fuck-thats-delicious-2', +            'title': "Fuck, That's Delicious", +            'description': 'Follow the culinary adventures of rapper Action Bronson during his ongoing world tour.', +        }, +        'playlist_count': 17, +    } + +    def _real_extract(self, url): +        show_id = self._match_id(url) +        webpage = self._download_webpage(url, show_id) + +        entries = [ +            self.url_result(video_url, ViceIE.ie_key()) +            for video_url, _ in re.findall( +                r'<h2[^>]+class="article-title"[^>]+data-id="\d+"[^>]*>\s*<a[^>]+href="(%s.*?)"' +                % ViceIE._VALID_URL, webpage)] + +        title = self._search_regex( +            r'<title>(.+?)</title>', webpage, 'title', default=None) +        if title: +            title = re.sub(r'(.+)\s*\|\s*.+$', r'\1', title).strip() +        description = self._html_search_meta('description', webpage, 'description') + +        return self.playlist_result(entries, show_id, title, description) diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py index 433fc9914..e04b814c8 100644 --- a/youtube_dl/extractor/viki.py +++ b/youtube_dl/extractor/viki.py @@ 
-176,13 +176,13 @@ class VikiIE(VikiBaseIE):      }, {          # youtube external          'url': 'http://www.viki.com/videos/50562v-poor-nastya-complete-episode-1', -        'md5': '216d1afdc0c64d1febc1e9f2bd4b864b', +        'md5': '63f8600c1da6f01b7640eee7eca4f1da',          'info_dict': {              'id': '50562v', -            'ext': 'mp4', +            'ext': 'webm',              'title': 'Poor Nastya [COMPLETE] - Episode 1',              'description': '', -            'duration': 607, +            'duration': 606,              'timestamp': 1274949505,              'upload_date': '20101213',              'uploader': 'ad14065n', diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 9f282a1da..71c30d2cd 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -73,15 +73,26 @@ class VimeoIE(VimeoBaseInfoExtractor):      # _VALID_URL matches Vimeo URLs      _VALID_URL = r'''(?x) -        https?:// -        (?:(?:www|(?P<player>player))\.)? -        vimeo(?P<pro>pro)?\.com/ -        (?!channels/[^/?#]+/?(?:$|[?#])|album/) -        (?:.*?/)? -        (?:(?:play_redirect_hls|moogaloop\.swf)\?clip_id=)? -        (?:videos?/)? -        (?P<id>[0-9]+) -        /?(?:[?&].*)?(?:[#].*)?$''' +                    https?:// +                        (?: +                            (?: +                                www| +                                (?P<player>player) +                            ) +                            \. +                        )? +                        vimeo(?P<pro>pro)?\.com/ +                        (?!channels/[^/?#]+/?(?:$|[?#])|(?:album|ondemand)/) +                        (?:.*?/)? +                        (?: +                            (?: +                                play_redirect_hls| +                                moogaloop\.swf)\?clip_id= +                            )? +                        (?:videos?/)? 
+                        (?P<id>[0-9]+) +                        /?(?:[?&].*)?(?:[#].*)?$ +                    '''      IE_NAME = 'vimeo'      _TESTS = [          { @@ -277,9 +288,8 @@ class VimeoIE(VimeoBaseInfoExtractor):      def _real_extract(self, url):          url, data = unsmuggle_url(url, {}) -        headers = std_headers +        headers = std_headers.copy()          if 'http_headers' in data: -            headers = headers.copy()              headers.update(data['http_headers'])          if 'Referer' not in headers:              headers['Referer'] = url @@ -294,7 +304,7 @@ class VimeoIE(VimeoBaseInfoExtractor):              url = 'https://vimeo.com/' + video_id          # Retrieve video webpage to extract further information -        request = sanitized_Request(url, None, headers) +        request = sanitized_Request(url, headers=headers)          try:              webpage = self._download_webpage(request, video_id)          except ExtractorError as ee: @@ -498,6 +508,38 @@ class VimeoIE(VimeoBaseInfoExtractor):          } +class VimeoOndemandIE(VimeoBaseInfoExtractor): +    IE_NAME = 'vimeo:ondemand' +    _VALID_URL = r'https?://(?:www\.)?vimeo\.com/ondemand/(?P<id>[^/?#&]+)' +    _TESTS = [{ +        # ondemand video not available via https://vimeo.com/id +        'url': 'https://vimeo.com/ondemand/20704', +        'md5': 'c424deda8c7f73c1dfb3edd7630e2f35', +        'info_dict': { +            'id': '105442900', +            'ext': 'mp4', +            'title': 'המעבדה - במאי יותם פלדמן', +            'uploader': 'גם סרטים', +            'uploader_url': 're:https?://(?:www\.)?vimeo\.com/gumfilms', +            'uploader_id': 'gumfilms', +        }, +    }, { +        'url': 'https://vimeo.com/ondemand/nazmaalik', +        'only_matching': True, +    }, { +        'url': 'https://vimeo.com/ondemand/141692381', +        'only_matching': True, +    }, { +        'url': 'https://vimeo.com/ondemand/thelastcolony/150274832', +        'only_matching': True, + 
   }] + +    def _real_extract(self, url): +        video_id = self._match_id(url) +        webpage = self._download_webpage(url, video_id) +        return self.url_result(self._og_search_video_url(webpage), VimeoIE.ie_key()) + +  class VimeoChannelIE(VimeoBaseInfoExtractor):      IE_NAME = 'vimeo:channel'      _VALID_URL = r'https://vimeo\.com/channels/(?P<id>[^/?#]+)/?(?:$|[?#])' diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index 670a438af..d560a4b5e 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -142,10 +142,10 @@ class VKIE(InfoExtractor):              'url': 'https://vk.com/video276849682_170681728',              'info_dict': {                  'id': 'V3K4mi0SYkc', -                'ext': 'mp4', +                'ext': 'webm',                  'title': "DSWD Awards 'Children's Joy Foundation, Inc.' Certificate of Registration and License to Operate",                  'description': 'md5:bf9c26cfa4acdfb146362682edd3827a', -                'duration': 179, +                'duration': 178,                  'upload_date': '20130116',                  'uploader': "Children's Joy Foundation",                  'uploader_id': 'thecjf', diff --git a/youtube_dl/extractor/wimp.py b/youtube_dl/extractor/wimp.py index 041ff6c55..fb0accac7 100644 --- a/youtube_dl/extractor/wimp.py +++ b/youtube_dl/extractor/wimp.py @@ -20,7 +20,7 @@ class WimpIE(InfoExtractor):          'md5': '4e2986c793694b55b37cf92521d12bb4',          'info_dict': {              'id': 'clowncar', -            'ext': 'mp4', +            'ext': 'webm',              'title': 'It\'s like a clown car.',              'description': 'md5:0e56db1370a6e49c5c1d19124c0d2fb2',          }, diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py index d3cc1a29f..e699e663f 100644 --- a/youtube_dl/extractor/yandexmusic.py +++ b/youtube_dl/extractor/yandexmusic.py @@ -10,13 +10,27 @@ from ..compat import (      compat_urllib_parse,  )  
from ..utils import ( +    ExtractorError,      int_or_none,      float_or_none,      sanitized_Request,  ) -class YandexMusicTrackIE(InfoExtractor): +class YandexMusicBaseIE(InfoExtractor): +    @staticmethod +    def _handle_error(response): +        error = response.get('error') +        if error: +            raise ExtractorError(error, expected=True) + +    def _download_json(self, *args, **kwargs): +        response = super(YandexMusicBaseIE, self)._download_json(*args, **kwargs) +        self._handle_error(response) +        return response + + +class YandexMusicTrackIE(YandexMusicBaseIE):      IE_NAME = 'yandexmusic:track'      IE_DESC = 'Яндекс.Музыка - Трек'      _VALID_URL = r'https?://music\.yandex\.(?:ru|kz|ua|by)/album/(?P<album_id>\d+)/track/(?P<id>\d+)' @@ -73,7 +87,7 @@ class YandexMusicTrackIE(InfoExtractor):          return self._get_track_info(track) -class YandexMusicPlaylistBaseIE(InfoExtractor): +class YandexMusicPlaylistBaseIE(YandexMusicBaseIE):      def _build_playlist(self, tracks):          return [              self.url_result( diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py index b29baafc4..1124fe6c2 100644 --- a/youtube_dl/extractor/youporn.py +++ b/youtube_dl/extractor/youporn.py @@ -75,7 +75,7 @@ class YouPornIE(InfoExtractor):          links = []          sources = self._search_regex( -            r'sources\s*:\s*({.+?})', webpage, 'sources', default=None) +            r'(?s)sources\s*:\s*({.+?})', webpage, 'sources', default=None)          if sources:              for _, link in re.findall(r'[^:]+\s*:\s*(["\'])(http.+?)\1', sources):                  links.append(link) @@ -101,8 +101,9 @@ class YouPornIE(InfoExtractor):              }              # Video URL's path looks like this:              #  /201012/17/505835/720p_1500k_505835/YouPorn%20-%20Sex%20Ed%20Is%20It%20Safe%20To%20Masturbate%20Daily.mp4 +            #  
/201012/17/505835/vl_240p_240k_505835/YouPorn%20-%20Sex%20Ed%20Is%20It%20Safe%20To%20Masturbate%20Daily.mp4              # We will benefit from it by extracting some metadata -            mobj = re.search(r'/(?P<height>\d{3,4})[pP]_(?P<bitrate>\d+)[kK]_\d+/', video_url) +            mobj = re.search(r'(?P<height>\d{3,4})[pP]_(?P<bitrate>\d+)[kK]_\d+/', video_url)              if mobj:                  height = int(mobj.group('height'))                  bitrate = int(mobj.group('bitrate')) diff --git a/youtube_dl/postprocessor/__init__.py b/youtube_dl/postprocessor/__init__.py index 0d8ef6ca2..3ea518399 100644 --- a/youtube_dl/postprocessor/__init__.py +++ b/youtube_dl/postprocessor/__init__.py @@ -6,6 +6,7 @@ from .ffmpeg import (      FFmpegEmbedSubtitlePP,      FFmpegExtractAudioPP,      FFmpegFixupStretchedPP, +    FFmpegFixupM3u8PP,      FFmpegFixupM4aPP,      FFmpegMergerPP,      FFmpegMetadataPP, @@ -26,6 +27,7 @@ __all__ = [      'ExecAfterDownloadPP',      'FFmpegEmbedSubtitlePP',      'FFmpegExtractAudioPP', +    'FFmpegFixupM3u8PP',      'FFmpegFixupM4aPP',      'FFmpegFixupStretchedPP',      'FFmpegMergerPP', diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index 380bc6f29..a8819f258 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -25,6 +25,19 @@ from ..utils import (  ) +EXT_TO_OUT_FORMATS = { +    "aac": "adts", +    "m4a": "ipod", +    "mka": "matroska", +    "mkv": "matroska", +    "mpg": "mpeg", +    "ogv": "ogg", +    "ts": "mpegts", +    "wma": "asf", +    "wmv": "asf", +} + +  class FFmpegPostProcessorError(PostProcessingError):      pass @@ -391,10 +404,6 @@ class FFmpegMetadataPP(FFmpegPostProcessor):          for (name, value) in metadata.items():              options.extend(['-metadata', '%s=%s' % (name, value)]) -        # https://github.com/rg3/youtube-dl/issues/8350 -        if info.get('protocol') == 'm3u8_native' or info.get('protocol') == 'm3u8' and 
self._downloader.params.get('hls_prefer_native', False): -            options.extend(['-bsf:a', 'aac_adtstoasc']) -          self._downloader.to_screen('[ffmpeg] Adding metadata to \'%s\'' % filename)          self.run_ffmpeg(filename, temp_filename, options)          os.remove(encodeFilename(filename)) @@ -467,6 +476,21 @@ class FFmpegFixupM4aPP(FFmpegPostProcessor):          return [], info +class FFmpegFixupM3u8PP(FFmpegPostProcessor): +    def run(self, info): +        filename = info['filepath'] +        temp_filename = prepend_extension(filename, 'temp') + +        options = ['-c', 'copy', '-f', 'mp4', '-bsf:a', 'aac_adtstoasc'] +        self._downloader.to_screen('[ffmpeg] Fixing malformated aac bitstream in "%s"' % filename) +        self.run_ffmpeg(filename, temp_filename, options) + +        os.remove(encodeFilename(filename)) +        os.rename(encodeFilename(temp_filename), encodeFilename(filename)) + +        return [], info + +  class FFmpegSubtitlesConvertorPP(FFmpegPostProcessor):      def __init__(self, downloader=None, format=None):          super(FFmpegSubtitlesConvertorPP, self).__init__(downloader) diff --git a/youtube_dl/postprocessor/xattrpp.py b/youtube_dl/postprocessor/xattrpp.py index 480d48d05..e39ca60aa 100644 --- a/youtube_dl/postprocessor/xattrpp.py +++ b/youtube_dl/postprocessor/xattrpp.py @@ -6,6 +6,7 @@ import sys  import errno  from .common import PostProcessor +from ..compat import compat_os_name  from ..utils import (      check_executable,      hyphenate_date, @@ -73,7 +74,7 @@ class XAttrMetadataPP(PostProcessor):                      raise XAttrMetadataError(e.errno, e.strerror)          except ImportError: -            if os.name == 'nt': +            if compat_os_name == 'nt':                  # Write xattrs to NTFS Alternate Data Streams:                  # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29                  def write_xattr(path, key, value): @@ -168,7 +169,7 @@ class 
XAttrMetadataPP(PostProcessor):                      'Unable to write extended attributes due to too long values.')              else:                  msg = 'This filesystem doesn\'t support extended attributes. ' -                if os.name == 'nt': +                if compat_os_name == 'nt':                      msg += 'You need to use NTFS.'                  else:                      msg += '(You may have to enable them in your /etc/fstab)' diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index a0234a3a8..ec186918c 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -495,6 +495,10 @@ def encodeFilename(s, for_subprocess=False):      if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:          return s +    # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible +    if sys.platform.startswith('java'): +        return s +      return s.encode(get_subprocess_encoding(), 'ignore') @@ -1245,13 +1249,23 @@ if sys.platform == 'win32':              raise OSError('Unlocking file failed: %r' % ctypes.FormatError())  else: -    import fcntl +    # Some platforms, such as Jython, is missing fcntl +    try: +        import fcntl -    def _lock_file(f, exclusive): -        fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH) +        def _lock_file(f, exclusive): +            fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH) -    def _unlock_file(f): -        fcntl.flock(f, fcntl.LOCK_UN) +        def _unlock_file(f): +            fcntl.flock(f, fcntl.LOCK_UN) +    except ImportError: +        UNSUPPORTED_MSG = 'file locking is not supported on this platform' + +        def _lock_file(f, exclusive): +            raise IOError(UNSUPPORTED_MSG) + +        def _unlock_file(f): +            raise IOError(UNSUPPORTED_MSG)  class locked_file(object): @@ -1332,6 +1346,17 @@ def format_bytes(bytes):      return '%.2f%s' % (converted, suffix) +def lookup_unit_table(unit_table, 
s): +    units_re = '|'.join(re.escape(u) for u in unit_table) +    m = re.match( +        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s) +    if not m: +        return None +    num_str = m.group('num').replace(',', '.') +    mult = unit_table[m.group('unit')] +    return int(float(num_str) * mult) + +  def parse_filesize(s):      if s is None:          return None @@ -1375,15 +1400,28 @@ def parse_filesize(s):          'Yb': 1000 ** 8,      } -    units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE) -    m = re.match( -        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s) -    if not m: +    return lookup_unit_table(_UNIT_TABLE, s) + + +def parse_count(s): +    if s is None:          return None -    num_str = m.group('num').replace(',', '.') -    mult = _UNIT_TABLE[m.group('unit')] -    return int(float(num_str) * mult) +    s = s.strip() + +    if re.match(r'^[\d,.]+$', s): +        return str_to_int(s) + +    _UNIT_TABLE = { +        'k': 1000, +        'K': 1000, +        'm': 1000 ** 2, +        'M': 1000 ** 2, +        'kk': 1000 ** 2, +        'KK': 1000 ** 2, +    } + +    return lookup_unit_table(_UNIT_TABLE, s)  def month_by_name(name): @@ -1415,6 +1453,12 @@ def fix_xml_ampersands(xml_str):  def setproctitle(title):      assert isinstance(title, compat_str) + +    # ctypes in Jython is not complete +    # http://bugs.jython.org/issue2148 +    if sys.platform.startswith('java'): +        return +      try:          libc = ctypes.cdll.LoadLibrary('libc.so.6')      except OSError: @@ -1749,6 +1793,15 @@ def urlencode_postdata(*args, **kargs):      return compat_urllib_parse.urlencode(*args, **kargs).encode('ascii') +def update_url_query(url, query): +    parsed_url = compat_urlparse.urlparse(url) +    qs = compat_parse_qs(parsed_url.query) +    qs.update(query) +    qs = encode_dict(qs) +    return compat_urlparse.urlunparse(parsed_url._replace( +        query=compat_urllib_parse.urlencode(qs, True))) + +  def 
encode_dict(d, encoding='utf-8'):      def encode(v):          return v.encode(encoding) if isinstance(v, compat_basestring) else v diff --git a/youtube_dl/version.py b/youtube_dl/version.py index adafd601b..9216fa547 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@  from __future__ import unicode_literals -__version__ = '2016.03.01' +__version__ = '2016.03.14' | 
