diff options
40 files changed, 916 insertions, 320 deletions
diff --git a/.gitignore b/.gitignore index 0422adf44..26dbde73d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@  *.pyc  *.pyo +*.class  *~  *.DS_Store  wine-py2exe/ @@ -32,4 +33,4 @@ test/testdata  .tox  youtube-dl.zsh  .idea -.idea/*
\ No newline at end of file +.idea/* @@ -3,6 +3,7 @@ all: youtube-dl README.md CONTRIBUTING.md README.txt youtube-dl.1 youtube-dl.bas  clean:  	rm -rf youtube-dl.1.temp.md youtube-dl.1 youtube-dl.bash-completion README.txt MANIFEST build/ dist/ .coverage cover/ youtube-dl.tar.gz youtube-dl.zsh youtube-dl.fish *.dump *.part *.info.json *.mp4 *.flv *.mp3 *.avi CONTRIBUTING.md.tmp youtube-dl youtube-dl.exe  	find . -name "*.pyc" -delete +	find . -name "*.class" -delete  PREFIX ?= /usr/local  BINDIR ?= $(PREFIX)/bin @@ -44,7 +45,7 @@ test:  ot: offlinetest  offlinetest: codetest -	nosetests --verbose test --exclude test_download.py --exclude test_age_restriction.py --exclude test_subtitles.py --exclude test_write_annotations.py --exclude test_youtube_lists.py --exclude test_iqiyi_sdk_interpreter.py +	$(PYTHON) -m nose --verbose test --exclude test_download.py --exclude test_age_restriction.py --exclude test_subtitles.py --exclude test_write_annotations.py --exclude test_youtube_lists.py --exclude test_iqiyi_sdk_interpreter.py  tar: youtube-dl.tar.gz @@ -458,6 +458,7 @@ The basic usage is not to set any template arguments when downloading a single f   - `alt_title`: A secondary title of the video   - `display_id`: An alternative identifier for the video   - `uploader`: Full name of the video uploader + - `license`: License name the video is licensed under   - `creator`: The main artist who created the video   - `release_date`: The date (YYYYMMDD) when the video was released   - `timestamp`: UNIX timestamp of the moment the video became available diff --git a/test/helper.py b/test/helper.py index bdd7acca4..f2d878212 100644 --- a/test/helper.py +++ b/test/helper.py @@ -11,8 +11,11 @@ import sys  import youtube_dl.extractor  from youtube_dl import YoutubeDL -from youtube_dl.utils import ( +from youtube_dl.compat import ( +    compat_os_name,      compat_str, +) +from youtube_dl.utils import (      preferredencoding,      write_string,  ) @@ -42,7 +45,7 @@ def report_warning(message):      Print the message to stderr, it will be prefixed with 'WARNING:'      If stderr is a tty file the 'WARNING:' will be colored      ''' -    if sys.stderr.isatty() and os.name != 'nt': +    if sys.stderr.isatty() and compat_os_name != 'nt':          _msg_header = '\033[0;33mWARNING:\033[0m'      else:          _msg_header = 'WARNING:' diff --git a/test/test_http.py b/test/test_http.py index f2e305b6f..fc59b1aed 100644 --- a/test/test_http.py +++ b/test/test_http.py @@ -52,7 +52,12 @@ class TestHTTP(unittest.TestCase):              ('localhost', 0), HTTPTestRequestHandler)          self.httpd.socket = ssl.wrap_socket(              self.httpd.socket, certfile=certfn, server_side=True) -        self.port = self.httpd.socket.getsockname()[1] +        if os.name == 'java': +            # In Jython SSLSocket is not a subclass of socket.socket +            sock = self.httpd.socket.sock +        else: +            sock = self.httpd.socket +        self.port = sock.getsockname()[1]          self.server_thread = threading.Thread(target=self.httpd.serve_forever)          self.server_thread.daemon = True          self.server_thread.start() diff --git a/test/test_utils.py b/test/test_utils.py index 97587ad2f..2bcf8ecf0 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -61,6 +61,7 @@ from youtube_dl.utils import (      lowercase_escape,      url_basename,      urlencode_postdata, +    update_url_query,      version_tuple,      xpath_with_ns,      xpath_element, @@ -76,6 +77,8 @@ from youtube_dl.utils import (  )  from youtube_dl.compat import (      compat_etree_fromstring, +    compat_urlparse, +    compat_parse_qs,  ) @@ -454,6 +457,40 @@ class TestUtil(unittest.TestCase):          data = urlencode_postdata({'username': 'foo@bar.com', 'password': '1234'})          self.assertTrue(isinstance(data, bytes)) +    def test_update_url_query(self): +        def query_dict(url): +            return compat_parse_qs(compat_urlparse.urlparse(url).query) +        self.assertEqual(query_dict(update_url_query( +            'http://example.com/path', {'quality': ['HD'], 'format': ['mp4']})), +            query_dict('http://example.com/path?quality=HD&format=mp4')) +        self.assertEqual(query_dict(update_url_query( +            'http://example.com/path', {'system': ['LINUX', 'WINDOWS']})), +            query_dict('http://example.com/path?system=LINUX&system=WINDOWS')) +        self.assertEqual(query_dict(update_url_query( +            'http://example.com/path', {'fields': 'id,formats,subtitles'})), +            query_dict('http://example.com/path?fields=id,formats,subtitles')) +        self.assertEqual(query_dict(update_url_query( +            'http://example.com/path', {'fields': ('id,formats,subtitles', 'thumbnails')})), +            query_dict('http://example.com/path?fields=id,formats,subtitles&fields=thumbnails')) +        self.assertEqual(query_dict(update_url_query( +            'http://example.com/path?manifest=f4m', {'manifest': []})), +            query_dict('http://example.com/path')) +        self.assertEqual(query_dict(update_url_query( +            'http://example.com/path?system=LINUX&system=WINDOWS', {'system': 'LINUX'})), +            query_dict('http://example.com/path?system=LINUX')) +        self.assertEqual(query_dict(update_url_query( +            'http://example.com/path', {'fields': b'id,formats,subtitles'})), +            query_dict('http://example.com/path?fields=id,formats,subtitles')) +        self.assertEqual(query_dict(update_url_query( +            'http://example.com/path', {'width': 1080, 'height': 720})), +            query_dict('http://example.com/path?width=1080&height=720')) +        self.assertEqual(query_dict(update_url_query( +            'http://example.com/path', {'bitrate': 5020.43})), +            query_dict('http://example.com/path?bitrate=5020.43')) +        self.assertEqual(query_dict(update_url_query( +            'http://example.com/path', {'test': '第二行тест'})), +            query_dict('http://example.com/path?test=%E7%AC%AC%E4%BA%8C%E8%A1%8C%D1%82%D0%B5%D1%81%D1%82')) +      def test_dict_get(self):          FALSE_VALUES = {              'none': None, diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 97cf31eb2..94e4ea432 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -24,9 +24,6 @@ import time  import tokenize  import traceback -if os.name == 'nt': -    import ctypes -  from .compat import (      compat_basestring,      compat_cookiejar, @@ -34,6 +31,7 @@ from .compat import (      compat_get_terminal_size,      compat_http_client,      compat_kwargs, +    compat_os_name,      compat_str,      compat_tokenize_tokenize,      compat_urllib_error, @@ -96,6 +94,9 @@ from .postprocessor import (  )  from .version import __version__ +if compat_os_name == 'nt': +    import ctypes +  class YoutubeDL(object):      """YoutubeDL class. @@ -451,7 +452,7 @@ class YoutubeDL(object):      def to_console_title(self, message):          if not self.params.get('consoletitle', False):              return -        if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow(): +        if compat_os_name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():              # c_wchar_p() might not be necessary if `message` is              # already of type unicode()              ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message)) @@ -522,7 +523,7 @@ class YoutubeDL(object):          else:              if self.params.get('no_warnings'):                  return -            if not self.params.get('no_color') and self._err_file.isatty() and os.name != 'nt': +            if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt':                  _msg_header = '\033[0;33mWARNING:\033[0m'              else:                  _msg_header = 'WARNING:' @@ -534,7 +535,7 @@ class YoutubeDL(object):          Do the same as trouble, but prefixes the message with 'ERROR:', colored          in red if stderr is a tty file.          ''' -        if not self.params.get('no_color') and self._err_file.isatty() and os.name != 'nt': +        if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt':              _msg_header = '\033[0;31mERROR:\033[0m'          else:              _msg_header = 'ERROR:' @@ -567,7 +568,7 @@ class YoutubeDL(object):                  elif template_dict.get('height'):                      template_dict['resolution'] = '%sp' % template_dict['height']                  elif template_dict.get('width'): -                    template_dict['resolution'] = '?x%d' % template_dict['width'] +                    template_dict['resolution'] = '%dx?' % template_dict['width']              sanitize = lambda k, v: sanitize_filename(                  compat_str(v), @@ -1632,7 +1633,7 @@ class YoutubeDL(object):                  self.report_error('content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))                  return -            if success: +            if success and filename != '-':                  # Fixup content                  fixup_policy = self.params.get('fixup')                  if fixup_policy is None: diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index f5f064241..79b389840 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -355,6 +355,7 @@ def _real_main(argv=None):          'youtube_include_dash_manifest': opts.youtube_include_dash_manifest,          'encoding': opts.encoding,          'extract_flat': opts.extract_flat, +        'mark_watched': opts.mark_watched,          'merge_output_format': opts.merge_output_format,          'postprocessors': postprocessors,          'fixup': opts.fixup, diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index b497da696..2771fb5fa 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -326,6 +326,9 @@ def compat_ord(c):          return ord(c) +compat_os_name = os._name if os.name == 'java' else os.name + +  if sys.version_info >= (3, 0):      compat_getenv = os.getenv      compat_expanduser = os.path.expanduser @@ -346,7 +349,7 @@ else:      # The following are os.path.expanduser implementations from cpython 2.7.8 stdlib      # for different platforms with correct environment variables decoding. -    if os.name == 'posix': +    if compat_os_name == 'posix':          def compat_expanduser(path):              """Expand ~ and ~user constructions.  If user or $HOME is unknown,              do nothing.""" @@ -370,7 +373,7 @@ else:                  userhome = pwent.pw_dir              userhome = userhome.rstrip('/')              return (userhome + path[i:]) or '/' -    elif os.name == 'nt' or os.name == 'ce': +    elif compat_os_name == 'nt' or compat_os_name == 'ce':          def compat_expanduser(path):              """Expand ~ and ~user constructs. @@ -556,6 +559,7 @@ __all__ = [      'compat_itertools_count',      'compat_kwargs',      'compat_ord', +    'compat_os_name',      'compat_parse_qs',      'compat_print',      'compat_shlex_split', diff --git a/youtube_dl/downloader/common.py b/youtube_dl/downloader/common.py index 2d5154051..f39db58f6 100644 --- a/youtube_dl/downloader/common.py +++ b/youtube_dl/downloader/common.py @@ -5,6 +5,7 @@ import re  import sys  import time +from ..compat import compat_os_name  from ..utils import (      encodeFilename,      error_to_compat_str, @@ -219,7 +220,7 @@ class FileDownloader(object):          if self.params.get('progress_with_newline', False):              self.to_screen(fullmsg)          else: -            if os.name == 'nt': +            if compat_os_name == 'nt':                  prev_len = getattr(self, '_report_progress_prev_line_length',                                     0)                  if prev_len > len(fullmsg): diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 5817140c0..899bf8114 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -23,7 +23,10 @@ from .alphaporno import AlphaPornoIE  from .animeondemand import AnimeOnDemandIE  from .anitube import AnitubeIE  from .anysex import AnySexIE -from .aol import AolIE +from .aol import ( +    AolIE, +    AolFeaturesIE, +)  from .allocine import AllocineIE  from .aparat import AparatIE  from .appleconnect import AppleConnectIE @@ -340,6 +343,7 @@ from .konserthusetplay import KonserthusetPlayIE  from .kontrtube import KontrTubeIE  from .krasview import KrasViewIE  from .ku6 import Ku6IE +from .kusi import KUSIIE  from .kuwo import (      KuwoIE,      KuwoAlbumIE, @@ -352,10 +356,9 @@ from .la7 import LA7IE  from .laola1tv import Laola1TvIE  from .lecture2go import Lecture2GoIE  from .lemonde import LemondeIE -from .letv import ( -    LetvIE, -    LetvTvIE, -    LetvPlaylistIE, +from .leeco import ( +    LeIE, +    LePlaylistIE,      LetvCloudIE,  )  from .libsyn import LibsynIE @@ -506,6 +509,7 @@ from .npr import NprIE  from .nrk import (      NRKIE,      NRKPlaylistIE, +    NRKSkoleIE,      NRKTVIE,  )  from .ntvde import NTVDeIE diff --git a/youtube_dl/extractor/aol.py b/youtube_dl/extractor/aol.py index b51eafc45..b761b2cc4 100644 --- a/youtube_dl/extractor/aol.py +++ b/youtube_dl/extractor/aol.py @@ -1,24 +1,11 @@  from __future__ import unicode_literals -import re -  from .common import InfoExtractor  class AolIE(InfoExtractor):      IE_NAME = 'on.aol.com' -    _VALID_URL = r'''(?x) -        (?: -            aol-video:| -            http://on\.aol\.com/ -            (?: -                video/.*-| -                playlist/(?P<playlist_display_id>[^/?#]+?)-(?P<playlist_id>[0-9]+)[?#].*_videoid= -            ) -        ) -        (?P<id>[0-9]+) -        (?:$|\?) -    ''' +    _VALID_URL = r'(?:aol-video:|http://on\.aol\.com/video/.*-)(?P<id>[0-9]+)(?:$|\?)'      _TESTS = [{          'url': 'http://on.aol.com/video/u-s--official-warns-of-largest-ever-irs-phone-scam-518167793?icid=OnHomepageC2Wide_MustSee_Img', @@ -29,42 +16,31 @@ class AolIE(InfoExtractor):              'title': 'U.S. Official Warns Of \'Largest Ever\' IRS Phone Scam',          },          'add_ie': ['FiveMin'], -    }, { -        'url': 'http://on.aol.com/playlist/brace-yourself---todays-weirdest-news-152147?icid=OnHomepageC4_Omg_Img#_videoid=518184316', -        'info_dict': { -            'id': '152147', -            'title': 'Brace Yourself - Today\'s Weirdest News', -        }, -        'playlist_mincount': 10,      }]      def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        video_id = mobj.group('id') -        playlist_id = mobj.group('playlist_id') -        if not playlist_id or self._downloader.params.get('noplaylist'): -            return self.url_result('5min:%s' % video_id) +        video_id = self._match_id(url) +        return self.url_result('5min:%s' % video_id) -        self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id)) -        webpage = self._download_webpage(url, playlist_id) -        title = self._html_search_regex( -            r'<h1 class="video-title[^"]*">(.+?)</h1>', webpage, 'title') -        playlist_html = self._search_regex( -            r"(?s)<ul\s+class='video-related[^']*'>(.*?)</ul>", webpage, -            'playlist HTML') -        entries = [{ -            '_type': 'url', -            'url': 'aol-video:%s' % m.group('id'), -            'ie_key': 'Aol', -        } for m in re.finditer( -            r"<a\s+href='.*videoid=(?P<id>[0-9]+)'\s+class='video-thumb'>", -            playlist_html)] +class AolFeaturesIE(InfoExtractor): +    IE_NAME = 'features.aol.com' +    _VALID_URL = r'http://features\.aol\.com/video/(?P<id>[^/?#]+)' -        return { -            '_type': 'playlist', -            'id': playlist_id, -            'display_id': mobj.group('playlist_display_id'), -            'title': title, -            'entries': entries, -        } +    _TESTS = [{ +        'url': 'http://features.aol.com/video/behind-secret-second-careers-late-night-talk-show-hosts', +        'md5': '7db483bb0c09c85e241f84a34238cc75', +        'info_dict': { +            'id': '519507715', +            'ext': 'mp4', +            'title': 'What To Watch - February 17, 2016', +        }, +        'add_ie': ['FiveMin'], +    }] + +    def _real_extract(self, url): +        display_id = self._match_id(url) +        webpage = self._download_webpage(url, display_id) +        return self.url_result(self._search_regex( +            r'<script type="text/javascript" src="(https?://[^/]*?5min\.com/Scripts/PlayerSeed\.js[^"]+)"', +            webpage, '5min embed url'), 'FiveMin') diff --git a/youtube_dl/extractor/audimedia.py b/youtube_dl/extractor/audimedia.py index 3b2effa15..aa6925623 100644 --- a/youtube_dl/extractor/audimedia.py +++ b/youtube_dl/extractor/audimedia.py @@ -10,9 +10,9 @@ from ..utils import (  class AudiMediaIE(InfoExtractor): -    _VALID_URL = r'https?://(?:www\.)?audimedia\.tv/(?:en|de)/vid/(?P<id>[^/?#]+)' +    _VALID_URL = r'https?://(?:www\.)?audi-mediacenter\.com/(?:en|de)/audimediatv/(?P<id>[^/?#]+)'      _TEST = { -        'url': 'https://audimedia.tv/en/vid/60-seconds-of-audi-sport-104-2015-wec-bahrain-rookie-test', +        'url': 'https://www.audi-mediacenter.com/en/audimediatv/60-seconds-of-audi-sport-104-2015-wec-bahrain-rookie-test-1467',          'md5': '79a8b71c46d49042609795ab59779b66',          'info_dict': {              'id': '1565', @@ -32,7 +32,10 @@ class AudiMediaIE(InfoExtractor):          display_id = self._match_id(url)          webpage = self._download_webpage(url, display_id) -        raw_payload = self._search_regex(r'<script[^>]+class="amtv-embed"[^>]+id="([^"]+)"', webpage, 'raw payload') +        raw_payload = self._search_regex([ +            r'class="amtv-embed"[^>]+id="([^"]+)"', +            r'class=\\"amtv-embed\\"[^>]+id=\\"([^"]+)\\"', +        ], webpage, 'raw payload')          _, stage_mode, video_id, lang = raw_payload.split('-')          # TODO: handle s and e stage_mode (live streams and ended live streams) @@ -59,13 +62,19 @@ class AudiMediaIE(InfoExtractor):                  video_version_url = video_version.get('download_url') or video_version.get('stream_url')                  if not video_version_url:                      continue -                formats.append({ +                f = {                      'url': video_version_url,                      'width': int_or_none(video_version.get('width')),                      'height': int_or_none(video_version.get('height')),                      'abr': int_or_none(video_version.get('audio_bitrate')),                      'vbr': int_or_none(video_version.get('video_bitrate')), -                }) +                } +                bitrate = self._search_regex(r'(\d+)k', video_version_url, 'bitrate', default=None) +                if bitrate: +                    f.update({ +                        'format_id': 'http-%s' % bitrate, +                    }) +                formats.append(f)              self._sort_formats(formats)              return { diff --git a/youtube_dl/extractor/bleacherreport.py b/youtube_dl/extractor/bleacherreport.py index 38bda3af5..7a8e1f60b 100644 --- a/youtube_dl/extractor/bleacherreport.py +++ b/youtube_dl/extractor/bleacherreport.py @@ -28,10 +28,10 @@ class BleacherReportIE(InfoExtractor):          'add_ie': ['Ooyala'],      }, {          'url': 'http://bleacherreport.com/articles/2586817-aussie-golfers-get-fright-of-their-lives-after-being-chased-by-angry-kangaroo', -        'md5': 'af5f90dc9c7ba1c19d0a3eac806bbf50', +        'md5': '6a5cd403418c7b01719248ca97fb0692',          'info_dict': {              'id': '2586817', -            'ext': 'mp4', +            'ext': 'webm',              'title': 'Aussie Golfers Get Fright of Their Lives After Being Chased by Angry Kangaroo',              'timestamp': 1446839961,              'uploader': 'Sean Fay', @@ -93,10 +93,14 @@ class BleacherReportCMSIE(AMPIE):          'md5': '8c2c12e3af7805152675446c905d159b',          'info_dict': {              'id': '8fd44c2f-3dc5-4821-9118-2c825a98c0e1', -            'ext': 'flv', +            'ext': 'mp4',              'title': 'Cena vs. Rollins Would Expose the Heavyweight Division',              'description': 'md5:984afb4ade2f9c0db35f3267ed88b36e',          }, +        'params': { +            # m3u8 download +            'skip_download': True, +        },      }]      def _real_extract(self, url): diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 3f16b1b9e..07bd2cbe2 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -15,13 +15,14 @@ import math  from ..compat import (      compat_cookiejar,      compat_cookies, +    compat_etree_fromstring,      compat_getpass,      compat_http_client, +    compat_os_name, +    compat_str,      compat_urllib_error,      compat_urllib_parse,      compat_urlparse, -    compat_str, -    compat_etree_fromstring,  )  from ..utils import (      NO_DEFAULT, @@ -157,12 +158,14 @@ class InfoExtractor(object):      thumbnail:      Full URL to a video thumbnail image.      description:    Full video description.      uploader:       Full name of the video uploader. +    license:        License name the video is licensed under.      creator:        The main artist who created the video.      release_date:   The date (YYYYMMDD) when the video was released.      timestamp:      UNIX timestamp of the moment the video became available.      upload_date:    Video upload date (YYYYMMDD).                      If not explicitly set, calculated from timestamp.      uploader_id:    Nickname or id of the video uploader. +    uploader_url:   Full URL to a personal webpage of the video uploader.      location:       Physical location where the video was filmed.      subtitles:      The available subtitles as a dictionary in the format                      {language: subformats}. "subformats" is a list sorted from @@ -425,7 +428,7 @@ class InfoExtractor(object):              self.to_screen('Saving request to ' + filename)              # Working around MAX_PATH limitation on Windows (see              # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx) -            if os.name == 'nt': +            if compat_os_name == 'nt':                  absfilepath = os.path.abspath(filename)                  if len(absfilepath) > 259:                      filename = '\\\\?\\' + absfilepath @@ -594,7 +597,7 @@ class InfoExtractor(object):                  if mobj:                      break -        if not self._downloader.params.get('no_color') and os.name != 'nt' and sys.stderr.isatty(): +        if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():              _name = '\033[0;34m%s\033[0m' % name          else:              _name = name @@ -1620,6 +1623,15 @@ class InfoExtractor(object):      def _get_automatic_captions(self, *args, **kwargs):          raise NotImplementedError('This method must be implemented by subclasses') +    def mark_watched(self, *args, **kwargs): +        if (self._downloader.params.get('mark_watched', False) and +                (self._get_login_info()[0] is not None or +                    self._downloader.params.get('cookiefile') is not None)): +            self._mark_watched(*args, **kwargs) + +    def _mark_watched(self, *args, **kwargs): +        raise NotImplementedError('This method must be implemented by subclasses') +  class SearchInfoExtractor(InfoExtractor):      """ diff --git a/youtube_dl/extractor/douyutv.py b/youtube_dl/extractor/douyutv.py index 373b3b4b4..bdc768c78 100644 --- a/youtube_dl/extractor/douyutv.py +++ b/youtube_dl/extractor/douyutv.py @@ -18,7 +18,7 @@ class DouyuTVIE(InfoExtractor):              'display_id': 'iseven',              'ext': 'flv',              'title': 're:^清晨醒脑!T-ara根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', -            'description': 'md5:c93d6692dde6fe33809a46edcbecca44', +            'description': 'md5:f34981259a03e980a3c6404190a3ed61',              'thumbnail': 're:^https?://.*\.jpg$',              'uploader': '7师傅',              'uploader_id': '431925', @@ -26,7 +26,7 @@ class DouyuTVIE(InfoExtractor):          },          'params': {              'skip_download': True, -        } +        },      }, {          'url': 'http://www.douyutv.com/85982',          'info_dict': { @@ -42,7 +42,24 @@ class DouyuTVIE(InfoExtractor):          },          'params': {              'skip_download': True, -        } +        }, +        'skip': 'Romm not found', +    }, { +        'url': 'http://www.douyutv.com/17732', +        'info_dict': { +            'id': '17732', +            'display_id': '17732', +            'ext': 'flv', +            'title': 're:^清晨醒脑!T-ara根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', +            'description': 'md5:f34981259a03e980a3c6404190a3ed61', +            'thumbnail': 're:^https?://.*\.jpg$', +            'uploader': '7师傅', +            'uploader_id': '431925', +            'is_live': True, +        }, +        'params': { +            'skip_download': True, +        },      }]      def _real_extract(self, url): diff --git a/youtube_dl/extractor/elpais.py b/youtube_dl/extractor/elpais.py index 00a69e631..8c725a4e6 100644 --- a/youtube_dl/extractor/elpais.py +++ b/youtube_dl/extractor/elpais.py @@ -9,7 +9,7 @@ class ElPaisIE(InfoExtractor):      _VALID_URL = r'https?://(?:[^.]+\.)?elpais\.com/.*/(?P<id>[^/#?]+)\.html(?:$|[?#])'      IE_DESC = 'El País' -    _TEST = { +    _TESTS = [{          'url': 'http://blogs.elpais.com/la-voz-de-inaki/2014/02/tiempo-nuevo-recetas-viejas.html',          'md5': '98406f301f19562170ec071b83433d55',          'info_dict': { @@ -19,30 +19,41 @@ class ElPaisIE(InfoExtractor):              'description': 'De lunes a viernes, a partir de las ocho de la mañana, Iñaki Gabilondo nos cuenta su visión de la actualidad nacional e internacional.',              'upload_date': '20140206',          } -    } +    }, { +        'url': 'http://elcomidista.elpais.com/elcomidista/2016/02/24/articulo/1456340311_668921.html#?id_externo_nwl=newsletter_diaria20160303t', +        'md5': '3bd5b09509f3519d7d9e763179b013de', +        'info_dict': { +            'id': '1456340311_668921', +            'ext': 'mp4', +            'title': 'Cómo hacer el mejor café con cafetera italiana', +            'description': 'Que sí, que las cápsulas son cómodas. Pero si le pides algo más a la vida, quizá deberías aprender a usar bien la cafetera italiana. No tienes más que ver este vídeo y seguir sus siete normas básicas.', +            'upload_date': '20160303', +        } +    }]      def _real_extract(self, url):          video_id = self._match_id(url)          webpage = self._download_webpage(url, video_id)          prefix = self._html_search_regex( -            r'var url_cache = "([^"]+)";', webpage, 'URL prefix') +            r'var\s+url_cache\s*=\s*"([^"]+)";', webpage, 'URL prefix')          video_suffix = self._search_regex( -            r"URLMediaFile = url_cache \+ '([^']+)'", webpage, 'video URL') +            r"(?:URLMediaFile|urlVideo_\d+)\s*=\s*url_cache\s*\+\s*'([^']+)'", webpage, 'video URL')          video_url = prefix + video_suffix          thumbnail_suffix = self._search_regex( -            r"URLMediaStill = url_cache \+ '([^']+)'", webpage, 'thumbnail URL', -            fatal=False) +            r"(?:URLMediaStill|urlFotogramaFijo_\d+)\s*=\s*url_cache\s*\+\s*'([^']+)'", +            webpage, 'thumbnail URL', fatal=False)          thumbnail = (              None if thumbnail_suffix is None              else prefix + thumbnail_suffix)          title = self._html_search_regex( -            '<h2 class="entry-header entry-title.*?>(.*?)</h2>', +            (r"tituloVideo\s*=\s*'([^']+)'", webpage, 'title', +             r'<h2 class="entry-header entry-title.*?>(.*?)</h2>'),              webpage, 'title') -        date_str = self._search_regex( +        upload_date = unified_strdate(self._search_regex(              r'<p class="date-header date-int updated"\s+title="([^"]+)">', -            webpage, 'upload date', fatal=False) -        upload_date = (None if date_str is None else unified_strdate(date_str)) +            webpage, 'upload date', default=None) or self._html_search_meta( +            'datePublished', webpage, 'timestamp'))          return {              'id': video_id, diff --git a/youtube_dl/extractor/engadget.py b/youtube_dl/extractor/engadget.py index e4180701d..e5e57d485 100644 --- a/youtube_dl/extractor/engadget.py +++ b/youtube_dl/extractor/engadget.py @@ -1,21 +1,13 @@  from __future__ import unicode_literals -import re -  from .common import InfoExtractor -from ..utils import ( -    url_basename, -)  class EngadgetIE(InfoExtractor): -    _VALID_URL = r'''(?x)https?://www.engadget.com/ -        (?:video(?:/5min)?/(?P<id>\d+)| -            [\d/]+/.*?) -        ''' +    _VALID_URL = r'https?://www.engadget.com/video/(?P<id>\d+)'      _TEST = { -        'url': 'http://www.engadget.com/video/5min/518153925/', +        'url': 'http://www.engadget.com/video/518153925/',          'md5': 'c6820d4828a5064447a4d9fc73f312c9',          'info_dict': {              'id': '518153925', @@ -27,15 +19,4 @@ class EngadgetIE(InfoExtractor):      def _real_extract(self, url):          video_id = self._match_id(url) - -        if video_id is not None: -            return self.url_result('5min:%s' % video_id) -        else: -            title = url_basename(url) -            webpage = self._download_webpage(url, title) -            ids = re.findall(r'<iframe[^>]+?playList=(\d+)', webpage) -            return { -                '_type': 'playlist', -                'title': title, -                'entries': [self.url_result('5min:%s' % vid) for vid in ids] -            } +        return self.url_result('5min:%s' % video_id) diff --git a/youtube_dl/extractor/fivemin.py b/youtube_dl/extractor/fivemin.py index 2955965d9..67d50a386 100644 --- a/youtube_dl/extractor/fivemin.py +++ b/youtube_dl/extractor/fivemin.py @@ -1,5 +1,7 @@  from __future__ import unicode_literals +import re +  from .common import InfoExtractor  from ..compat import (      compat_urllib_parse, @@ -16,12 +18,7 @@ from ..utils import (  class FiveMinIE(InfoExtractor):      IE_NAME = '5min' -    _VALID_URL = r'''(?x) -        (?:https?://[^/]*?5min\.com/Scripts/PlayerSeed\.js\?(?:.*?&)?playList=| -            https?://(?:(?:massively|www)\.)?joystiq\.com/video/| -            5min:) -        (?P<id>\d+) -        ''' +    _VALID_URL = r'(?:5min:(?P<id>\d+)(?::(?P<sid>\d+))?|https?://[^/]*?5min\.com/Scripts/PlayerSeed\.js\?(?P<query>.*))'      _TESTS = [          { @@ -45,6 +42,7 @@ class FiveMinIE(InfoExtractor):                  'title': 'How to Make a Next-Level Fruit Salad',                  'duration': 184,              }, +            'skip': 'no longer available',          },      ]      _ERRORS = { @@ -91,20 +89,33 @@ class FiveMinIE(InfoExtractor):      }      def _real_extract(self, url): -        video_id = self._match_id(url) +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('id') +        sid = mobj.group('sid') + +        if mobj.group('query'): +            qs = compat_parse_qs(mobj.group('query')) +            if not qs.get('playList'): +                raise ExtractorError('Invalid URL', expected=True) +            video_id = qs['playList'][0] +            if qs.get('sid'): +                sid = qs['sid'][0] +          embed_url = 'https://embed.5min.com/playerseed/?playList=%s' % video_id -        embed_page = self._download_webpage(embed_url, video_id, -                                            'Downloading embed page') -        sid = self._search_regex(r'sid=(\d+)', embed_page, 'sid') -        query = compat_urllib_parse.urlencode({ -            'func': 'GetResults', -            'playlist': video_id, -            'sid': sid, -            'isPlayerSeed': 'true', -            'url': embed_url, -        }) +        if not sid: +            embed_page = self._download_webpage(embed_url, video_id, +                                                'Downloading embed page') +            sid = self._search_regex(r'sid=(\d+)', embed_page, 'sid') +          response = self._download_json( -            'https://syn.5min.com/handlers/SenseHandler.ashx?' + query, +            'https://syn.5min.com/handlers/SenseHandler.ashx?' + +            compat_urllib_parse.urlencode({ +                'func': 'GetResults', +                'playlist': video_id, +                'sid': sid, +                'isPlayerSeed': 'true', +                'url': embed_url, +            }),              video_id)          if not response['success']:              raise ExtractorError( @@ -118,9 +129,7 @@ class FiveMinIE(InfoExtractor):          parsed_video_url = compat_urllib_parse_urlparse(compat_parse_qs(              compat_urllib_parse_urlparse(info['EmbededURL']).query)['videoUrl'][0])          for rendition in info['Renditions']: -            if rendition['RenditionType'] == 'm3u8': -                formats.extend(self._extract_m3u8_formats(rendition['Url'], video_id, m3u8_id='hls')) -            elif rendition['RenditionType'] == 'aac': +            if rendition['RenditionType'] == 'aac' or rendition['RenditionType'] == 'm3u8':                  continue              else:                  rendition_url = compat_urlparse.urlunparse(parsed_video_url._replace(path=replace_extension(parsed_video_url.path.replace('//', '/%s/' % rendition['ID']), rendition['RenditionType']))) diff --git a/youtube_dl/extractor/foxnews.py b/youtube_dl/extractor/foxnews.py index 318ac013d..1dc50318c 100644 --- a/youtube_dl/extractor/foxnews.py +++ b/youtube_dl/extractor/foxnews.py @@ -36,6 +36,10 @@ class FoxNewsIE(AMPIE):                  # 'upload_date': '20141204',                  'thumbnail': 're:^https?://.*\.jpg$',              }, +            'params': { +                # m3u8 download +                'skip_download': True, +            },          },          {              'url': 'http://video.foxnews.com/v/video-embed.html?video_id=3937480&d=video.foxnews.com', diff --git a/youtube_dl/extractor/indavideo.py b/youtube_dl/extractor/indavideo.py index 12fb5e8e1..9622f198a 100644 --- a/youtube_dl/extractor/indavideo.py +++ b/youtube_dl/extractor/indavideo.py @@ -73,7 +73,7 @@ class IndavideoEmbedIE(InfoExtractor):              'url': self._proto_relative_url(thumbnail)          } for thumbnail in video.get('thumbnails', [])] -        tags = [tag['title'] for tag in video.get('tags', [])] +        tags = [tag['title'] for tag in video.get('tags') or []]          return {              'id': video.get('id') or video_id, diff --git a/youtube_dl/extractor/jeuxvideo.py b/youtube_dl/extractor/jeuxvideo.py index eef7daa29..137db873c 100644 --- a/youtube_dl/extractor/jeuxvideo.py +++ b/youtube_dl/extractor/jeuxvideo.py @@ -30,7 +30,7 @@ class JeuxVideoIE(InfoExtractor):          webpage = self._download_webpage(url, title)          title = self._html_search_meta('name', webpage) or self._og_search_title(webpage)          config_url = self._html_search_regex( -            r'data-src="(/contenu/medias/video.php.*?)"', +            r'data-src(?:set-video)?="(/contenu/medias/video.php.*?)"',              webpage, 'config URL')          config_url = 'http://www.jeuxvideo.com' + config_url diff --git a/youtube_dl/extractor/kusi.py b/youtube_dl/extractor/kusi.py new file mode 100644 index 000000000..931f34c9b --- /dev/null +++ b/youtube_dl/extractor/kusi.py @@ -0,0 +1,99 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import random +import re + +from .common import InfoExtractor +from ..compat import compat_urllib_parse_unquote_plus +from ..utils import ( +    int_or_none, +    float_or_none, +    timeconvert, +    update_url_query, +    xpath_text, +) + + +class KUSIIE(InfoExtractor): +    _VALID_URL = r'http://(?:www\.)?kusi\.com/(?P<path>story/.+|video\?clipId=(?P<clipId>\d+))' +    _TESTS = [{ +        'url': 'http://www.kusi.com/story/31183873/turko-files-case-closed-put-on-hold', +        'md5': 'f926e7684294cf8cb7bdf8858e1b3988', +        'info_dict': { +            'id': '12203019', +            'ext': 'mp4', +            'title': 'Turko Files: Case Closed! & Put On Hold!', +            'duration': 231.0, +            'upload_date': '20160210', +            'timestamp': 1455087571, +            'thumbnail': 're:^https?://.*\.jpg$' +        }, +    }, { +        'url': 'http://kusi.com/video?clipId=12203019', +        'info_dict': { +            'id': '12203019', +            'ext': 'mp4', +            'title': 'Turko Files: Case Closed! & Put On Hold!', +            'duration': 231.0, +            'upload_date': '20160210', +            'timestamp': 1455087571, +            'thumbnail': 're:^https?://.*\.jpg$' +        }, +        'params': { +            'skip_download': True,  # Same as previous one +        }, +    }] + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        clip_id = mobj.group('clipId') +        video_id = clip_id or mobj.group('path') + +        webpage = self._download_webpage(url, video_id) + +        if clip_id is None: +            video_id = clip_id = self._html_search_regex( +                r'"clipId"\s*,\s*"(\d+)"', webpage, 'clip id') + +        affiliate_id = self._search_regex( +            r'affiliateId\s*:\s*\'([^\']+)\'', webpage, 'affiliate id') + +        # See __Packages/worldnow/model/GalleryModel.as of WNGallery.swf +        xml_url = update_url_query('http://www.kusi.com/build.asp', { +            'buildtype': 'buildfeaturexmlrequest', +            'featureType': 'Clip', +            'featureid': clip_id, +            'affiliateno': affiliate_id, +            'clientgroupid': '1', +            'rnd': int(round(random.random() * 1000000)), +        }) + +        doc = self._download_xml(xml_url, video_id) + +        video_title = xpath_text(doc, 'HEADLINE', fatal=True) +        duration = float_or_none(xpath_text(doc, 'DURATION'), scale=1000) +        description = xpath_text(doc, 'ABSTRACT') +        thumbnail = xpath_text(doc, './THUMBNAILIMAGE/FILENAME') +        createtion_time = timeconvert(xpath_text(doc, 'rfc822creationdate')) + +        quality_options = doc.find('{http://search.yahoo.com/mrss/}group').findall('{http://search.yahoo.com/mrss/}content') +        formats = [] +        for quality in quality_options: +            formats.append({ +                'url': compat_urllib_parse_unquote_plus(quality.attrib['url']), +                'height': int_or_none(quality.attrib.get('height')), +                'width': int_or_none(quality.attrib.get('width')), +                'vbr': float_or_none(quality.attrib.get('bitratebits'), scale=1000), +            }) +        self._sort_formats(formats) + +        return { +            'id': video_id, +            'title': video_title, +            'description': description, +            'duration': duration, +            'formats': formats, +            'thumbnail': thumbnail, +            'timestamp': createtion_time, +        } diff --git a/youtube_dl/extractor/letv.py b/youtube_dl/extractor/leeco.py index 9fd494c29..df47e88ba 100644 --- a/youtube_dl/extractor/letv.py +++ b/youtube_dl/extractor/leeco.py @@ -1,36 +1,39 @@  # coding: utf-8  from __future__ import unicode_literals +import base64  import datetime +import hashlib  import re  import time -import base64 -import hashlib  from .common import InfoExtractor  from ..compat import ( -    compat_urllib_parse,      compat_ord,      compat_str, +    compat_urllib_parse,  )  from ..utils import (      determine_ext, +    encode_data_uri,      ExtractorError, +    int_or_none, +    orderedSet,      parse_iso8601,      sanitized_Request, -    int_or_none,      str_or_none, -    encode_data_uri,      url_basename,  ) -class LetvIE(InfoExtractor): +class LeIE(InfoExtractor):      IE_DESC = '乐视网' -    _VALID_URL = r'http://www\.letv\.com/ptv/vplay/(?P<id>\d+).html' +    _VALID_URL = r'http://www\.le\.com/ptv/vplay/(?P<id>\d+)\.html' + +    _URL_TEMPLATE = 'http://www.le.com/ptv/vplay/%s.html'      _TESTS = [{ -        'url': 'http://www.letv.com/ptv/vplay/22005890.html', +        'url': 'http://www.le.com/ptv/vplay/22005890.html',          'md5': 'edadcfe5406976f42f9f266057ee5e40',          'info_dict': {              'id': '22005890', @@ -42,7 +45,7 @@ class LetvIE(InfoExtractor):              'hls_prefer_native': True,          },      }, { -        'url': 'http://www.letv.com/ptv/vplay/1415246.html', +        'url': 'http://www.le.com/ptv/vplay/1415246.html',          'info_dict': {              'id': '1415246',              'ext': 'mp4', @@ -54,7 +57,7 @@ class LetvIE(InfoExtractor):          },      }, {          'note': 'This video is available only in Mainland China, thus a proxy is needed', -        'url': 'http://www.letv.com/ptv/vplay/1118082.html', +        'url': 'http://www.le.com/ptv/vplay/1118082.html',          'md5': '2424c74948a62e5f31988438979c5ad1',          'info_dict': {              'id': '1118082', @@ -116,10 +119,10 @@ class LetvIE(InfoExtractor):              'splatid': 101,              'format': 1,              'tkey': self.calc_time_key(int(time.time())), -            'domain': 'www.letv.com' +            'domain': 'www.le.com'          }          play_json_req = sanitized_Request( -            'http://api.letv.com/mms/out/video/playJson?' + compat_urllib_parse.urlencode(params) +            'http://api.le.com/mms/out/video/playJson?' + compat_urllib_parse.urlencode(params)          )          cn_verification_proxy = self._downloader.params.get('cn_verification_proxy')          if cn_verification_proxy: @@ -192,26 +195,51 @@ class LetvIE(InfoExtractor):          } -class LetvTvIE(InfoExtractor): -    _VALID_URL = r'http://www.letv.com/tv/(?P<id>\d+).html' +class LePlaylistIE(InfoExtractor): +    _VALID_URL = r'http://[a-z]+\.le\.com/[a-z]+/(?P<id>[a-z0-9_]+)' +      _TESTS = [{ -        'url': 'http://www.letv.com/tv/46177.html', +        'url': 'http://www.le.com/tv/46177.html',          'info_dict': {              'id': '46177',              'title': '美人天下',              'description': 'md5:395666ff41b44080396e59570dbac01c'          },          'playlist_count': 35 +    }, { +        'url': 'http://tv.le.com/izt/wuzetian/index.html', +        'info_dict': { +            'id': 'wuzetian', +            'title': '武媚娘传奇', +            'description': 'md5:e12499475ab3d50219e5bba00b3cb248' +        }, +        # This playlist contains some extra videos other than the drama itself +        'playlist_mincount': 96 +    }, { +        'url': 'http://tv.le.com/pzt/lswjzzjc/index.shtml', +        # This series is moved to http://www.le.com/tv/10005297.html +        'only_matching': True, +    }, { +        'url': 'http://www.le.com/comic/92063.html', +        'only_matching': True, +    }, { +        'url': 'http://list.le.com/listn/c1009_sc532002_d2_p1_o1.html', +        'only_matching': True,      }] +    @classmethod +    def suitable(cls, url): +        return False if LeIE.suitable(url) else super(LePlaylistIE, cls).suitable(url) +      def _real_extract(self, url):          playlist_id = self._match_id(url)          page = self._download_webpage(url, playlist_id) -        media_urls = list(set(re.findall( -            r'http://www.letv.com/ptv/vplay/\d+.html', page))) -        entries = [self.url_result(media_url, ie='Letv') -                   for media_url in media_urls] +        # Currently old domain names are still used in playlists +        media_ids = orderedSet(re.findall( +            r'<a[^>]+href="http://www\.letv\.com/ptv/vplay/(\d+)\.html', page)) +        entries = [self.url_result(LeIE._URL_TEMPLATE % media_id, ie='Le') +                   for media_id in media_ids]          title = self._html_search_meta('keywords', page,                                         fatal=False).split(',')[0] @@ -221,31 +249,9 @@ class LetvTvIE(InfoExtractor):                                      playlist_description=description) -class LetvPlaylistIE(LetvTvIE): -    _VALID_URL = r'http://tv.letv.com/[a-z]+/(?P<id>[a-z]+)/index.s?html' -    _TESTS = [{ -        'url': 'http://tv.letv.com/izt/wuzetian/index.html', -        'info_dict': { -            'id': 'wuzetian', -            'title': '武媚娘传奇', -            'description': 'md5:e12499475ab3d50219e5bba00b3cb248' -        }, -        # This playlist contains some extra videos other than the drama itself -        'playlist_mincount': 96 -    }, { -        'url': 'http://tv.letv.com/pzt/lswjzzjc/index.shtml', -        'info_dict': { -            'id': 'lswjzzjc', -            # The title should be "劲舞青春", but I can't find a simple way to -            # determine the playlist title -            'title': '乐视午间自制剧场', -            'description': 'md5:b1eef244f45589a7b5b1af9ff25a4489' -        }, -        'playlist_mincount': 7 -    }] - -  class LetvCloudIE(InfoExtractor): +    # Most of *.letv.com is changed to *.le.com on 2016/01/02 +    # but yuntv.letv.com is kept, so also keep the extractor name      IE_DESC = '乐视云'      _VALID_URL = r'https?://yuntv\.letv\.com/bcloud.html\?.+' @@ -326,7 +332,7 @@ class LetvCloudIE(InfoExtractor):              formats.append({                  'url': url,                  'ext': determine_ext(decoded_url), -                'format_id': int_or_none(play_url.get('vtype')), +                'format_id': str_or_none(play_url.get('vtype')),                  'format_note': str_or_none(play_url.get('definition')),                  'width': int_or_none(play_url.get('vwidth')),                  'height': int_or_none(play_url.get('vheight')), diff --git a/youtube_dl/extractor/lifenews.py b/youtube_dl/extractor/lifenews.py index f8cbca7b3..a8fd639cc 100644 --- a/youtube_dl/extractor/lifenews.py +++ b/youtube_dl/extractor/lifenews.py @@ -20,18 +20,18 @@ class LifeNewsIE(InfoExtractor):      _VALID_URL = r'http://lifenews\.ru/(?:mobile/)?(?P<section>news|video)/(?P<id>\d+)'      _TESTS = [{ -        'url': 'http://lifenews.ru/news/126342', -        'md5': 'e1b50a5c5fb98a6a544250f2e0db570a', +        # single video embedded via video/source +        'url': 'http://lifenews.ru/news/98736', +        'md5': '77c95eaefaca216e32a76a343ad89d23',          'info_dict': { -            'id': '126342', +            'id': '98736',              'ext': 'mp4', -            'title': 'МВД разыскивает мужчин, оставивших в IKEA сумку с автоматом', -            'description': 'Камеры наблюдения гипермаркета зафиксировали троих мужчин, спрятавших оружейный арсенал в камере хранения.', -            'thumbnail': 're:http://.*\.jpg', -            'upload_date': '20140130', +            'title': 'Мужчина нашел дома архив оборонного завода', +            'description': 'md5:3b06b1b39b5e2bea548e403d99b8bf26', +            'upload_date': '20120805',          }      }, { -        # video in <iframe> +        # single video embedded via iframe          'url': 'http://lifenews.ru/news/152125',          'md5': '77d19a6f0886cd76bdbf44b4d971a273',          'info_dict': { @@ -42,15 +42,33 @@ class LifeNewsIE(InfoExtractor):              'upload_date': '20150402',          }      }, { +        # two videos embedded via iframe          'url': 'http://lifenews.ru/news/153461', -        'md5': '9b6ef8bc0ffa25aebc8bdb40d89ab795',          'info_dict': {              'id': '153461', -            'ext': 'mp4',              'title': 'В Москве спасли потерявшегося медвежонка, который спрятался на дереве',              'description': 'Маленький хищник не смог найти дорогу домой и обрел временное убежище на тополе недалеко от жилого массива, пока его не нашла соседская собака.',              'upload_date': '20150505', -        } +        }, +        'playlist': [{ +            'md5': '9b6ef8bc0ffa25aebc8bdb40d89ab795', +            'info_dict': { +                'id': '153461-video1', +                'ext': 'mp4', +                'title': 'В Москве спасли потерявшегося медвежонка, который спрятался на дереве (Видео 1)', +                'description': 'Маленький хищник не смог найти дорогу домой и обрел временное убежище на тополе недалеко от жилого массива, пока его не нашла соседская собака.', +                'upload_date': '20150505', +            }, +        }, { +            'md5': 'ebb3bf3b1ce40e878d0d628e93eb0322', +            'info_dict': { +                'id': '153461-video2', +                'ext': 'mp4', +                'title': 'В Москве спасли потерявшегося медвежонка, который спрятался на дереве (Видео 2)', +                'description': 'Маленький хищник не смог найти дорогу домой и обрел временное убежище на тополе недалеко от жилого массива, пока его не нашла соседская собака.', +                'upload_date': '20150505', +            }, +        }],      }, {          'url': 'http://lifenews.ru/video/13035',          'only_matching': True, @@ -65,10 +83,14 @@ class LifeNewsIE(InfoExtractor):              'http://lifenews.ru/%s/%s' % (section, video_id),              video_id, 'Downloading page') -        videos = re.findall(r'<video.*?poster="(?P<poster>[^"]+)".*?src="(?P<video>[^"]+)".*?></video>', webpage) -        iframe_link = self._html_search_regex( -            '<iframe[^>]+src=["\']([^"\']+)["\']', webpage, 'iframe link', default=None) -        if not videos and not iframe_link: +        video_urls = re.findall( +            r'<video[^>]+><source[^>]+src=["\'](.+?)["\']', webpage) + +        iframe_links = re.findall( +            r'<iframe[^>]+src=["\']((?:https?:)?//embed\.life\.ru/embed/.+?)["\']', +            webpage) + +        if not video_urls and not iframe_links:              raise ExtractorError('No media links available for %s' % video_id)          title = remove_end( @@ -95,31 +117,44 @@ class LifeNewsIE(InfoExtractor):              'upload_date': upload_date,          } -        def make_entry(video_id, media, video_number=None): +        def make_entry(video_id, video_url, index=None):              cur_info = dict(common_info)              cur_info.update({ -                'id': video_id, -                'url': media[1], -                'thumbnail': media[0], -                'title': title if video_number is None else '%s-video%s' % (title, video_number), +                'id': video_id if not index else '%s-video%s' % (video_id, index), +                'url': video_url, +                'title': title if not index else '%s (Видео %s)' % (title, index),              })              return cur_info -        if iframe_link: -            iframe_link = self._proto_relative_url(iframe_link, 'http:') -            cur_info = dict(common_info) -            cur_info.update({ -                '_type': 'url_transparent', -                'id': video_id, -                'title': title, -                'url': iframe_link, -            }) +        def make_video_entry(video_id, video_url, index=None): +            video_url = compat_urlparse.urljoin(url, video_url) +            return make_entry(video_id, video_url, index) + +        def make_iframe_entry(video_id, video_url, index=None): +            video_url = self._proto_relative_url(video_url, 'http:') +            cur_info = make_entry(video_id, video_url, index) +            cur_info['_type'] = 'url_transparent'              return cur_info -        if len(videos) == 1: -            return make_entry(video_id, videos[0]) -        else: -            return [make_entry(video_id, media, video_number + 1) for video_number, media in enumerate(videos)] +        if len(video_urls) == 1 and not iframe_links: +            return make_video_entry(video_id, video_urls[0]) + +        if len(iframe_links) == 1 and not video_urls: +            return make_iframe_entry(video_id, iframe_links[0]) + +        entries = [] + +        if video_urls: +            for num, video_url in enumerate(video_urls, 1): +                entries.append(make_video_entry(video_id, video_url, num)) + +        if iframe_links: +            for num, iframe_link in enumerate(iframe_links, len(video_urls) + 1): +                entries.append(make_iframe_entry(video_id, iframe_link, num)) + +        playlist = common_info.copy() +        playlist.update(self.playlist_result(entries, video_id, title, description)) +        return playlist  class LifeEmbedIE(InfoExtractor): diff --git a/youtube_dl/extractor/livestream.py b/youtube_dl/extractor/livestream.py index 38fb3d9e4..988436226 100644 --- a/youtube_dl/extractor/livestream.py +++ b/youtube_dl/extractor/livestream.py @@ -64,7 +64,7 @@ class LivestreamIE(InfoExtractor):      def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):          base_ele = find_xpath_attr(              smil, self._xpath_ns('.//meta', namespace), 'name', 'httpBase') -        base = base_ele.get('content') if base_ele else 'http://livestreamvod-f.akamaihd.net/' +        base = base_ele.get('content') if base_ele is not None else 'http://livestreamvod-f.akamaihd.net/'          formats = []          video_nodes = smil.findall(self._xpath_ns('.//video', namespace)) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index a126f5054..3b21fbd4d 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -4,7 +4,10 @@ from __future__ import unicode_literals  import re  from .common import InfoExtractor -from ..compat import compat_urlparse +from ..compat import ( +    compat_urlparse, +    compat_urllib_parse_unquote, +)  from ..utils import (      determine_ext,      ExtractorError, @@ -87,7 +90,7 @@ class NRKIE(InfoExtractor):  class NRKPlaylistIE(InfoExtractor): -    _VALID_URL = r'https?://(?:www\.)?nrk\.no/(?!video)(?:[^/]+/)+(?P<id>[^/]+)' +    _VALID_URL = r'https?://(?:www\.)?nrk\.no/(?!video|skole)(?:[^/]+/)+(?P<id>[^/]+)'      _TESTS = [{          'url': 'http://www.nrk.no/troms/gjenopplev-den-historiske-solformorkelsen-1.12270763', @@ -126,6 +129,37 @@ class NRKPlaylistIE(InfoExtractor):              entries, playlist_id, playlist_title, playlist_description) +class NRKSkoleIE(InfoExtractor): +    IE_DESC = 'NRK Skole' +    _VALID_URL = r'https?://(?:www\.)?nrk\.no/skole/klippdetalj?.*\btopic=(?P<id>[^/?#&]+)' + +    _TESTS = [{ +        'url': 'http://nrk.no/skole/klippdetalj?topic=nrk:klipp/616532', +        'md5': '04cd85877cc1913bce73c5d28a47e00f', +        'info_dict': { +            'id': '6021', +            'ext': 'flv', +            'title': 'Genetikk og eneggede tvillinger', +            'description': 'md5:3aca25dcf38ec30f0363428d2b265f8d', +            'duration': 399, +        }, +    }, { +        'url': 'http://www.nrk.no/skole/klippdetalj?topic=nrk%3Aklipp%2F616532#embed', +        'only_matching': True, +    }, { +        'url': 'http://www.nrk.no/skole/klippdetalj?topic=urn:x-mediadb:21379', +        'only_matching': True, +    }] + +    def _real_extract(self, url): +        video_id = compat_urllib_parse_unquote(self._match_id(url)) + +        webpage = self._download_webpage(url, video_id) + +        nrk_id = self._search_regex(r'data-nrk-id=["\'](\d+)', webpage, 'nrk id') +        return self.url_result('nrk:%s' % nrk_id) + +  class NRKTVIE(InfoExtractor):      IE_DESC = 'NRK TV and NRK Radio'      _VALID_URL = r'(?P<baseurl>https?://(?:tv|radio)\.nrk(?:super)?\.no/)(?:serie/[^/]+|program)/(?P<id>[a-zA-Z]{4}\d{8})(?:/\d{2}-\d{2}-\d{4})?(?:#del=(?P<part_id>\d+))?' diff --git a/youtube_dl/extractor/revision3.py b/youtube_dl/extractor/revision3.py index b1b8800b9..99979ebe1 100644 --- a/youtube_dl/extractor/revision3.py +++ b/youtube_dl/extractor/revision3.py @@ -19,7 +19,7 @@ class Revision3IE(InfoExtractor):          'url': 'http://www.revision3.com/technobuffalo/5-google-predictions-for-2016',          'md5': 'd94a72d85d0a829766de4deb8daaf7df',          'info_dict': { -            'id': '73034', +            'id': '71089',              'display_id': 'technobuffalo/5-google-predictions-for-2016',              'ext': 'webm',              'title': '5 Google Predictions for 2016', @@ -31,6 +31,7 @@ class Revision3IE(InfoExtractor):              'uploader_id': 'technobuffalo',          }      }, { +        # Show          'url': 'http://testtube.com/brainstuff',          'info_dict': {              'id': '251', @@ -41,7 +42,7 @@ class Revision3IE(InfoExtractor):      }, {          'url': 'https://testtube.com/dnews/5-weird-ways-plants-can-eat-animals?utm_source=FB&utm_medium=DNews&utm_campaign=DNewsSocial',          'info_dict': { -            'id': '60163', +            'id': '58227',              'display_id': 'dnews/5-weird-ways-plants-can-eat-animals',              'duration': 275,              'ext': 'webm', @@ -52,18 +53,72 @@ class Revision3IE(InfoExtractor):              'uploader': 'DNews',              'uploader_id': 'dnews',          }, +    }, { +        'url': 'http://testtube.com/tt-editors-picks/the-israel-palestine-conflict-explained-in-ten-min', +        'info_dict': { +            'id': '71618', +            'ext': 'mp4', +            'display_id': 'tt-editors-picks/the-israel-palestine-conflict-explained-in-ten-min', +            'title': 'The Israel-Palestine Conflict Explained in Ten Minutes', +            'description': 'If you\'d like to learn about the struggle between Israelis and Palestinians, this video is a great place to start', +            'uploader': 'Editors\' Picks', +            'uploader_id': 'tt-editors-picks', +            'timestamp': 1453309200, +            'upload_date': '20160120', +        }, +        'add_ie': ['Youtube'], +    }, { +        # Tag +        'url': 'http://testtube.com/tech-news', +        'info_dict': { +            'id': '21018', +            'title': 'tech news', +        }, +        'playlist_mincount': 9,      }]      _PAGE_DATA_TEMPLATE = 'http://www.%s/apiProxy/ddn/%s?domain=%s'      _API_KEY = 'ba9c741bce1b9d8e3defcc22193f3651b8867e62'      def _real_extract(self, url):          domain, display_id = re.match(self._VALID_URL, url).groups() +        site = domain.split('.')[0]          page_info = self._download_json(              self._PAGE_DATA_TEMPLATE % (domain, display_id, domain), display_id) -        if page_info['data']['type'] == 'episode': -            episode_data = page_info['data'] -            video_id = compat_str(episode_data['video']['data']['id']) +        page_data = page_info['data'] +        page_type = page_data['type'] +        if page_type in ('episode', 'embed'): +            show_data = page_data['show']['data'] +            page_id = compat_str(page_data['id']) +            video_id = compat_str(page_data['video']['data']['id']) + +            preference = qualities(['mini', 'small', 'medium', 'large']) +            thumbnails = [{ +                'url': image_url, +                'id': image_id, +                'preference': preference(image_id) +            } for image_id, image_url in page_data.get('images', {}).items()] + +            info = { +                'id': page_id, +                'display_id': display_id, +                'title': unescapeHTML(page_data['name']), +                'description': unescapeHTML(page_data.get('summary')), +                'timestamp': parse_iso8601(page_data.get('publishTime'), ' '), +                'author': page_data.get('author'), +                'uploader': show_data.get('name'), +                'uploader_id': show_data.get('slug'), +                'thumbnails': thumbnails, +                'extractor_key': site, +            } + +            if page_type == 'embed': +                info.update({ +                    '_type': 'url_transparent', +                    'url': page_data['video']['data']['embed'], +                }) +                return info +              video_data = self._download_json(                  'http://revision3.com/api/getPlaylist.json?api_key=%s&codecs=h264,vp8,theora&video_id=%s' % (self._API_KEY, video_id),                  video_id)['items'][0] @@ -84,36 +139,30 @@ class Revision3IE(InfoExtractor):                          })              self._sort_formats(formats) -            preference = qualities(['mini', 'small', 'medium', 'large']) -            thumbnails = [{ -                'url': image_url, -                'id': image_id, -                'preference': preference(image_id) -            } for image_id, image_url in video_data.get('images', {}).items()] - -            return { -                'id': video_id, -                'display_id': display_id, +            info.update({                  'title': unescapeHTML(video_data['title']),                  'description': unescapeHTML(video_data.get('summary')), -                'timestamp': parse_iso8601(episode_data.get('publishTime'), ' '), -                'author': episode_data.get('author'),                  'uploader': video_data.get('show', {}).get('name'),                  'uploader_id': video_data.get('show', {}).get('slug'),                  'duration': int_or_none(video_data.get('duration')), -                'thumbnails': thumbnails,                  'formats': formats, -            } +            }) +            return info          else: -            show_data = page_info['show']['data'] +            list_data = page_info[page_type]['data']              episodes_data = page_info['episodes']['data']              num_episodes = page_info['meta']['totalEpisodes']              processed_episodes = 0              entries = []              page_num = 1              while True: -                entries.extend([self.url_result( -                    'http://%s/%s/%s' % (domain, display_id, episode['slug'])) for episode in episodes_data]) +                entries.extend([{ +                    '_type': 'url', +                    'url': 'http://%s%s' % (domain, episode['path']), +                    'id': compat_str(episode['id']), +                    'ie_key': 'Revision3', +                    'extractor_key': site, +                } for episode in episodes_data])                  processed_episodes += len(episodes_data)                  if processed_episodes == num_episodes:                      break @@ -123,5 +172,5 @@ class Revision3IE(InfoExtractor):                      display_id)['episodes']['data']              return self.playlist_result( -                entries, compat_str(show_data['id']), -                show_data.get('name'), show_data.get('summary')) +                entries, compat_str(list_data['id']), +                list_data.get('name'), list_data.get('summary')) diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index 93d871571..9a57b49df 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -22,6 +22,7 @@ from ..utils import (      unsmuggle_url,      xpath_with_ns,      mimetype2ext, +    find_xpath_attr,  )  default_ns = 'http://www.w3.org/2005/SMIL21/Language' @@ -31,15 +32,11 @@ _x = lambda p: xpath_with_ns(p, {'smil': default_ns})  class ThePlatformBaseIE(InfoExtractor):      def _extract_theplatform_smil(self, smil_url, video_id, note='Downloading SMIL data'):          meta = self._download_xml(smil_url, video_id, note=note) -        try: -            error_msg = next( -                n.attrib['abstract'] -                for n in meta.findall(_x('.//smil:ref')) -                if n.attrib.get('title') == 'Geographic Restriction' or n.attrib.get('title') == 'Expired') -        except StopIteration: -            pass -        else: -            raise ExtractorError(error_msg, expected=True) +        error_element = find_xpath_attr( +            meta, _x('.//smil:ref'), 'src', +            'http://link.theplatform.com/s/errorFiles/Unavailable.mp4') +        if error_element is not None: +            raise ExtractorError(error_element.attrib['abstract'], expected=True)          formats = self._parse_smil_formats(              meta, smil_url, video_id, namespace=default_ns, diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index 69882da63..8639293e3 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -17,6 +17,7 @@ from ..utils import (      encode_dict,      ExtractorError,      int_or_none, +    orderedSet,      parse_duration,      parse_iso8601,      sanitized_Request, @@ -281,17 +282,36 @@ class TwitchPlaylistBaseIE(TwitchBaseIE):          entries = []          offset = 0          limit = self._PAGE_LIMIT +        broken_paging_detected = False +        counter_override = None          for counter in itertools.count(1):              response = self._download_json(                  self._PLAYLIST_URL % (channel_id, offset, limit), -                channel_id, 'Downloading %s videos JSON page %d' % (self._PLAYLIST_TYPE, counter)) +                channel_id, +                'Downloading %s videos JSON page %s' +                % (self._PLAYLIST_TYPE, counter_override or counter))              page_entries = self._extract_playlist_page(response)              if not page_entries:                  break +            total = int_or_none(response.get('_total')) +            # Since the beginning of March 2016 twitch's paging mechanism +            # is completely broken on the twitch side. It simply ignores +            # a limit and returns the whole offset number of videos. +            # Working around by just requesting all videos at once. +            if not broken_paging_detected and total and len(page_entries) > limit: +                self.report_warning( +                    'Twitch paging is broken on twitch side, requesting all videos at once', +                    channel_id) +                broken_paging_detected = True +                offset = total +                counter_override = '(all at once)' +                continue              entries.extend(page_entries) +            if broken_paging_detected or total and len(page_entries) >= total: +                break              offset += limit          return self.playlist_result( -            [self.url_result(entry) for entry in set(entries)], +            [self.url_result(entry) for entry in orderedSet(entries)],              channel_id, channel_name)      def _extract_playlist_page(self, response): diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index 5d2b5ec35..e70b2ab3c 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -10,7 +10,6 @@ from ..utils import (      remove_end,      int_or_none,      ExtractorError, -    sanitized_Request,  ) @@ -22,7 +21,7 @@ class TwitterBaseIE(InfoExtractor):  class TwitterCardIE(TwitterBaseIE):      IE_NAME = 'twitter:card' -    _VALID_URL = r'https?://(?:www\.)?twitter\.com/i/cards/tfw/v1/(?P<id>\d+)' +    _VALID_URL = r'https?://(?:www\.)?twitter\.com/i/(?:cards/tfw/v1|videos/tweet)/(?P<id>\d+)'      _TESTS = [          {              'url': 'https://twitter.com/i/cards/tfw/v1/560070183650213889', @@ -30,7 +29,7 @@ class TwitterCardIE(TwitterBaseIE):              'info_dict': {                  'id': '560070183650213889',                  'ext': 'mp4', -                'title': 'TwitterCard', +                'title': 'Twitter Card',                  'thumbnail': 're:^https?://.*\.jpg$',                  'duration': 30.033,              } @@ -41,7 +40,7 @@ class TwitterCardIE(TwitterBaseIE):              'info_dict': {                  'id': '623160978427936768',                  'ext': 'mp4', -                'title': 'TwitterCard', +                'title': 'Twitter Card',                  'thumbnail': 're:^https?://.*\.jpg',                  'duration': 80.155,              }, @@ -72,63 +71,102 @@ class TwitterCardIE(TwitterBaseIE):                  'title': 'Vine by ArsenalTerje',              },              'add_ie': ['Vine'], -        } +        }, { +            'url': 'https://twitter.com/i/videos/tweet/705235433198714880', +            'md5': '3846d0a07109b5ab622425449b59049d', +            'info_dict': { +                'id': '705235433198714880', +                'ext': 'mp4', +                'title': 'Twitter web player', +                'thumbnail': 're:^https?://.*\.jpg', +            }, +        },      ]      def _real_extract(self, url):          video_id = self._match_id(url) -        # Different formats served for different User-Agents -        USER_AGENTS = [ -            'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)',  # mp4 -            'Mozilla/5.0 (Windows NT 5.2; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0',  # webm -        ] -          config = None          formats = [] -        for user_agent in USER_AGENTS: -            request = sanitized_Request(url) -            request.add_header('User-Agent', user_agent) -            webpage = self._download_webpage(request, video_id) - -            iframe_url = self._html_search_regex( -                r'<iframe[^>]+src="((?:https?:)?//(?:www.youtube.com/embed/[^"]+|(?:www\.)?vine\.co/v/\w+/card))"', -                webpage, 'video iframe', default=None) -            if iframe_url: -                return self.url_result(iframe_url) - -            config = self._parse_json(self._html_search_regex( -                r'data-player-config="([^"]+)"', webpage, 'data player config'), -                video_id) -            if 'playlist' not in config: -                if 'vmapUrl' in config: -                    formats.append({ -                        'url': self._get_vmap_video_url(config['vmapUrl'], video_id), -                    }) -                    break   # same video regardless of UA -                continue - -            video_url = config['playlist'][0]['source'] +        duration = None -            f = { -                'url': video_url, -            } +        webpage = self._download_webpage(url, video_id) + +        iframe_url = self._html_search_regex( +            r'<iframe[^>]+src="((?:https?:)?//(?:www.youtube.com/embed/[^"]+|(?:www\.)?vine\.co/v/\w+/card))"', +            webpage, 'video iframe', default=None) +        if iframe_url: +            return self.url_result(iframe_url) +        config = self._parse_json(self._html_search_regex( +            r'data-(?:player-)?config="([^"]+)"', webpage, 'data player config'), +            video_id) + +        def _search_dimensions_in_video_url(a_format, video_url):              m = re.search(r'/(?P<width>\d+)x(?P<height>\d+)/', video_url)              if m: -                f.update({ +                a_format.update({                      'width': int(m.group('width')),                      'height': int(m.group('height')),                  }) + +        playlist = config.get('playlist') +        if playlist: +            video_url = playlist[0]['source'] + +            f = { +                'url': video_url, +            } + +            _search_dimensions_in_video_url(f, video_url) +              formats.append(f) + +        vmap_url = config.get('vmapUrl') or config.get('vmap_url') +        if vmap_url: +            formats.append({ +                'url': self._get_vmap_video_url(vmap_url, video_id), +            }) + +        media_info = None + +        for entity in config.get('status', {}).get('entities', []): +            if 'mediaInfo' in entity: +                media_info = entity['mediaInfo'] + +        if media_info: +            for media_variant in media_info['variants']: +                media_url = media_variant['url'] +                if media_url.endswith('.m3u8'): +                    formats.extend(self._extract_m3u8_formats(media_url, video_id, ext='mp4', m3u8_id='hls')) +                elif media_url.endswith('.mpd'): +                    formats.extend(self._extract_mpd_formats(media_url, video_id, mpd_id='dash')) +                else: +                    vbr = int_or_none(media_variant.get('bitRate'), scale=1000) +                    a_format = { +                        'url': media_url, +                        'format_id': 'http-%d' % vbr if vbr else 'http', +                        'vbr': vbr, +                    } +                    # Reported bitRate may be zero +                    if not a_format['vbr']: +                        del a_format['vbr'] + +                    _search_dimensions_in_video_url(a_format, media_url) + +                    formats.append(a_format) + +            duration = float_or_none(media_info.get('duration', {}).get('nanos'), scale=1e9) +          self._sort_formats(formats) -        thumbnail = config.get('posterImageUrl') -        duration = float_or_none(config.get('duration')) +        title = self._search_regex(r'<title>([^<]+)</title>', webpage, 'title') +        thumbnail = config.get('posterImageUrl') or config.get('image_src') +        duration = float_or_none(config.get('duration')) or duration          return {              'id': video_id, -            'title': 'TwitterCard', +            'title': title,              'thumbnail': thumbnail,              'duration': duration,              'formats': formats, @@ -142,7 +180,6 @@ class TwitterIE(InfoExtractor):      _TESTS = [{          'url': 'https://twitter.com/freethenipple/status/643211948184596480', -        # MD5 checksums are different in different places          'info_dict': {              'id': '643211948184596480',              'ext': 'mp4', @@ -153,6 +190,9 @@ class TwitterIE(InfoExtractor):              'uploader': 'FREE THE NIPPLE',              'uploader_id': 'freethenipple',          }, +        'params': { +            'skip_download': True,  # requires ffmpeg +        },      }, {          'url': 'https://twitter.com/giphz/status/657991469417025536/photo/1',          'md5': 'f36dcd5fb92bf7057f155e7d927eeb42', @@ -177,6 +217,36 @@ class TwitterIE(InfoExtractor):              'uploader_id': 'starwars',              'uploader': 'Star Wars',          }, +    }, { +        'url': 'https://twitter.com/BTNBrentYarina/status/705235433198714880', +        'info_dict': { +            'id': '705235433198714880', +            'ext': 'mp4', +            'title': 'Brent Yarina - Khalil Iverson\'s missed highlight dunk. And made highlight dunk. In one highlight.', +            'description': 'Brent Yarina on Twitter: "Khalil Iverson\'s missed highlight dunk. And made highlight dunk. In one highlight."', +            'uploader_id': 'BTNBrentYarina', +            'uploader': 'Brent Yarina', +        }, +        'params': { +            # The same video as https://twitter.com/i/videos/tweet/705235433198714880 +            # Test case of TwitterCardIE +            'skip_download': True, +        }, +    }, { +        'url': 'https://twitter.com/jaydingeer/status/700207533655363584', +        'md5': '', +        'info_dict': { +            'id': '700207533655363584', +            'ext': 'mp4', +            'title': 'jay - BEAT PROD: @suhmeduh #Damndaniel', +            'description': 'jay on Twitter: "BEAT PROD: @suhmeduh  https://t.co/HBrQ4AfpvZ #Damndaniel https://t.co/byBooq2ejZ"', +            'thumbnail': 're:^https?://.*\.jpg', +            'uploader': 'jay', +            'uploader_id': 'jaydingeer', +        }, +        'params': { +            'skip_download': True,  # requires ffmpeg +        },      }]      def _real_extract(self, url): @@ -234,6 +304,15 @@ class TwitterIE(InfoExtractor):              })              return info +        if 'class="PlayableMedia' in webpage: +            info.update({ +                '_type': 'url_transparent', +                'ie_key': 'TwitterCard', +                'url': '%s//twitter.com/i/videos/tweet/%s' % (self.http_scheme(), twid), +            }) + +            return info +          raise ExtractorError('There\'s no video in this tweet.') diff --git a/youtube_dl/extractor/vgtv.py b/youtube_dl/extractor/vgtv.py index 14e945d49..e148b1ef5 100644 --- a/youtube_dl/extractor/vgtv.py +++ b/youtube_dl/extractor/vgtv.py @@ -20,6 +20,7 @@ class VGTVIE(XstreamIE):          'aftenbladet.no/tv': 'satv',          'fvn.no/fvntv': 'fvntv',          'aftenposten.no/webtv': 'aptv', +        'ap.vgtv.no/webtv': 'aptv',      }      _APP_NAME_TO_VENDOR = { @@ -35,7 +36,7 @@ class VGTVIE(XstreamIE):                      (?P<host>                          %s                      ) -                    / +                    /?                      (?:                          \#!/(?:video|live)/|                          embed?.*id= @@ -107,19 +108,27 @@ class VGTVIE(XstreamIE):              'md5': 'fd828cd29774a729bf4d4425fe192972',              'info_dict': {                  'id': '21039', -                'ext': 'mov', +                'ext': 'mp4',                  'title': 'TRAILER: «SWEATSHOP» - I can´t take any more',                  'description': 'md5:21891f2b0dd7ec2f78d84a50e54f8238',                  'duration': 66,                  'timestamp': 1417002452,                  'upload_date': '20141126',                  'view_count': int, -            } +            }, +            'params': { +                # m3u8 download +                'skip_download': True, +            },          },          {              'url': 'http://www.bt.no/tv/#!/video/100250/norling-dette-er-forskjellen-paa-1-divisjon-og-eliteserien',              'only_matching': True,          }, +        { +            'url': 'http://ap.vgtv.no/webtv#!/video/111084/de-nye-bysyklene-lettere-bedre-gir-stoerre-hjul-og-feste-til-mobil', +            'only_matching': True, +        },      ]      def _real_extract(self, url): @@ -144,8 +153,6 @@ class VGTVIE(XstreamIE):          if len(video_id) == 5:              if appname == 'bttv':                  info = self._extract_video_info('btno', video_id) -            elif appname == 'aptv': -                info = self._extract_video_info('ap', video_id)          streams = data['streamUrls']          stream_type = data.get('streamType') diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 3049dffb6..9f282a1da 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -93,6 +93,7 @@ class VimeoIE(VimeoBaseInfoExtractor):                  'title': "youtube-dl test video - \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550",                  'description': 'md5:2d3305bad981a06ff79f027f19865021',                  'upload_date': '20121220', +                'uploader_url': 're:https?://(?:www\.)?vimeo\.com/user7108434',                  'uploader_id': 'user7108434',                  'uploader': 'Filippo Valsorda',                  'duration': 10, @@ -105,6 +106,7 @@ class VimeoIE(VimeoBaseInfoExtractor):              'info_dict': {                  'id': '68093876',                  'ext': 'mp4', +                'uploader_url': 're:https?://(?:www\.)?vimeo\.com/openstreetmapus',                  'uploader_id': 'openstreetmapus',                  'uploader': 'OpenStreetMap US',                  'title': 'Andy Allan - Putting the Carto into OpenStreetMap Cartography', @@ -121,6 +123,7 @@ class VimeoIE(VimeoBaseInfoExtractor):                  'ext': 'mp4',                  'title': 'Kathy Sierra: Building the minimum Badass User, Business of Software 2012',                  'uploader': 'The BLN & Business of Software', +                'uploader_url': 're:https?://(?:www\.)?vimeo\.com/theblnbusinessofsoftware',                  'uploader_id': 'theblnbusinessofsoftware',                  'duration': 3610,                  'description': None, @@ -135,6 +138,7 @@ class VimeoIE(VimeoBaseInfoExtractor):                  'ext': 'mp4',                  'title': 'youtube-dl password protected test video',                  'upload_date': '20130614', +                'uploader_url': 're:https?://(?:www\.)?vimeo\.com/user18948128',                  'uploader_id': 'user18948128',                  'uploader': 'Jaime Marquínez Ferrándiz',                  'duration': 10, @@ -154,6 +158,7 @@ class VimeoIE(VimeoBaseInfoExtractor):                  'ext': 'mp4',                  'title': 'Key & Peele: Terrorist Interrogation',                  'description': 'md5:8678b246399b070816b12313e8b4eb5c', +                'uploader_url': 're:https?://(?:www\.)?vimeo\.com/atencio',                  'uploader_id': 'atencio',                  'uploader': 'Peter Atencio',                  'upload_date': '20130927', @@ -169,6 +174,7 @@ class VimeoIE(VimeoBaseInfoExtractor):                  'title': 'The New Vimeo Player (You Know, For Videos)',                  'description': 'md5:2ec900bf97c3f389378a96aee11260ea',                  'upload_date': '20131015', +                'uploader_url': 're:https?://(?:www\.)?vimeo\.com/staff',                  'uploader_id': 'staff',                  'uploader': 'Vimeo Staff',                  'duration': 62, @@ -183,6 +189,7 @@ class VimeoIE(VimeoBaseInfoExtractor):                  'ext': 'mp4',                  'title': 'Pier Solar OUYA Official Trailer',                  'uploader': 'Tulio Gonçalves', +                'uploader_url': 're:https?://(?:www\.)?vimeo\.com/user28849593',                  'uploader_id': 'user28849593',              },          }, @@ -195,6 +202,7 @@ class VimeoIE(VimeoBaseInfoExtractor):                  'ext': 'mp4',                  'title': 'FOX CLASSICS - Forever Classic ID - A Full Minute',                  'uploader': 'The DMCI', +                'uploader_url': 're:https?://(?:www\.)?vimeo\.com/dmci',                  'uploader_id': 'dmci',                  'upload_date': '20111220',                  'description': 'md5:ae23671e82d05415868f7ad1aec21147', @@ -370,9 +378,10 @@ class VimeoIE(VimeoBaseInfoExtractor):          # Extract title          video_title = config['video']['title'] -        # Extract uploader and uploader_id -        video_uploader = config['video']['owner']['name'] -        video_uploader_id = config['video']['owner']['url'].split('/')[-1] if config['video']['owner']['url'] else None +        # Extract uploader, uploader_url and uploader_id +        video_uploader = config['video'].get('owner', {}).get('name') +        video_uploader_url = config['video'].get('owner', {}).get('url') +        video_uploader_id = video_uploader_url.split('/')[-1] if video_uploader_url else None          # Extract video thumbnail          video_thumbnail = config['video'].get('thumbnail') @@ -473,6 +482,7 @@ class VimeoIE(VimeoBaseInfoExtractor):          return {              'id': video_id,              'uploader': video_uploader, +            'uploader_url': video_uploader_url,              'uploader_id': video_uploader_id,              'upload_date': video_upload_date,              'title': video_title, diff --git a/youtube_dl/extractor/wistia.py b/youtube_dl/extractor/wistia.py index fdb16d91c..41061dd31 100644 --- a/youtube_dl/extractor/wistia.py +++ b/youtube_dl/extractor/wistia.py @@ -35,7 +35,8 @@ class WistiaIE(InfoExtractor):          formats = []          thumbnails = [] -        for atype, a in data['assets'].items(): +        for a in data['assets']: +            atype = a.get('type')              if atype == 'still':                  thumbnails.append({                      'url': a['url'], diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py index d3cc1a29f..e699e663f 100644 --- a/youtube_dl/extractor/yandexmusic.py +++ b/youtube_dl/extractor/yandexmusic.py @@ -10,13 +10,27 @@ from ..compat import (      compat_urllib_parse,  )  from ..utils import ( +    ExtractorError,      int_or_none,      float_or_none,      sanitized_Request,  ) -class YandexMusicTrackIE(InfoExtractor): +class YandexMusicBaseIE(InfoExtractor): +    @staticmethod +    def _handle_error(response): +        error = response.get('error') +        if error: +            raise ExtractorError(error, expected=True) + +    def _download_json(self, *args, **kwargs): +        response = super(YandexMusicBaseIE, self)._download_json(*args, **kwargs) +        self._handle_error(response) +        return response + + +class YandexMusicTrackIE(YandexMusicBaseIE):      IE_NAME = 'yandexmusic:track'      IE_DESC = 'Яндекс.Музыка - Трек'      _VALID_URL = r'https?://music\.yandex\.(?:ru|kz|ua|by)/album/(?P<album_id>\d+)/track/(?P<id>\d+)' @@ -73,7 +87,7 @@ class YandexMusicTrackIE(InfoExtractor):          return self._get_track_info(track) -class YandexMusicPlaylistBaseIE(InfoExtractor): +class YandexMusicPlaylistBaseIE(YandexMusicBaseIE):      def _build_playlist(self, tracks):          return [              self.url_result( diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py index b29baafc4..1124fe6c2 100644 --- a/youtube_dl/extractor/youporn.py +++ b/youtube_dl/extractor/youporn.py @@ -75,7 +75,7 @@ class YouPornIE(InfoExtractor):          links = []          sources = self._search_regex( -            r'sources\s*:\s*({.+?})', webpage, 'sources', default=None) +            r'(?s)sources\s*:\s*({.+?})', webpage, 'sources', default=None)          if sources:              for _, link in re.findall(r'[^:]+\s*:\s*(["\'])(http.+?)\1', sources):                  links.append(link) @@ -101,8 +101,9 @@ class YouPornIE(InfoExtractor):              }              # Video URL's path looks like this:              #  /201012/17/505835/720p_1500k_505835/YouPorn%20-%20Sex%20Ed%20Is%20It%20Safe%20To%20Masturbate%20Daily.mp4 +            #  /201012/17/505835/vl_240p_240k_505835/YouPorn%20-%20Sex%20Ed%20Is%20It%20Safe%20To%20Masturbate%20Daily.mp4              # We will benefit from it by extracting some metadata -            mobj = re.search(r'/(?P<height>\d{3,4})[pP]_(?P<bitrate>\d+)[kK]_\d+/', video_url) +            mobj = re.search(r'(?P<height>\d{3,4})[pP]_(?P<bitrate>\d+)[kK]_\d+/', video_url)              if mobj:                  height = int(mobj.group('height'))                  bitrate = int(mobj.group('bitrate')) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index ec90c2111..27e67feb4 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -6,6 +6,7 @@ from __future__ import unicode_literals  import itertools  import json  import os.path +import random  import re  import time  import traceback @@ -382,7 +383,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                  'title': 'youtube-dl test video "\'/\\ä↭𝕐',                  'uploader': 'Philipp Hagemeister',                  'uploader_id': 'phihag', +                'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/phihag',                  'upload_date': '20121002', +                'license': 'Standard YouTube License',                  'description': 'test chars:  "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',                  'categories': ['Science & Technology'],                  'tags': ['youtube-dl'], @@ -401,12 +404,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                  'upload_date': '20120506',                  'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',                  'alt_title': 'I Love It (feat. Charli XCX)', -                'description': 'md5:782e8651347686cba06e58f71ab51773', +                'description': 'md5:f3ceb5ef83a08d95b9d146f973157cc8',                  'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli',                           'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop',                           'iconic ep', 'iconic', 'love', 'it'],                  'uploader': 'Icona Pop',                  'uploader_id': 'IconaPop', +                'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/IconaPop', +                'license': 'Standard YouTube License',                  'creator': 'Icona Pop',              }          }, @@ -422,6 +427,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                  'description': 'md5:64249768eec3bc4276236606ea996373',                  'uploader': 'justintimberlakeVEVO',                  'uploader_id': 'justintimberlakeVEVO', +                'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/justintimberlakeVEVO', +                'license': 'Standard YouTube License',                  'creator': 'Justin Timberlake',                  'age_limit': 18,              } @@ -437,6 +444,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                  'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',                  'uploader': 'SET India',                  'uploader_id': 'setindia', +                'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/setindia', +                'license': 'Standard YouTube License',                  'age_limit': 18,              }          }, @@ -449,7 +458,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                  'title': 'youtube-dl test video "\'/\\ä↭𝕐',                  'uploader': 'Philipp Hagemeister',                  'uploader_id': 'phihag', +                'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/phihag',                  'upload_date': '20121002', +                'license': 'Standard YouTube License',                  'description': 'test chars:  "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',                  'categories': ['Science & Technology'],                  'tags': ['youtube-dl'], @@ -468,8 +479,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                  'ext': 'm4a',                  'upload_date': '20121002',                  'uploader_id': '8KVIDEO', +                'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/8KVIDEO',                  'description': '',                  'uploader': '8KVIDEO', +                'license': 'Standard YouTube License',                  'title': 'UHDTV TEST 8K VIDEO.mp4'              },              'params': { @@ -488,6 +501,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                  'uploader': 'AfrojackVEVO',                  'uploader_id': 'AfrojackVEVO',                  'upload_date': '20131011', +                'license': 'Standard YouTube License',              },              'params': {                  'youtube_include_dash_manifest': True, @@ -506,6 +520,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                  'uploader': 'TaylorSwiftVEVO',                  'uploader_id': 'TaylorSwiftVEVO',                  'upload_date': '20140818', +                'license': 'Standard YouTube License',                  'creator': 'Taylor Swift',              },              'params': { @@ -522,6 +537,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                  'upload_date': '20100909',                  'uploader': 'The Amazing Atheist',                  'uploader_id': 'TheAmazingAtheist', +                'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist', +                'license': 'Standard YouTube License',                  'title': 'Burning Everyone\'s Koran',                  'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',              } @@ -536,7 +553,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                  'description': 're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}',                  'uploader': 'The Witcher',                  'uploader_id': 'WitcherGame', +                'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/WitcherGame',                  'upload_date': '20140605', +                'license': 'Standard YouTube License',                  'age_limit': 18,              },          }, @@ -550,7 +569,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                  'description': 'md5:33765bb339e1b47e7e72b5490139bb41',                  'uploader': 'LloydVEVO',                  'uploader_id': 'LloydVEVO', +                'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/LloydVEVO',                  'upload_date': '20110629', +                'license': 'Standard YouTube License',                  'age_limit': 18,              },          }, @@ -562,9 +583,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                  'ext': 'mp4',                  'upload_date': '20100430',                  'uploader_id': 'deadmau5', +                'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/deadmau5',                  'creator': 'deadmau5',                  'description': 'md5:12c56784b8032162bb936a5f76d55360',                  'uploader': 'deadmau5', +                'license': 'Standard YouTube License',                  'title': 'Deadmau5 - Some Chords (HD)',                  'alt_title': 'Some Chords',              }, @@ -580,6 +603,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                  'ext': 'mp4',                  'upload_date': '20150827',                  'uploader_id': 'olympic', +                'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/olympic', +                'license': 'Standard YouTube License',                  'description': 'HO09  - Women -  GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',                  'uploader': 'Olympics',                  'title': 'Hockey - Women -  GER-AUS - London 2012 Olympic Games', @@ -597,8 +622,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                  'stretched_ratio': 16 / 9.,                  'upload_date': '20110310',                  'uploader_id': 'AllenMeow', +                'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/AllenMeow',                  'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',                  'uploader': '孫艾倫', +                'license': 'Standard YouTube License',                  'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',              },          }, @@ -629,7 +656,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                  'description': 'md5:116377fd2963b81ec4ce64b542173306',                  'upload_date': '20150625',                  'uploader_id': 'dorappi2000', +                'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/dorappi2000',                  'uploader': 'dorappi2000', +                'license': 'Standard YouTube License',                  'formats': 'mincount:33',              },          }, @@ -644,6 +673,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                  'uploader': 'Airtek',                  'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',                  'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ', +                'license': 'Standard YouTube License',                  'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',              },              'params': { @@ -668,6 +698,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                      'upload_date': '20150721',                      'uploader': 'Beer Games Beer',                      'uploader_id': 'beergamesbeer', +                    'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/beergamesbeer', +                    'license': 'Standard YouTube License',                  },              }, {                  'info_dict': { @@ -678,6 +710,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                      'upload_date': '20150721',                      'uploader': 'Beer Games Beer',                      'uploader_id': 'beergamesbeer', +                    'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/beergamesbeer', +                    'license': 'Standard YouTube License',                  },              }, {                  'info_dict': { @@ -688,6 +722,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                      'upload_date': '20150721',                      'uploader': 'Beer Games Beer',                      'uploader_id': 'beergamesbeer', +                    'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/beergamesbeer', +                    'license': 'Standard YouTube License',                  },              }, {                  'info_dict': { @@ -698,6 +734,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                      'upload_date': '20150721',                      'uploader': 'Beer Games Beer',                      'uploader_id': 'beergamesbeer', +                    'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/beergamesbeer', +                    'license': 'Standard YouTube License',                  },              }],              'params': { @@ -731,7 +769,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                  'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a',                  'upload_date': '20151119',                  'uploader_id': 'IronSoulElf', +                'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/IronSoulElf',                  'uploader': 'IronSoulElf', +                'license': 'Standard YouTube License',                  'creator': 'Todd Haberman, Daniel Law Heath & Aaron Kaplan',              },              'params': { @@ -760,6 +800,42 @@ class YoutubeIE(YoutubeBaseInfoExtractor):              },          },          { +            # Video licensed under Creative Commons +            'url': 'https://www.youtube.com/watch?v=M4gD1WSo5mA', +            'info_dict': { +                'id': 'M4gD1WSo5mA', +                'ext': 'mp4', +                'title': 'md5:e41008789470fc2533a3252216f1c1d1', +                'description': 'md5:a677553cf0840649b731a3024aeff4cc', +                'upload_date': '20150127', +                'uploader_id': 'BerkmanCenter', +                'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter', +                'uploader': 'BerkmanCenter', +                'license': 'Creative Commons Attribution license (reuse allowed)', +            }, +            'params': { +                'skip_download': True, +            }, +        }, +        { +            # Channel-like uploader_url +            'url': 'https://www.youtube.com/watch?v=eQcmzGIKrzg', +            'info_dict': { +                'id': 'eQcmzGIKrzg', +                'ext': 'mp4', +                'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders', +                'description': 'md5:dda0d780d5a6e120758d1711d062a867', +                'upload_date': '20151119', +                'uploader': 'Bernie 2016', +                'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg', +                'uploader_url': 're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg', +                'license': 'Creative Commons Attribution license (reuse allowed)', +            }, +            'params': { +                'skip_download': True, +            }, +        }, +        {              'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;v=V36LpHqtcDY',              'only_matching': True,          } @@ -1046,6 +1122,29 @@ class YoutubeIE(YoutubeBaseInfoExtractor):              self._downloader.report_warning(err_msg)              return {} +    def _mark_watched(self, video_id, video_info): +        playback_url = video_info.get('videostats_playback_base_url', [None])[0] +        if not playback_url: +            return +        parsed_playback_url = compat_urlparse.urlparse(playback_url) +        qs = compat_urlparse.parse_qs(parsed_playback_url.query) + +        # cpn generation algorithm is reverse engineered from base.js. +        # In fact it works even with dummy cpn. +        CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_' +        cpn = ''.join((CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16))) + +        qs.update({ +            'ver': ['2'], +            'cpn': [cpn], +        }) +        playback_url = compat_urlparse.urlunparse( +            parsed_playback_url._replace(query=compat_urllib_parse.urlencode(qs, True))) + +        self._download_webpage( +            playback_url, video_id, 'Marking watched', +            'Unable to mark watched', fatal=False) +      @classmethod      def extract_id(cls, url):          mobj = re.match(cls._VALID_URL, url, re.VERBOSE) @@ -1272,9 +1371,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor):          # uploader_id          video_uploader_id = None -        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage) +        video_uploader_url = None +        mobj = re.search( +            r'<link itemprop="url" href="(?P<uploader_url>https?://www.youtube.com/(?:user|channel)/(?P<uploader_id>[^"]+))">', +            video_webpage)          if mobj is not None: -            video_uploader_id = mobj.group(1) +            video_uploader_id = mobj.group('uploader_id') +            video_uploader_url = mobj.group('uploader_url')          else:              self._downloader.report_warning('unable to extract uploader nickname') @@ -1302,6 +1405,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                  upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())          upload_date = unified_strdate(upload_date) +        video_license = self._html_search_regex( +            r'<h4[^>]+class="title"[^>]*>\s*License\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li', +            video_webpage, 'license', default=None) +          m_music = re.search(              r'<h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s*<ul[^>]*>\s*<li>(?P<title>.+?) by (?P<creator>.+?)(?:\(.+?\))?</li',              video_webpage) @@ -1375,6 +1482,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor):              encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]              if 'rtmpe%3Dyes' in encoded_url_map:                  raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True) +            formats_spec = {} +            fmt_list = video_info.get('fmt_list', [''])[0] +            if fmt_list: +                for fmt in fmt_list.split(','): +                    spec = fmt.split('/') +                    if len(spec) > 1: +                        width_height = spec[1].split('x') +                        if len(width_height) == 2: +                            formats_spec[spec[0]] = { +                                'resolution': spec[1], +                                'width': int_or_none(width_height[0]), +                                'height': int_or_none(width_height[1]), +                            }              formats = []              for url_data_str in encoded_url_map.split(','):                  url_data = compat_parse_qs(url_data_str) @@ -1443,6 +1563,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                  }                  if format_id in self._formats:                      dct.update(self._formats[format_id]) +                if format_id in formats_spec: +                    dct.update(formats_spec[format_id])                  # Some itags are not included in DASH manifest thus corresponding formats will                  # lack metadata (see https://github.com/rg3/youtube-dl/pull/5993). @@ -1555,11 +1677,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor):          self._sort_formats(formats) +        self.mark_watched(video_id, video_info) +          return {              'id': video_id,              'uploader': video_uploader,              'uploader_id': video_uploader_id, +            'uploader_url': video_uploader_url,              'upload_date': upload_date, +            'license': video_license,              'creator': video_creator,              'title': video_title,              'alt_title': video_alt_title, diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 3afa8bb6f..9dd7a8034 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -171,6 +171,14 @@ def parseOpts(overrideArguments=None):          default=False,          help='Do not extract the videos of a playlist, only list them.')      general.add_option( +        '--mark-watched', +        action='store_true', dest='mark_watched', default=False, +        help='Mark videos watched (YouTube only)') +    general.add_option( +        '--no-mark-watched', +        action='store_false', dest='mark_watched', default=False, +        help='Do not mark videos watched (YouTube only)') +    general.add_option(          '--no-color', '--no-colors',          action='store_true', dest='no_color',          default=False, diff --git a/youtube_dl/postprocessor/xattrpp.py b/youtube_dl/postprocessor/xattrpp.py index 480d48d05..e39ca60aa 100644 --- a/youtube_dl/postprocessor/xattrpp.py +++ b/youtube_dl/postprocessor/xattrpp.py @@ -6,6 +6,7 @@ import sys  import errno  from .common import PostProcessor +from ..compat import compat_os_name  from ..utils import (      check_executable,      hyphenate_date, @@ -73,7 +74,7 @@ class XAttrMetadataPP(PostProcessor):                      raise XAttrMetadataError(e.errno, e.strerror)          except ImportError: -            if os.name == 'nt': +            if compat_os_name == 'nt':                  # Write xattrs to NTFS Alternate Data Streams:                  # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29                  def write_xattr(path, key, value): @@ -168,7 +169,7 @@ class XAttrMetadataPP(PostProcessor):                      'Unable to write extended attributes due to too long values.')              else:                  msg = 'This filesystem doesn\'t support extended attributes. ' -                if os.name == 'nt': +                if compat_os_name == 'nt':                      msg += 'You need to use NTFS.'                  else:                      msg += '(You may have to enable them in your /etc/fstab)' diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 606977c58..22a39a0ab 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -160,8 +160,6 @@ if sys.version_info >= (2, 7):      def find_xpath_attr(node, xpath, key, val=None):          """ Find the xpath xpath[@key=val] """          assert re.match(r'^[a-zA-Z_-]+$', key) -        if val: -            assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)          expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))          return node.find(expr)  else: @@ -467,6 +465,10 @@ def encodeFilename(s, for_subprocess=False):      if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:          return s +    # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible +    if sys.platform.startswith('java'): +        return s +      return s.encode(get_subprocess_encoding(), 'ignore') @@ -1217,13 +1219,23 @@ if sys.platform == 'win32':              raise OSError('Unlocking file failed: %r' % ctypes.FormatError())  else: -    import fcntl +    # Some platforms, such as Jython, is missing fcntl +    try: +        import fcntl -    def _lock_file(f, exclusive): -        fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH) +        def _lock_file(f, exclusive): +            fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH) -    def _unlock_file(f): -        fcntl.flock(f, fcntl.LOCK_UN) +        def _unlock_file(f): +            fcntl.flock(f, fcntl.LOCK_UN) +    except ImportError: +        UNSUPPORTED_MSG = 'file locking is not supported on this platform' + +        def _lock_file(f, exclusive): +            raise IOError(UNSUPPORTED_MSG) + +        def _unlock_file(f): +            raise IOError(UNSUPPORTED_MSG)  class locked_file(object): @@ -1387,6 +1399,12 @@ def fix_xml_ampersands(xml_str):  def setproctitle(title):      assert isinstance(title, compat_str) + +    # ctypes in Jython is not complete +    # http://bugs.jython.org/issue2148 +    if sys.platform.startswith('java'): +        return +      try:          libc = ctypes.cdll.LoadLibrary('libc.so.6')      except OSError: @@ -1721,6 +1739,15 @@ def urlencode_postdata(*args, **kargs):      return compat_urllib_parse.urlencode(*args, **kargs).encode('ascii') +def update_url_query(url, query): +    parsed_url = compat_urlparse.urlparse(url) +    qs = compat_parse_qs(parsed_url.query) +    qs.update(query) +    qs = encode_dict(qs) +    return compat_urlparse.urlunparse(parsed_url._replace( +        query=compat_urllib_parse.urlencode(qs, True))) + +  def encode_dict(d, encoding='utf-8'):      def encode(v):          return v.encode(encoding) if isinstance(v, compat_basestring) else v  | 
